This repository was archived by the owner on Aug 25, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 136
This repository was archived by the owner on Aug 25, 2024. It is now read-only.
model: xgboost: New model #701
Copy link
Copy link
Closed
Labels
enhancement (New feature or request) · good first issue (Good for newcomers) · kind/ml (Issues pertaining to machine learning) · p3 (Average Priority) · tS (Estimated Time To Complete: Short)
Milestone
Description
A first pass at this is below. If you end up using it, the applicable Co-authored-by trailers are as follows:
Co-authored-by: John Andersen <johnandersenpdx@gmail.com>
Co-authored-by: Soren Andersen <sorenpdx@gmail.com>
import pathlib
import joblib
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from dffml import *
# TODO Add parameters you want to have access to within self.config here
# For example, search for n_estimators to see how that works
@config
class XDGRegressorModelConfig:
    """Configuration consumed by XDGRegressorModel via ``self.config``."""

    # Directory in which the trained model is stored as "model.joblib"
    directory: pathlib.Path
    # Features whose values are used as the model's input columns
    features: Features
    # Feature the model is trained to predict (the target column)
    predict: Feature
    # Passed to XGBRegressor as ``n_estimators``
    n_estimators: int = 1000
class XDGRegressorModel(SimpleModel):
    """
    Regression model backed by ``xgboost.XGBRegressor``.

    The fitted regressor is persisted with joblib as ``model.joblib`` inside
    ``config.directory``. If that file already exists when the model is
    constructed, it is loaded so predictions can be made without retraining.
    """

    CONFIG = XDGRegressorModelConfig

    def __init__(self, config) -> None:
        super().__init__(config)
        # The fitted regressor. Stays None until train() has run or a
        # previously saved model is found on disk.
        self.saved = None
        self.saved_filepath = pathlib.Path(
            self.config.directory, "model.joblib"
        )
        # Load saved model if it exists.
        # NOTE: joblib files are pickle based, loading one executes whatever
        # it contains. Only point ``directory`` at locations you trust.
        if self.saved_filepath.is_file():
            self.saved = joblib.load(str(self.saved_filepath))

    async def train(self, sources: Sources) -> None:
        """
        Fit an XGBRegressor on every record in ``sources`` that has all of
        the configured features plus the feature being predicted, then save
        the fitted regressor to ``self.saved_filepath``.
        """
        feature_names = self.config.features.names()
        target_name = self.config.predict.name
        # Get data into memory
        data = pd.DataFrame.from_records(
            [
                record.features()
                async for record in sources.with_features(
                    feature_names + [target_name]
                )
            ]
        )
        # Select subset of predictors
        x_data = data[feature_names]
        # Select target. Index by column label rather than getattr():
        # attribute access returns the DataFrame's own attribute or method
        # when the column name collides with one (e.g. "count" or "size").
        y_data = data[target_name]
        # XGBoost is a leading library for working with standard tabular
        # data (the kind stored in pandas DataFrames, as opposed to images
        # or video). With careful parameter tuning it can train highly
        # accurate models.
        #
        # Tuning notes for XGBRegressor:
        #   n_estimators  - typically in the 100-1000 range.
        #   learning_rate - in general, a small learning rate and a large
        #                   number of estimators yield more accurate models
        #                   (e.g. learning_rate=0.1).
        #   n_jobs        - number of cores to run in parallel. Trying
        #                   n_jobs=4 made results slightly worse.
        # TODO Tweak this?
        self.saved = XGBRegressor(
            n_estimators=self.config.n_estimators, learning_rate=0.05
        )
        # TODO Tweak this?
        self.saved.fit(x_data, y_data, verbose=False)
        # Save the trained model, creating the directory first so a fresh
        # ``directory`` does not make the dump fail.
        self.saved_filepath.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self.saved, str(self.saved_filepath))

    async def accuracy(self, sources: Sources) -> Accuracy:
        """
        Return the mean absolute error between the model's predictions and
        the actual values of the predicted feature, over every record in
        ``sources`` that has the configured features and the target.

        This is an error measure, so lower is better.

        NOTE(review): the value is returned exactly as sklearn computes it
        and is not wrapped in ``Accuracy`` - confirm callers accept that.
        """
        predictions = []
        actuals = []
        # Make predictions
        async for record in self.predict(
            sources.with_features(
                self.config.features.names() + [self.config.predict.name]
            )
        ):
            # Append prediction and actual value to their respective lists
            predictions.append(
                record.prediction(self.config.predict.name).value
            )
            actuals.append(record.feature(self.config.predict.name))
        # Calculate MAE. sklearn's signature is (y_true, y_pred).
        return mean_absolute_error(actuals, predictions)

    async def predict(self, records):
        """
        Predict the configured target for each record yielded by the async
        iterable ``records`` and yield the records with the prediction set.

        All records are read into memory first so the regressor is called
        once. The confidence stored with each prediction is NaN.

        Raises:
            RuntimeError: if no model has been trained or loaded.
        """
        if self.saved is None:
            raise RuntimeError(
                "Model has not been trained: call train() before predict()"
            )
        feature_names = self.config.features.names()
        # Grab records and input data (X data)
        saved_records = []
        input_data = []
        async for record in records:
            saved_records.append(record)
            input_data.append(record.features(feature_names))
        # Nothing to predict on; avoid handing the regressor an empty frame
        if not saved_records:
            return
        # Make predictions, with columns in the same order used for training
        predictions = self.saved.predict(
            pd.DataFrame.from_records(input_data)[feature_names]
        )
        # Update records and yield them to caller
        for record, prediction in zip(saved_records, predictions):
            # Convert the numpy scalar to a plain Python float
            record.predicted(
                self.config.predict.name, float(prediction), float("nan")
            )
            yield record
Metadata
Assignees
Labels
enhancement (New feature or request) · good first issue (Good for newcomers) · kind/ml (Issues pertaining to machine learning) · p3 (Average Priority) · tS (Estimated Time To Complete: Short)