Skip to content
This repository was archived by the owner on Aug 25, 2024. It is now read-only.
This repository was archived by the owner on Aug 25, 2024. It is now read-only.

model: xgboost: New model #701

@johnandersen777

Description

@johnandersen777

A first pass at this is below. If you end up using it, the applicable Co-authored-by trailers are as follows:

Co-authored-by: John Andersen <johnandersenpdx@gmail.com>
Co-authored-by: Soren Andersen <sorenpdx@gmail.com>
import pathlib

import joblib
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

from dffml import *


# Properties declared on the config class are available within the model as
# self.config.<name>.
# TODO Add further XGBRegressor parameters here as needed. n_estimators and
# learning_rate show the pattern: declare the property here, then read it via
# self.config in train().
@config
class XDGRegressorModelConfig:
    # Directory the trained model is stored in (as "model.joblib")
    directory: pathlib.Path
    # Features used as model input (X data)
    features: Features
    # Feature the model is trained to predict (y data)
    predict: Feature
    # Passed to XGBRegressor as n_estimators
    n_estimators: int = 1000
    # Passed to XGBRegressor as learning_rate. In general, a small learning
    # rate and a large number of estimators yield more accurate models.
    learning_rate: float = 0.05


class XDGRegressorModel(SimpleModel):
    """
    Regression model backed by ``xgboost.XGBRegressor``.

    The fitted regressor is persisted with joblib to ``model.joblib`` inside
    ``config.directory`` and is loaded again on construction if that file
    exists.
    """

    CONFIG = XDGRegressorModelConfig

    def __init__(self, config) -> None:
        super().__init__(config)
        # The saved (fitted) model, None until trained or loaded from disk
        self.saved = None
        self.saved_filepath = pathlib.Path(
            self.config.directory, "model.joblib"
        )
        # Load saved model if it exists.
        # NOTE joblib files are pickle based, loading one can execute
        # arbitrary code. Only point ``directory`` at trusted locations.
        if self.saved_filepath.is_file():
            self.saved = joblib.load(str(self.saved_filepath))

    async def train(self, sources: Sources) -> None:
        """
        Fit an XGBRegressor on the records in ``sources`` and save it to disk.

        Only records which have all of the configured features as well as the
        feature to predict are used.
        """
        feature_names = self.config.features.names()
        target_name = self.config.predict.name
        # Get data into memory
        data = pd.DataFrame.from_records(
            [
                record.features()
                async for record in sources.with_features(
                    feature_names + [target_name]
                )
            ]
        )
        # Select subset of predictors
        x_data = data[feature_names]
        # Select target. Index by column name rather than using getattr(), as
        # attribute access returns the DataFrame's own method or property when
        # the target is named like one (for example "count", "mean", "size").
        y_data = data[target_name]
        # XGBoost is a leading software library for working with standard
        # tabular data (the type of data you store in Pandas DataFrames, as
        # opposed to more exotic types of data like images and videos). With
        # careful parameter tuning, you can train highly accurate models.
        # Parameters for xgboost
        #   n_estimators - 100-1000 range
        #   learning_rate - In general, a small learning rate and large number
        #       of estimators will yield more accurate XGBoost models
        #   n_jobs - Number of cores to run in parallel. NOTE n_jobs=4 made
        #       results slightly worse when this was first tried out.
        model = XGBRegressor(
            n_estimators=self.config.n_estimators,
            learning_rate=self.config.learning_rate,
        )
        # TODO Tweak this?
        model.fit(x_data, y_data, verbose=False)
        # Only replace the current model once fitting succeeded, so a failed
        # fit never leaves an unfitted model in self.saved
        self.saved = model
        # Save the trained model, making sure the directory exists first
        self.saved_filepath.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self.saved, str(self.saved_filepath))

    async def accuracy(self, sources: Sources) -> Accuracy:
        """
        Return the mean absolute error of the model's predictions for the
        records in ``sources``. Lower values mean better predictions.
        """
        feature_names = self.config.features.names()
        target_name = self.config.predict.name
        predictions = []
        actuals = []
        # Make predictions
        async for record in self.predict(
            sources.with_features(feature_names + [target_name])
        ):
            # Append prediction and actual value to their respective lists
            predictions.append(record.prediction(target_name).value)
            actuals.append(record.feature(target_name))
        # Calculate MAE (y_true first, then y_pred) and wrap it so that the
        # returned object matches the declared return type
        return Accuracy(mean_absolute_error(actuals, predictions))

    async def predict(self, records):
        """
        Yield each record in ``records`` with a prediction for the configured
        feature set on it. The confidence is reported as NaN.

        Raises RuntimeError if the model has neither been trained nor loaded
        from disk.
        """
        if self.saved is None:
            raise RuntimeError(
                f"No trained model found at {self.saved_filepath}, "
                "train the model before making predictions"
            )
        feature_names = self.config.features.names()
        # Grab records and input data (X data)
        saved_records = []
        input_data = []
        async for record in records:
            saved_records.append(record)
            input_data.append(record.features(feature_names))
        # Nothing to predict on, avoid handing an empty DataFrame to xgboost
        if not saved_records:
            return
        # Select the columns the same way train() does so that the column
        # order handed to the model matches the order it was fitted with
        x_data = pd.DataFrame.from_records(input_data)[feature_names]
        # Make predictions
        predictions = self.saved.predict(x_data)
        # Update records and yield them to caller
        for record, prediction in zip(saved_records, predictions):
            # Store a built-in float rather than a numpy scalar
            record.predicted(
                self.config.predict.name, float(prediction), float("nan")
            )
            yield record

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancement (New feature or request) · good first issue (Good for newcomers) · kind/ml (Issues pertaining to machine learning) · p3 (Average Priority) · tS (Estimated Time To Complete: Short)

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions