In [22]:
from sklearn.ensemble import GradientBoostingRegressor

In [87]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

class ModelWrapper:
    """
    Wrapper to simplify the process of
    loading data, parameter tuning, etc
    """
    def __init__(self, csv_name, *args, debug=False, **kwargs):
        df = pd.read_csv(csv_name)
        self.X = self.preprocess_X(df.iloc[:, :-1])
        
        # when debugging only use a tiny subset of the data
        if debug:
            self.X = self.X[:10]

        self.Y = df.iloc[:,-1]
        
    def preprocess_X(self, X):
        return X

    def predict(self, X):
        """
        Return a list of predictions
        and error associated with the model
        """
        raise NotImplemented
    
    def train(self, X, Y):
        """
        Train on a subset of the data, returning
        a model
        """
        raise NotImplemented

    def build_model(self):
        """
        Builds a model
        """
        X_train, X_test, Y_train, Y_test = train_test_split(self.X, self.Y)
        self.train(X_train, Y_train)
        Y_pred = self.predict(X_test)
        print (mean_squared_error(Y_pred, Y_test), "ERROR")


class SKLearnModelWrapper(ModelWrapper):
    """
    Specifically built for SKLearn models
    """
    def get_model(self):
        """
        Return the SKLearn model class
        """
        raise NotImplemented

    def train(self, X, Y):
        self.model = self.get_model()
        assert (self.model is not None)
        self.model.fit(X, Y)

    def predict(self, X):
        return self.model.predict(X)


class CombinedModelWrapper(ModelWrapper):
    """
    Combines the predictions for several models
    MAKE SURE THAT THE DATAFRAMES ARE ORDERED
    THE SAME WAY!!
    """
    def __init__(self, models, *args, **kwargs):
        self.models = models
        self.X = list(zip(model.X for model in models))
        print (self.X, "THE XX", np.shape(self.X))
        self.Y = models[0].Y[:len(self.X)]
        
    def apply_models(self, X):
        return [
            [
                model.predict(x)
                for model, x in zip(self.models, dataset)
            ]
            for dataset in X
        ]

    def train(self, X, Y):
        X_transformed = self.apply_models(X)
        return super(CombinedModelWrapper, self).train(X_transformed, Y)

    def predict(self, X):
        X_transformed = self.apply_models(X)
        return super(CombinedModelWrapper, self).predict(X_transformed)

    def build_model(self):
        for model in self.models:
            model.build_model()

        return super(CombinedModelWrapper, self).build_model()

In [88]:
# from model_wrapper import CombinedModelWrapper
# from model_wrapper import SKLearnModelWrapper

In [89]:
class GradientBoost(SKLearnModelWrapper):
    def get_model(self):
        return GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=7, random_state=0, loss='ls')

In [90]:
coulomb_boost = GradientBoost("data/coulomb_features.csv", debug=True)
adjacency_boost = GradientBoost("data/adjacency_features.csv", debug=True)

In [91]:
class CombinedBoost(CombinedModelWrapper, SKLearnModelWrapper):
    def get_model(self):
        return GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=7, random_state=0, loss='ls')

In [92]:
combined_boost = CombinedBoost([coulomb_boost, adjacency_boost], debug=True)

[(   Unnamed: 0            0           1           2           3           4  \
0           0  2369.249756  471.394089  379.020768  362.656456  123.887716   
1           1   323.479177  275.202634  135.462538   79.855681   62.962204   
2           2   451.625863  369.881915  287.156201  124.146651   85.088582   
3           3   329.352269  268.817810  149.091524   90.978229   69.496204   
4           4   412.512190  180.305274  102.637158   81.771576   72.008124   
5           5   411.522206  295.886781  142.758587   85.477563   72.969304   
6           6   403.965231  155.097606   76.964561   67.482997   60.398252   
7           7  2377.902790  435.518733  369.542023  133.069321   79.643201   
8           8   174.873804  103.608836   78.643335   63.071334   50.897042   
9           9   405.898100  156.376970   76.262067   68.597427   60.322206   

           5          6          7          8 ...   190  191  192  193  194  \
0  93.281489  64.132983  52.998261  51.351622 ...   0.0  0.0

In [93]:
combined_boost.build_model()

(10, 201) (10000,)


ValueError: Found input variables with inconsistent numbers of samples: [10, 10000]