In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgboost
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from model_wrapper import (
    SKLearnModelWrapper,
    ModelWrapper
)
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

In [49]:
# Load the full training table; the first CSV column becomes the row index.
df = pd.read_csv(
    "coulomb_full_train.csv",
    index_col=0,
)

In [50]:
def write_to_file(filename, predictions):
    """
    Write predictions to `filename` as a two-column CSV.

    The file starts with the header line "Id,Prediction"; every prediction
    then gets its own line, paired with a 1-based id in iteration order.
    An existing file at `filename` is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        # The generator is created after the header is written so the
        # order of side effects matches a plain loop.
        out.writelines(
            str(row_id) + "," + str(value) + "\n"
            for row_id, value in enumerate(predictions, start=1)
        )

In [51]:
# Target values come from the 'gap' column.
y_train = df['gap']
# Feature columns: everything in df except 'smile' and the 'gap' target.
train_cols = [col for col in df.columns if col not in ('smile', 'gap')]

In [52]:
# Feature matrix: the columns of df listed in train_cols.
x_train = df[train_cols]

In [107]:
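# Disabled: append the target to the feature frame so both can be saved as one CSV.
# NOTE: `df_new = x_train` only aliases x_train, so the assignment below also adds
# 'gap' to x_train itself and triggers pandas' SettingWithCopyWarning (shown below).
# Use `x_train.copy()` if this cell is ever re-enabled.
# df_new = x_train
# df_new['gap'] = y_train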

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [117]:
# df_new.to_csv("new_df.csv")

In [109]:
from sklearn.ensemble import GradientBoostingRegressor

class GradientBoost(SKLearnModelWrapper):
    """Gradient-boosted regression trees from scikit-learn."""

    def get_model(self):
        """
        Return an unfitted GradientBoostingRegressor with 100 estimators,
        learning rate 0.1, max depth 7 and a fixed random_state of 0.
        """
        # `loss` is deliberately left at its default. The default is least
        # squares in every scikit-learn release, whereas the old explicit
        # spelling 'ls' was deprecated in 1.0 and is rejected from 1.2 on.
        return GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=7,
            random_state=0,
        )

In [110]:
class BoostRegression(SKLearnModelWrapper):
    """Regression wrapper backed by XGBoost's scikit-learn style estimator."""

    def get_model(self):
        """Return an unfitted XGBRegressor (n_estimators=100, max_depth=7)."""
        hyperparams = {"n_estimators": 100, "max_depth": 7}
        return xgboost.XGBRegressor(**hyperparams)

In [118]:
# Build the XGBoost wrapper from new_df.csv and run its build_model step.
# NOTE(review): new_df.csv is only written by the commented-out to_csv cell
# above, so a fresh run needs that file to exist already — confirm.
boosted = BoostRegression("new_df.csv")
boosted.build_model()

ASDASD
0.0367822316913 ERROR


In [121]:
# Scratch check: `type(x) is T` compares the exact type of a value.
type(4) is int

True

In [131]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

from sklearn.model_selection import GridSearchCV

class ModelWrapper:
    """
    Wrapper to simplify the process of
    loading data, parameter tuning, etc

    Subclasses implement `train` and `predict`; `build_model` then fits on a
    random train split and prints the mean squared error on the held-out part.
    """

    # Number of rows kept when the wrapper is created with debug=True.
    _DEBUG_ROWS = 10

    def __init__(self, csv_name, var_name, *args, debug=False, **kwargs):
        """
        Load `csv_name` and split it into features (self.X) and target (self.Y).

        Parameters
        ----------
        csv_name : path (or buffer) handed to pandas.read_csv.
        var_name : label of the target column; it becomes self.Y.
        debug : when True, keep only the first 10 rows of both X and Y.
        *args, **kwargs : accepted and ignored.

        Raises
        ------
        KeyError
            If `var_name` is not a column of the CSV.
        """
        df = pd.read_csv(csv_name)
        if var_name not in df.columns:
            raise KeyError(
                "target column {!r} not found in {!r}".format(var_name, csv_name)
            )

        # Features are the float-valued columns other than the target. The
        # test has to look at each column's dtype: the column *labels* read
        # from a CSV header are strings, so testing the label's type would
        # discard every column.
        feature_cols = [
            col
            for col, dtype in zip(df.columns, df.dtypes)
            if dtype.kind == "f" and col != var_name
        ]

        self.X = self.preprocess_X(df[feature_cols])
        self.Y = df[var_name]

        # when debugging only use a tiny subset of the data; X and Y are cut
        # together so they keep the same number of rows
        if debug:
            self.X = self.X[:self._DEBUG_ROWS]
            self.Y = self.Y.iloc[:self._DEBUG_ROWS]

    def preprocess_X(self, X):
        """
        Hook for subclasses to transform the feature frame.
        The default returns X unchanged.
        """
        return X

    def predict(self, X):
        """
        Return a list of predictions
        and error associated with the model
        """
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it would produce a TypeError instead of the intended error.
        raise NotImplementedError

    def train(self, X, Y):
        """
        Train on a subset of the data, returning
        a model
        """
        raise NotImplementedError

    def build_model(self):
        """
        Builds a model

        Splits (self.X, self.Y) with train_test_split, trains on the train
        part, predicts the test part and prints the mean squared error
        followed by the word "ERROR".
        """
        X_train, X_test, Y_train, Y_test = train_test_split(self.X, self.Y)
        self.train(X_train, Y_train)
        Y_pred = self.predict(X_test)
        # scikit-learn's convention is (y_true, y_pred); MSE is symmetric so
        # the printed value is the same either way.
        print(mean_squared_error(Y_test, Y_pred), "ERROR")


class SKLearnModelWrapper(ModelWrapper):
    """
    Specifically built for SKLearn models

    Subclasses only provide `get_model`; training and prediction are
    forwarded to the returned estimator's `fit` and `predict`.
    """
    def get_model(self):
        """
        Return the SKLearn model class

        The returned object is used directly: `train` calls its `fit` and
        `predict` calls its `predict`.
        """
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it would produce a TypeError instead of the intended error.
        raise NotImplementedError

    def train(self, X, Y):
        """
        Create a fresh estimator via get_model(), fit it on (X, Y), store it
        as self.model and return it.
        """
        self.model = self.get_model()
        assert self.model is not None, "get_model() returned None"
        self.model.fit(X, Y)
        return self.model

    def predict(self, X):
        """
        Return self.model.predict(X). `train` must have been called first,
        since that is where self.model is set.
        """
        return self.model.predict(X)


class CombinedModelWrapper(ModelWrapper):
    """
    Combines the predictions for several models
    MAKE SURE THAT THE DATAFRAMES ARE ORDERED
    THE SAME WAY!!

    self.X is a list with one entry per sample; each entry is a tuple holding
    that sample's feature row for every wrapped model, in `models` order.
    `apply_models` turns such entries into one prediction per wrapped model.

    `train` and `predict` forward the transformed data to the next class in
    the MRO. NOTE(review): presumably this class is meant to be mixed with a
    concrete wrapper that implements them — confirm.
    """
    def __init__(self, models, *args, **kwargs):
        """
        Parameters
        ----------
        models : non-empty sequence of wrappers exposing .X, .Y, .predict and
            .build_model. Their X rows must describe the same samples in the
            same order.
        *args, **kwargs : accepted and ignored.

        Raises
        ------
        ValueError
            If `models` is empty.
        """
        if len(models) == 0:
            raise ValueError("CombinedModelWrapper requires at least one model")
        self.models = models
        # zip(*...) pairs row i of every model's feature matrix. Zipping the
        # generator itself (without *) would give one 1-tuple per model, and
        # iterating a DataFrame directly would give column labels, not rows.
        per_model_rows = [np.asarray(model.X) for model in models]
        self.X = list(zip(*per_model_rows))
        # zip stops at the shortest model, so trim the target to match
        self.Y = models[0].Y[:len(self.X)]

    @staticmethod
    def _model_input(model, rows):
        """Stack one model's per-sample rows into a 2-D input for predict."""
        stacked = np.vstack(rows)
        # Restore the column labels when the model's own X has them, so the
        # input looks like the data the model was built from.
        columns = getattr(model.X, "columns", None)
        if columns is not None and len(columns) == stacked.shape[1]:
            return pd.DataFrame(stacked, columns=columns)
        return stacked

    def apply_models(self, X):
        """
        Map samples to the wrapped models' predictions.

        X is an iterable of per-sample tuples laid out like self.X. Returns a
        list with one inner list per sample, holding one prediction per
        wrapped model in `self.models` order.
        """
        samples = list(X)
        if not samples:
            return []
        # Regroup by model and predict each model's rows in one batch rather
        # than calling predict on single 1-D rows.
        per_model_predictions = [
            model.predict(self._model_input(model, rows))
            for model, rows in zip(self.models, zip(*samples))
        ]
        return [
            list(sample_predictions)
            for sample_predictions in zip(*per_model_predictions)
        ]

    def train(self, X, Y):
        """Train the next class in the MRO on the wrapped models' predictions."""
        X_transformed = self.apply_models(X)
        return super(CombinedModelWrapper, self).train(X_transformed, Y)

    def predict(self, X):
        """Predict with the next class in the MRO from the wrapped models' predictions."""
        X_transformed = self.apply_models(X)
        return super(CombinedModelWrapper, self).predict(X_transformed)

    def build_model(self):
        """
        Build every wrapped model, then build the combined model on top.
        """
        for model in self.models:
            model.build_model()

        return super(CombinedModelWrapper, self).build_model()

In [None]:
# Build the XGBoost wrapper from the full training CSV, passing 'gap' as the
# second constructor argument, and run its build_model step.
# NOTE(review): BoostRegression was declared in an earlier cell, before the
# wrapper classes were redefined in this notebook; re-run that cell first if it
# should subclass the SKLearnModelWrapper defined here — confirm.
boosted = BoostRegression("coulomb_full_train.csv", 'gap')
boosted.build_model()