In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.metrics import mean_squared_error
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy as sp
from sklearn.linear_model import RidgeCV, Ridge, LassoCV, LassoLars, LassoLarsCV, Lasso, ElasticNet, ElasticNetCV, BayesianRidge
from keras.models import Sequential
from keras.layers import Dense
%matplotlib inline

from collections import Counter

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load training and test sets (assumes you have these in current working directory)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# Split training set into X and y (removing first column containing IDs)
X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, -1]

In [13]:
# Remove first column to make predictions
X_test = test.iloc[:, 1:]
X_test.head()

Unnamed: 0,Feat 1,Feat 2,Feat 3,Feat 4,Feat 5,Feat 6,Feat 7,Feat 8,Feat 9,Feat 10,...,Feat 242,Feat 243,Feat 244,Feat 245,Feat 246,Feat 247,Feat 248,Feat 249,Feat 250,Feat 251
0,0.999849,0.174118,0.999819,0.997841,0.133333,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.728471,0.054397,0.649,0.416164,0.053998,0.667391
1,0.999958,0.164706,1.0,0.996741,0.066667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.497255,0.037736,0.375,0.165514,0.101973,0.50665
2,0.999666,0.174118,0.999479,0.997376,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.688941,0.019309,1.0,0.192069,0.1207,0.498784
3,0.999735,0.174118,0.999655,0.997173,0.133333,0.0,0.0,0.0,0.363636,0.166667,...,0.0,0.0,0.0,0,0.654118,0.019089,0.333,0.451252,0.16418,0.774466
4,0.999806,0.164706,0.999551,0.997234,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.627451,0.160433,0.882,0.147407,0.0,0.48124


In [4]:
# Define function to compute RMSE
def scoreRMSE(predictor, X, true_y):
    predictions = predictor.predict(X)
    return np.sqrt(mean_squared_error(predictions, true_y))

In [11]:
def remove_single_feature(X_train, X_test):
    cols = X_train.columns
    single_val = []
    for index in cols:
        if (len(X_train[index].unique()) == 1):
            single_val.append(index)

    test_val = []
    for index in single_val:
        if (len(X_test[index].unique()) == 1):
            test_val.append(index)
            
    single_feature = list(set(single_val).intersection(set(test_val)))
    
    return single_feature

In [14]:
single_feature = remove_single_feature(X_train, X_test)

In [15]:
X_single_train = X_train.drop(columns=single_feature)
X_single_test = X_test.drop(columns=single_feature)

In [19]:
num_features = X_train.shape[1]

In [20]:
num_s_features = X_single_train.shape[1]

In [38]:
# there is an optional learning rate
# loss : {‘linear’, ‘square’, ‘exponential’}, optional (default=’linear’)

adaBoost = AdaBoostRegressor(n_estimators=num_s_features, learning_rate=0.01, loss='linear')

In [39]:
adaBoost.fit(X_single_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=0.01, loss='linear',
         n_estimators=212, random_state=None)

In [40]:
scoreRMSE(adaBoost, X_single_train, y_train)

0.027059150919346162

In [59]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor

params = {
    'n_estimators': list(range(1, num_s_features, 10)),
    'learning_rate': [0.001, 0.01, 0.1, 1]
}

adaBoostGrid = AdaBoostRegressor()

# cross validation
clf = GridSearchCV(estimator=adaBoostGrid, param_grid=params, cv=10)

clf.fit(X_single_train, y_train)

adaBoost = AdaBoostRegressor(n_estimators=clf.best_params_['n_estimators'], 
                             learning_rate=clf.best_params_['learning_rate'])

adaBoost.fit(X_single_train, y_train)

adaBoostRMSE = scoreRMSE(adaBoost, X_single_train, y_train)

In [66]:
ada_pred = adaBoost.predict(X_single_test)

In [67]:
# Format predictions to be compatible with Kaggle upload
sample_submission = pd.DataFrame(data=ada_pred, columns=['Predicted'])
sample_submission.insert(0, "Id", range(1, 1 + X_single_test.shape[0]))
sample_submission['Id'] = sample_submission['Id'].astype(str)
sample_submission.head()

# Save predictions to .csv file for upload to Kaggle
sample_submission.to_csv("adaboost_cv.csv", index=False)