### Model Build

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import datetime

### Data Preparation

In [2]:
print('Starting data preparation at: ', datetime.datetime.now())
train = pd.read_csv('Hold/train2.csv')
test = pd.read_csv('Hold/test2.csv')

# Saves product and customer pair for submissions later
UP = test.iloc[:, :2]

# Saves target variable
y = train.Purchase

# Drops redundant columns
train_ = train.drop(['Purchase', 'City_Category', 'User_ID', 'Product_ID', 'High'], axis=1)
test_ = test.drop(['City_Category', 'User_ID', 'Product_ID'], axis=1)

# Searches for columns with nulls
for col in test_.columns:
    if sum(test_[col].isnull()) > 1:
        print('Column ', col, ' contains nans')
        
        # Imputes the average for the column found to contain nulls i.e. std Product
        test_[col].fillna((test_[col].mean()), inplace=True)
        
        
# Train test split        
print('Splitting data into test and training sets')
X_train, X_test, y_train, y_test = train_test_split(train_, y, test_size = .3, random_state=42)

Starting data preparation at:  2018-04-12 12:49:27.019347
Column  NumPurchasesP  contains nans
Column  AvgPurchaseP  contains nans
Splitting data into test and training sets


### Linear Regression

In [4]:
# Ridge regression
ridge = Ridge()
ridge.fit(X_train, y_train)
print('Ridge model substantiated and fitted at: ', datetime.datetime.now())

print('Traing root mean squared error for ridge model')
y_pred_train = ridge.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = ridge.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

Ridge model substantiated and fitted at:  2018-04-12 12:49:39.308805
Traing root mean squared error for ridge model
Train RMSE: 2473.331373581323
Test RMSE: 2488.449923596339


In [5]:
# Grid search on Ridge
print('Starting grid search on Ridge at: ', datetime.datetime.now())

ridge = Ridge()

# Use a grid over parameters of interest
param_grid = { 
           "alpha" : [.001, .01, .1, 1, 10, 100, 1000, 1000000]
}
CV_ridge = GridSearchCV(estimator=ridge, param_grid=param_grid, cv= 3)
CV_ridge.fit(X_train, y_train)
print (CV_ridge.best_params_)
bp = CV_ridge.best_params_

print('Gridsearch for ridge alpha performed at: ', datetime.datetime.now())
print('Best alpha is: ', CV_ridge.best_params_)

ridge = Ridge(alpha=bp['alpha'])

ridge.fit(X_train, y_train)

print('Traing root mean squared error for ridge model with optimized alpha')
y_pred_train = ridge.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = ridge.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

print('Ending grid search on Ridge at: ', datetime.datetime.now())

Starting grid search on Ridge at:  2018-04-12 12:49:41.904868
{'alpha': 1}
Gridsearch for ridge alpha performed at:  2018-04-12 12:49:47.621140
Best alpha is:  {'alpha': 1}
Traing root mean squared error for ridge model with optimized alpha
Train RMSE: 2473.331373581323
Test RMSE: 2488.449923596339
Ending grid search on Ridge at:  2018-04-12 12:49:47.752110


In [8]:
# Grid search on Ridge
print('Starting grid search on Ridge at: ', datetime.datetime.now())

lasso = Lasso()

# Use a grid over parameters of interest
param_grid = { 
           "alpha" : [.001, .01, .1, 1, 10, 100, 1000, 1000000]
}
CV_ridge = GridSearchCV(estimator=lasso, param_grid=param_grid, cv= 3)
CV_ridge.fit(X_train, y_train)
print (CV_ridge.best_params_)
bp = CV_ridge.best_params_

print('Gridsearch for ridge alpha performed at: ', datetime.datetime.now())
print('Best alpha is: ', CV_ridge.best_params_)

lasso = Lasso(alpha=bp['alpha'])

lasso.fit(X_train, y_train)

print('Traing root mean squared error for ridge model with optimized alpha')
y_pred_train = lasso.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = lasso.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

print('Ending grid search on Ridge at: ', datetime.datetime.now())

Starting grid search on Ridge at:  2018-04-10 18:03:55.259495
{'alpha': 0.01}
Gridsearch for ridge alpha performed at:  2018-04-10 18:04:01.664906
Best alpha is:  {'alpha': 0.01}
Traing root mean squared error for ridge model with optimized alpha
Train RMSE: 2473.3313730417294
Test RMSE: 2488.4499165353836
Ending grid search on Ridge at:  2018-04-10 18:04:01.961527


In [21]:
# Ridge regression submission
y_pred = ridge.predict(test_)
y_pred = pd.DataFrame(y_pred)
submission = pd.concat([UP, y_pred], axis=1)
submission = submission.rename(columns={0: 'Purchase'})
submission.to_csv('Output/Ridge.csv')

print('Test submission file saved to Ridge.csv')

### Random Forest

In [9]:
# Random Forest Regression
print('Starting random forest model at: ', datetime.datetime.now())

rfr = RandomForestRegressor(n_jobs=-1, oob_score= True, random_state = 0)
rfr.fit(X_train, y_train)
print('Random forest model substantiated and fitted at: ', datetime.datetime.now())

print('Root mean squared error for random forest model:')
y_pred_train = rfr.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = rfr.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

Starting random forest model at:  2018-04-10 18:14:43.808584


  warn("Some inputs do not have OOB scores. "


Random forest model substantiated and fitted at:  2018-04-10 18:14:54.523928
Root mean squared error for random forest model:
Train RMSE: 1090.632973035505
Test RMSE: 2599.481600363038


In [26]:
# Creating feature importance data frame
importance = list(rfr.feature_importances_)
feature = list(X_train.columns)
feature_importance = pd.DataFrame({'feature': feature, 'importance': importance})

print('Feature importance determined from random forest')
print(feature_importance.sort_values('importance', ascending=False))

Feature importance determined from random forest
                       feature  importance
14                AvgPurchaseP    0.753632
15                   PropHighU    0.069045
13                AvgPurchaseU    0.033741
12               NumPurchasesP    0.030617
11               NumPurchasesU    0.028803
2                   Occupation    0.017175
6           Product_Category_2    0.013142
1                          Age    0.010985
3   Stay_In_Current_City_Years    0.010405
7           Product_Category_3    0.009527
5           Product_Category_1    0.008206
4               Marital_Status    0.003725
0                       Gender    0.003268
9                            A    0.002784
8                            C    0.002647
10                           B    0.002298


In [29]:
# Creating submission for RFR
y_pred = rfr.predict(test_)
y_pred = pd.DataFrame(y_pred)
submission = pd.concat([UP, y_pred], axis=1)
submission = submission.rename(columns={0: 'Purchase'})
submission.to_csv('Output/RFR.csv')

In [31]:
# Running random search on RFR
print('Starting random search on RFR at: ', datetime.datetime.now())

param_grid = { 
              "n_estimators": np.arange(200, 500, 50),
              "max_features" : ['auto'],
              "min_samples_leaf": np.arange(10,30,5),
              'max_depth' : np.arange(20, 60, 10)
}

rfr = RandomForestRegressor(n_jobs=-1, oob_score= True, random_state = 0)
print('classifier substantiated')
    
CV_rfr = RandomizedSearchCV(estimator=rfr, param_distributions=param_grid, cv=3)
print('search substantiated')
    
print("Start random fit: ", datetime.datetime.now())
CV_rfr.fit(X_train, y_train)
print("End random fit: ", datetime.datetime.now())

print('Best Params: ', CV_rfr.best_params_)
bp = CV_rfr.best_params_ # best parameters

rfr = RandomForestRegressor(
n_estimators = bp['n_estimators'], 
max_features= bp['max_features'],
min_samples_leaf= bp['min_samples_leaf'], 
max_depth=bp['max_depth'])
    
print('Classifier with best parameters set')
     
print("Start rfr fit: ", datetime.datetime.now())
rfr.fit(X_train, y_train)
print("End rfr fit: ", datetime.datetime.now())

print('RMSE for optimized rfr')
y_pred_train = rfr.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = rfr.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

print('Ending random search on RFR at: ', datetime.datetime.now())

classifier substantiated
search substantiated
Start random fit:  2018-03-29 22:14:27.473362
End random fit:  2018-03-29 23:36:12.473457
Best Params:  {'n_estimators': 450, 'min_samples_leaf': 25, 'max_features': 'auto', 'max_depth': 20}


In [11]:
bp = {'n_estimators': 450, 'min_samples_leaf': 25, 'max_features': 'auto', 'max_depth': 20}

rfr = RandomForestRegressor(
n_estimators = bp['n_estimators'], 
max_features= bp['max_features'],
min_samples_leaf= bp['min_samples_leaf'], 
max_depth=bp['max_depth'])
    
print('Classifier with best parameters set')
     
print("Start rfr fit: ", datetime.datetime.now())
rfr.fit(X_train, y_train)
print("End rfr fit: ", datetime.datetime.now())

print('RMSE for optimized rfr')
y_pred_train = rfr.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = rfr.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

print('Ending random search on RFR at: ', datetime.datetime.now())

Classifier with best parameters set
Start rfr fit:  2018-04-10 18:20:32.677055
End rfr fit:  2018-04-10 18:36:42.647419
RMSE for optimized rfr
Train RMSE: 2229.8058025971163
Test RMSE: 2439.522374395774
Ending random search on RFR at:  2018-04-10 18:37:39.082009


In [35]:
# Creating submission for RFR optimized from random search
y_pred = rfr.predict(test_)
y_pred = pd.DataFrame(y_pred)
submission = pd.concat([UP, y_pred], axis=1)
submission = submission.rename(columns={0: 'Purchase'})
submission.to_csv('RFR2.csv')
print('Test submission for RFR saved to RFR2.csv')

Test submission for RFR saved to RFR2.csv


In [37]:
# Running grid search on RFR
print('Starting grid search on RFR at: ', datetime.datetime.now())

param_grid = { 
              "n_estimators": np.arange(200, 500, 20),
              "max_features" : ['auto'],
              "min_samples_leaf": np.arange(10,30,2),
              'max_depth' : np.arange(20, 60, 5)
}

rfr = RandomForestRegressor(n_jobs=-1, oob_score= True, random_state = 0)
print('classifier substantiated')
    
CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=3)
print('search substantiated')
    
print("Start random fit: ", datetime.datetime.now())
CV_rfr.fit(X_train, y_train)
print("End random fit: ", datetime.datetime.now())

print('Best Params: ', CV_rfr.best_params_)
bp = CV_rfr.best_params_ # best parameters

rfr = RandomForestRegressor(
n_estimators = bp['n_estimators'], 
max_features= bp['max_features'],
min_samples_leaf= bp['min_samples_leaf'], 
max_depth=bp['max_depth'])
    
print('Classifier with best parameters set')
     
print("Start rfr fit: ", datetime.datetime.now())
rfr.fit(X_train, y_train)
print("End rfr fit: ", datetime.datetime.now())

print('RMSE for optimized rfr')
y_pred_train = rfr.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))
f
y_pred = rfr.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))


print('Starting grid search on RFR at: ', datetime.datetime.now())

classifier substantiated
search substantiated
Start random fit:  2018-03-30 00:14:08.745511


KeyboardInterrupt: 

In [None]:
# Creating submission csv for grid search optimized rfr model
y_pred = rfr.predict(test_)
y_pred = pd.DataFrame(y_pred)
submission = pd.concat([UP, y_pred], axis=1)
submission = submission.rename(columns={0: 'Purchase'})
submission.to_csv('Output/RFR3.csv')
print('Test submission for RFR saved to RFR3.csv')

### Multi-layer Perceptron

In [12]:
# MLPR 
print('Starting MLPR build at: ', datetime.datetime.now())
mlp = MLPRegressor(random_state = 0) 
mlp.fit(X_train, y_train)
print('mlpr model substantiated and fitted at: ', datetime.datetime.now())

print('RMSE for MLPR model: ')
y_pred_train = mlp.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = mlp.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

Starting MLPR build at:  2018-04-10 20:17:55.069664
mlpr model substantiated and fitted at:  2018-04-10 20:21:59.215550
RMSE for MLPR model: 
Train RMSE: 2467.6250463142665
Test RMSE: 2484.1670203047875


In [None]:
# Creating submission csv for MLPR
y_pred_submission = mlp.predict(test_)
y_pred_DF = pd.DataFrame(y_pred_submission)
submission = pd.concat([UP, y_pred_DF], axis=1)
submission = submission.rename(columns={0: 'Purchase'})
submission.to_csv('Output/MLPR1.csv')
print('Sumbission for MLPR model saved to MLPR1.csv')

In [20]:
# Running random search on MLPR
print('Starting MLPR random search at: ', datetime.datetime.now())

param_grid = { 
    "hidden_layer_sizes" : [1 ,2, 3],
    "alpha" : [.0001, .001, .01],
    "batch_size" : [1, 10, 50, 100, 250],
    "max_iter"  : np.arange(1250, 2250, 250)
            }

mlp = MLPRegressor(random_state = 0) 

rs_mlp = RandomizedSearchCV(estimator=mlp, param_distributions=param_grid, cv= 3)

rs_mlp.fit(X_train, y_train)
print('data fitted')
    
bp = rs_mlp.best_params_ # best parameters
print(rs_mlp.best_params_)

    
mlp = MLPRegressor(random_state=0, hidden_layer_sizes = bp['hidden_layer_sizes'], 
alpha=bp['alpha'],
batch_size=bp['batch_size'], 
max_iter=bp['max_iter'])

print('regressor substantiated with best params')
    
mlp.fit(X_train, y_train)
print('data fitted with best params')

print('RMSE for optimized mlpr')
y_pred_train = mlp.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = mlp.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

print('Ending MLPR random search at: ', datetime.datetime.now())


Starting MLPR random search at:  2018-04-10 23:22:27.061922
data fitted
{'max_iter': 1750, 'hidden_layer_sizes': 3, 'batch_size': 10, 'alpha': 0.0001}
regressor substantiated with best params
data fitted with best params
RMSE for optimized mlpr
Train RMSE: 2473.719212829708
Test RMSE: 2489.7192562620767
Ending MLPR random search at:  2018-04-11 03:38:28.556120


In [16]:
bp = {'max_iter': 1250, 'hidden_layer_sizes': 2, 'batch_size': 1, 'alpha': 0.001}

mlp = MLPRegressor(random_state=0, hidden_layer_sizes = bp['hidden_layer_sizes'], 
alpha=bp['alpha'],
batch_size=bp['batch_size'], 
max_iter=bp['max_iter'])

print('regressor substantiated with best params')
    
mlp.fit(X_train, y_train)
print('data fitted with best params')

print('RMSE for optimized mlpr')
y_pred_train = mlp.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = mlp.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

print('Ending MLPR random search at: ', datetime.datetime.now())

regressor substantiated with best params
data fitted with best params
RMSE for optimized mlpr
Train RMSE: 2483.3996341888383
Test RMSE: 2499.4141435559336
Ending MLPR random search at:  2018-04-10 22:00:23.515630


In [None]:
# Creating submission csv for random search optimized MLPR
y_pred_submission = mlp.predict(test_)
y_pred_DF = pd.DataFrame(y_pred_submission)
submission = pd.concat([UP, y_pred_DF], axis=1)
submission = submission.rename(columns={0: 'Purchase'})
submission.to_csv('Output/MLPR2.csv')
print('Sumbission for xgr model saved to MLPR2.csv')

### Gradient Boosting Regressor

In [13]:
# GBR
print('Starting GBR model building at: ', datetime.datetime.now())

gbr = GradientBoostingRegressor(random_state =0)
print('regressor substaintiated')

gbr.fit(X_train, y_train)
print('data fitted')

print('RMSE for optimized mlpr')
y_pred_train = gbr.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = gbr.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

print('Ending GBR random search at: ', datetime.datetime.now())

Starting GBR model building at:  2018-04-10 20:22:01.316848
regressor substaintiated
data fitted
RMSE for optimized mlpr
Train RMSE: 2432.4198217573903
Test RMSE: 2453.141669366565
Ending GBR random search at:  2018-04-10 20:22:44.575913


In [None]:
# Creating submission csv for GBR
y_pred_submission = gbr.predict(test_)
y_pred_DF = pd.DataFrame(y_pred_submission)
submission = pd.concat([UP, y_pred_DF], axis=1)
submission = submission.rename(columns={0: 'Purchase'})
submission.to_csv('Output/GBR1.csv')
print('Sumbission for gbr model saved to GBR1.csv')

In [21]:
# Running random search on GBR
print('Starting GBR random search at: ', datetime.datetime.now())

param_grid = {'learning_rate':  [.01, .05, 0.15, 0.3, 0.5],
              'max_depth': [8, 9, 10, 11, 12],
              'min_samples_leaf': [ 15, 18, 21, 24],
              'max_features': np.arange(2, 17, 3)
              }  

gbr = GradientBoostingRegressor(random_state =0)
print('regressor substaintiated')

CV_gbr = RandomizedSearchCV(estimator=gbr, param_distributions=param_grid, cv=3)
print('search substantiated')

CV_gbr.fit(X_train, y_train)
print('data fitted')
    
bp = CV_gbr.best_params_ # best parameters
print(CV_gbr.best_params_)

bp = CV_gbr.best_params_ # best parameters
    
gbr = GradientBoostingRegressor(learning_rate = bp['learning_rate'], 
    max_features=bp['max_features'],
    min_samples_leaf=bp['min_samples_leaf'], 
    max_depth=bp['max_depth'])
    
print('regressor substantiated with best params')
    
gbr.fit(X_train, y_train)
print('data fitted with best params') 

print('RMSE for optimized mlpr')
y_pred_train = gbr.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = gbr.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

print('Ending GBR random search at: ', datetime.datetime.now())


Starting GBR random search at:  2018-04-11 03:38:28.587874
regressor substaintiated
search substantiated
data fitted
{'min_samples_leaf': 15, 'max_features': 8, 'max_depth': 12, 'learning_rate': 0.05}
regressor substantiated with best params
data fitted with best params
RMSE for optimized mlpr
Train RMSE: 2153.181622905302
Test RMSE: 2422.9038270298124
Ending GBR random search at:  2018-04-11 05:52:57.668978


In [24]:
# Creating random search optimized submission csv for GBR
y_pred_submission = gbr.predict(test_)
y_pred_DF = pd.DataFrame(y_pred_submission)
submission = pd.concat([UP, y_pred_DF], axis=1)
submission = submission.rename(columns={0: 'Purchase'})
submission.to_csv('Output/GBR411.csv')
print('Sumbission for gbr model saved to GBR2.csv')

Sumbission for gbr model saved to GBR2.csv


In [None]:
# Running grid search on GBR
print('Starting GBR grid search at: ', datetime.datetime.now())

param_grid = {'learning_rate':  [.01, .05, 0.15, 0.3, 0.5],
              'max_depth': [8, 9, 10, 11, 12],
              'min_samples_leaf': [ 15, 18, 21, 24],
              'max_features': np.arange(25, 50, 5)
              }  

gbr = GradientBoostingRegressor(random_state =0)
print('regressor substaintiated')

CV_gbr = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=3)
print('search substantiated')

CV_gbr.fit(X_train, y_train)
print('data fitted')
    
bp = CV_gbr.best_params_ # best parameters
print(CV_gbr.best_params_)

bp = CV_gbr.best_params_ # best parameters
    
gbr = GradientBoostingRegressor(learning_rate = bp['learning_rate'], 
    max_features=bp['max_features'],
    min_samples_leaf=bp['min_samples_leaf'], 
    max_depth=bp['max_depth'])
    print('regressor substantiated with best params')
    
gbr.fit(X_train, y_train)
print('data fitted with best params') 

print('RMSE for optimized mlpr')
y_pred_train = gbr.predict(X_train)
rmseTrain = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE: {}".format(rmseTrain))

y_pred = gbr.predict(X_test)
rmseTest = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: {}".format(rmseTest))

print('Ending GBR random search at: ', datetime.datetime.now())

In [None]:
# Creating submission csv for grid search optimized gbr model
y_pred_submission = gbr.predict(test_)
y_pred_DF = pd.DataFrame(y_pred_submission)
submission = pd.concat([UP, y_pred_DF], axis=1)
submission = submission.rename(columns={0: 'Purchase'})
submission.to_csv('Output/GBR3.csv')
print('Sumbission for gbr model saved to GBR3.csv')