# Introduction

This notebook demonstrates a basic approach to machine learning.  

The notebook reads in the 'cold starts' and the predictions file generated from R, does a bit more data cleaning (which turned out to be necessary), fits two models - a GridSearchCV'ed Random Forest and a model from TPOT - automated hyperparameter tuning. (For more information on TPOT, see here: https://github.com/EpistasisLab/tpot) The model from TPOT is not run in the final iteration - it took too long on my machine.    

Also note that although random forest tends to provide a 'holdout set' for free due to the randomization effect (using 2/3rds of the data), I prefer to hold an additional holdoutset just for my own testing.  For a more detailed discussion, see: https://datascience.stackexchange.com/questions/6510/does-modeling-with-random-forests-requre-cross-validation 

The code also compares the gain in predictive accuracy from using the earlier generated clusters versus modeling the entire dataset as one.

**Note**

The code is geared towards readability rather than optimization.  

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from tpot import TPOTRegressor

df = pd.read_csv('cold_starts.csv')
df.drop_duplicates(inplace=True)

## Data cleaning on the cold_starts dataset

In [2]:
## Remove unecessary columns
Style_display_code = df.pop('Style_display_code')
SesnyrCd = df.pop('SesnYrCd')

## Get rid of pesky np.infs and NaN's
df = df.apply(lambda x: x[np.isfinite(x)].dropna())
df.dropna(inplace=True)

## Split of target - first twelve weeks of spending!
y = df.pop('first_twelve_weeks')

df.head()

Unnamed: 0,bookings,lagged_bookings,age_desc.ADULT,age_desc.GRADE.SCHOOL,age_desc.INFANT,age_desc.PRE.SCHOOL,age_desc.TODDLER,gender_desc.FEMALE,gender_desc.MALE,gender_desc.UNISEX,...,product_family.SERIES.Z668,product_family.SERIES.Z767,product_family.SERIES.Z840,product_family.SERIES.Z852,product_family.SERIES.Z927,product_family.SERIES.Z94,product_family.SERIES.Z943,product_family.SERIES.Z970,product_family.SERIES.Z989,cluster
0,418625,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1,66127,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,1519206,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3141,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,56666,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Run Grid Search and Evaluate Clusters

In [3]:
def run_grid_search(cluster = None, use_clusters = True, use_TPOT = False):
        
    ## Select Rows
    if use_clusters:
        row_index = df.loc[df['cluster'] == cluster].index
        
    ##Generate train/test splits
        X_train, X_test, y_train, y_test = train_test_split(
        df.loc[row_index], y.loc[row_index], test_size=0.8, random_state=0)
    
    else:
        row_index=np.arange(df.shape[0]) # Just for printing purposes
        
        X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=0.8, random_state=0)
    
    ## Using TPOT
    if use_TPOT:
        tpot = TPOTRegressor(generations=3, population_size=50, verbosity=2)
        tpot.fit(X_train, y_train)
        
        print('TPOT Model: \n')
        ## Print the pipeline TPOT discovered
        tpot.export('tpot_nike_pipeline.py')
        
        predicted = tpot.predict(X_test)
        
        r2 = r2_score(y_test, predicted)
        print("Cluster = {}, num_obs = {}, r2 on holdout set = {}".format(cluster, len(row_index), r2))

        
        ## Return the whole object
        return tpot
    
    ## Using Random Forest
    else:
        ##Supply a couple of parameters to tune
        tuned_parameters = [{'max_features': ['auto', 'sqrt','log2'], 'n_estimators': [10, 20, 50, 100]}]

        ## Run gridsearch with 3 cuts
        clf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=3)
        clf.fit(X_train, y_train)

        print("Cluster =  {}, Best parameters set found on development set:{}".format(cluster, clf.best_params_))

        ## Train using best_params and compare results on holdout set
        rf = RandomForestRegressor(n_estimators=clf.best_params_['n_estimators'], 
                                   max_features = clf.best_params_['max_features'], 
                                   oob_score=True, random_state = 0 )
        rf.fit(X_train,y_train)

        ## Final r2 on holdout set
        predicted = rf.predict(X_test)
        
        
    r2 = r2_score(y_test, predicted)
    
    print("Cluster = {}, num_obs = {}, r2 on holdout set = {}".format(cluster, len(row_index), r2))
    return rf
    

### Lets first run the code on the entire dataset, using RF, ignoring the clustering approach.

In [4]:
model_full_data_set = run_grid_search(use_clusters=False)

Cluster =  None, Best parameters set found on development set:{'max_features': 'auto', 'n_estimators': 20}
Cluster = None, num_obs = 5073, r2 on holdout set = 0.35404994219140706


### Now lets run it on a cluster by cluster basis - A notable improvement (albeit from a low base!)

In [5]:
models_dict = {}
clusters = df['cluster'].unique()
for cluster in clusters:
    models_dict['cluster_' + str(cluster)] = run_grid_search(cluster)

Cluster =  4.0, Best parameters set found on development set:{'max_features': 'auto', 'n_estimators': 50}
Cluster = 4.0, num_obs = 508, r2 on holdout set = 0.4956905788998569
Cluster =  1.0, Best parameters set found on development set:{'max_features': 'sqrt', 'n_estimators': 100}
Cluster = 1.0, num_obs = 3324, r2 on holdout set = 0.4896025217549459
Cluster =  2.0, Best parameters set found on development set:{'max_features': 'log2', 'n_estimators': 100}
Cluster = 2.0, num_obs = 1241, r2 on holdout set = 0.4311277788123786


### Compare with a run over many sklearn regressor algorithms (exlcuding XGBoost, which should be first among equals)

Just because I think hyperparameter tuning should be automatic, here is a comparison to TPOT results.

In [None]:
## Not run

## Using no clustering
run_grid_search(use_clusters=False, use_TPOT = True)

## Using clusters
for cluster in clusters:
    run_grid_search(cluster, use_TPOT = True)

## Generate Predictions for the Scoring Dataset

### Read in our predictions dataframe and do some additional cleaning

In [6]:
## Pivot the SesnYrCd variable
scorings = pd.read_csv('predictions.csv')
to_pivot = scorings[['Style_display_code', 'SesnYrCd','bookings']].copy() # Super inefficient, but gets the job done

scorings = scorings.drop(['SesnYrCd','bookings'], axis=1)
scorings = scorings.drop_duplicates()

to_pivot = to_pivot.pivot(index='Style_display_code', columns='SesnYrCd',values='bookings').drop_duplicates()
to_pivot.reset_index(inplace=True)

to_pivot = to_pivot.rename(index=str, columns={"SP2016": "lagged_bookings", "SU2016": "bookings"}).fillna(0)
scorings = pd.merge(scorings, to_pivot[['Style_display_code','bookings','lagged_bookings']], how='left')

## Extract only rows that have either bookings or lagged_bookings
scorings = scorings.loc[scorings[['bookings', 'lagged_bookings']].notnull().all(axis=1)]

## Filter out scorings that are already 0
scorings = scorings.loc[scorings['predictions'].isnull()]

## Add in cluster information 
df['Style_display_code'] = Style_display_code
scorings = pd.merge(scorings, df[['Style_display_code', 'cluster']], on = 'Style_display_code', how='left')
scorings['cluster'] = scorings['cluster'].fillna(0)
scorings = scorings.drop_duplicates()

print('Number of observations to be scored: ', scorings.shape[0])
scorings.head()

Number of observations to be scored:  1235


Unnamed: 0,Style_display_code,predictions,age_desc.ADULT,age_desc.GRADE.SCHOOL,age_desc.INFANT,age_desc.PRE.SCHOOL,age_desc.TODDLER,gender_desc.FEMALE,gender_desc.MALE,gender_desc.UNISEX,...,product_family.SERIES.Z840,product_family.SERIES.Z852,product_family.SERIES.Z927,product_family.SERIES.Z94,product_family.SERIES.Z943,product_family.SERIES.Z970,product_family.SERIES.Z989,bookings,lagged_bookings,cluster
0,NCN8564,,15.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187466.0,190946.0,0.0
1,YIQ8866,,0.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,304971.0,368043.0,0.0
2,TRC2287,,0.0,14.0,0.0,0.0,0.0,0.0,14.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,342937.0,405211.0,0.0
3,VWI5319,,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3166.0,428051.0,1.0
4,XEQ7328,,0.0,0.0,0.0,43.0,0.0,0.0,43.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1291.0,9655.0,0.0


## Fill in our predictions column with our models

In [7]:
## Use the full data set to predict objects that are not in a cluster
mask = scorings['predictions'].isnull() & scorings['cluster']==0
scorings['predictions'].loc[mask] = model_full_data_set.predict(scorings[mask].iloc[:, 2:])

## Predict any scorings that are in a cluster
for cluster in clusters:
    mask = scorings['predictions'].isnull() & scorings['cluster'] == cluster
    
    if scorings[mask].shape[0] > 0:
        scorings['predictions'].loc[mask] = models_dict['cluster_' + str(cluster)].predict(scorings[mask].iloc[:, 2:])

scorings[['Style_display_code', 'predictions']].head()

Unnamed: 0,Style_display_code,predictions
0,NCN8564,3532.65
1,YIQ8866,3887.8
2,TRC2287,3532.65
3,VWI5319,7414.93
4,XEQ7328,1008.15


## Save down to CSV for submission to Jin

In [8]:
scorings[['Style_display_code', 'predictions']].to_csv('scoring_list.csv')