# JON'S HiML Competition RFG_Log Model (v 2.5)
## Make Date: 04/13/18
This model will be updated to include ideas from EDA onto the RFG model. 
Some base features include:  

1. This ML model was built using scikit's RandomForestRegressor model.  
1. Automatically find the best combination of input parameters using the combo-picker function    
1. Let's implement log transform on the quantities to improve score.
1. From Monday (4/9/17) meetup, we know Customer 7 skews the data. So what if we make 2 different models: 1 for Customer 7 and another for everyone else?



In [2]:
#Some initialization procedures:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# load in data files
# FILE_DIR = '../input/hawaiiml-data'
# for f in os.listdir(FILE_DIR):
#     print('{0:<30}{1:0.2f}MB'.format(f, 1e-6*os.path.getsize(f'{FILE_DIR}/{f}')))
FILE_DIR = '../Sprint09alt_Machine_Learning_Hawaii_Kaggle_Competition'
df_train = pd.read_csv(f'train.csv', encoding='ISO-8859-1') #write training data to dataframe
df_test = pd.read_csv(f'test.csv', encoding='ISO-8859-1') # Read the test data

#define the error function:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_pred))**2))

# Build Features

In [2]:
#convert date strings to datatime datatypes:
df_train['date'] = pd.to_datetime(df_train['date'])
df_test['date'] = pd.to_datetime(df_test['date'])

df_train = df_train.assign(day_of_week = df_train.date.dt.dayofweek)
df_test = df_test.assign(day_of_week = df_test.date.dt.dayofweek)

In [3]:
df_train = df_train.assign(is_day0To2 = 0)
df_train['is_day0To2'] = 0
df_train.loc[(df_train['day_of_week'] < 3),['is_day0To2']]  = 1

df_train = df_train.assign(is_day3 = 0)
df_train['is_day3'] = 0
df_train.loc[(df_train['day_of_week'] == 3),['is_day3']]  = 1

df_train = df_train.assign(is_day4 = 0)
df_train['is_day4'] = 0
df_train.loc[(df_train['day_of_week'] == 4),['is_day4']]  = 1

df_train = df_train.assign(is_day6 = 0)
df_train['is_day6'] = 0
df_train.loc[(df_train['day_of_week'] == 6),['is_day6']]  = 1

In [4]:
df_train = df_train.loc[(df_train['unit_price'] > 0) & (df_train['unit_price'] < 10000) ]
df_train = df_train.assign(unit_price_log1p = np.log1p(df_train.unit_price))
df_test = df_test.assign(unit_price_log1p = np.log1p(df_test.unit_price))

In [5]:
df_train = df_train.assign(uplog1p_quant1 = 0)
df_train.loc[(df_train['unit_price_log1p'] < 6.47774129795256),['uplog1p_quant1']]  = 1
df_test = df_test.assign(uplog1p_quant1 = 0)
df_test.loc[(df_test['unit_price_log1p'] < 6.47774129795256),['uplog1p_quant1']]  = 1

In [6]:
df_train['time'] = pd.to_datetime(df_train['time'])
df_test['time'] = pd.to_datetime(df_test['time'])
df_train = df_train.assign(hr = df_train.time.dt.hour)
df_test = df_test.assign(hr = df_test.time.dt.hour)

In [7]:
df_train = df_train.assign(invoice_id_log1p = np.log1p(df_train.invoice_id))
df_test = df_test.assign(invoice_id_log1p = np.log1p(df_test.invoice_id))

# Split Data Between Customer 7 and Everyone Else  

In [8]:
#split training data between customer 7 and everyone else:
df_train_7 = df_train.loc[df_train['customer_id'] == 7] #customer 7 only
df_train_no7 = df_train.loc[df_train['customer_id'] != 7] #everyone else

#split test data:
df_test_7 = df_test.loc[df_test['customer_id'] == 7] #customer 7 only
df_test_no7 = df_test.loc[df_test['customer_id'] != 7] #everyone else

# Define Training Target Data:
1. We want to predict the quantity data field.    
1. By convention, we define this target as 'y'.  
1. From model 2.02, we know that doing a log transform on the tartget reduces skewness and improves score.  

In [9]:
y = df_train.quantity
logy_7 = np.log1p(df_train_7.quantity) #take log of quantities
logy_no7 = np.log1p(df_train_no7.quantity) #take log of quantities

# Define ML Predictors
build a function to go through each combination of predictors

### Here is the list of columns we can choose predictors from. 

In [10]:
print('Column Names & Data Types: \n', df_train.dtypes)

Column Names & Data Types: 
 id                           int64
date                datetime64[ns]
time                datetime64[ns]
invoice_id                   int64
stock_id                     int64
customer_id                  int64
country                     object
description                 object
unit_price                 float64
quantity                     int64
day_of_week                  int64
is_day0To2                   int64
is_day3                      int64
is_day4                      int64
is_day6                      int64
unit_price_log1p           float64
uplog1p_quant1               int64
hr                           int64
invoice_id_log1p           float64
dtype: object


## Define feature combination picker

In [11]:
ls_AllPredictors = ['invoice_id', 'stock_id', 'customer_id', 'unit_price_log1p', 'uplog1p_quant1', 
                    'day_of_week', 'is_day6', 'hr']

# https://stackoverflow.com/questions/464864/how-to-get-all-possible-combinations-of-a-list-s-elements
from itertools import chain, combinations
def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)  # allows duplicate elements
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
#build combos:
ls_PredictorCombos = [list(combo[1]) for combo in enumerate(powerset(ls_AllPredictors), 1)]
#Define combo getting function:
def GetX(comboID, adf_train):
    #get X values in adf_train for the given list of predictors
    return adf_train[ls_PredictorCombos[comboID]]

# Implement Random Forest Regressor Model on Each Predictor Combo
## Define Some Helper Functions

In [12]:
def Run_RFG(SeedVal, X, logy):
    #fit, predict, and then evaluate a model using passed in values (e.g. training set)
    #return back RMSLE value and trained model

    myModel = RandomForestRegressor(n_estimators=200, max_features = 'auto',n_jobs = -1)
    train_X, val_X, train_y, val_y = train_test_split(X, logy,random_state = SeedVal) #split training data into a test and train part
    myModel.fit(train_X, train_y)
    predicted_vals = np.expm1(myModel.predict(val_X)) #transform predicted values from log to "normal" Y value
    return rmsle(np.expm1(val_y), predicted_vals), myModel #include transform of val_Y values from log to "normal" Y value

In [131]:
def FindBestCombo_Model(adf_train, logy):
    #iteratively search for best parameter combinations to match X values in adf_train with logy
    #return best combo, error results of each model, training error, and model
    MinErr = 100000000
    df_Track = pd.DataFrame()
    for comboID in range(1,len(ls_PredictorCombos)):
        SeedVal = 6
        X = GetX(comboID, adf_train)
        TrainErr,myModel = Run_RFG(SeedVal,X,logy)
        df2 = pd.DataFrame([[comboID, TrainErr, ','.join(ls_PredictorCombos[comboID])]],columns=['ComboID','err','Preds'])
        if df_Track.shape[0] >0:
            df_Track = pd.concat([df2, df_Track])
        else:
            df_Track = df2.copy(deep = True)
        if TrainErr < MinErr:
            MinErr = TrainErr
    #         BestSeedVal = SeedVal
            bestComboID = comboID
            print ('Best Combo: ', comboID, ' Params: ', ls_PredictorCombos[comboID], ' Err: ', TrainErr)
    #train with best combo:
    X = GetX(bestComboID, adf_train)
    TrainErr,myModel = Run_RFG(0,X,logy)
    print ('fin')
    return ls_PredictorCombos[bestComboID], df_Track, TrainErr, myModel

### Build Customer 7 and Everyone Else Models:
1. Find Best Predictors
1. Define Separate Models

In [138]:
ls_mypredictors_7, df_Track_7, TrainErr_7, myModel_7 = FindBestCombo_Model(df_train_7, logy_7) #train customer 7 model
ls_mypredictors_no7, df_Track_no7,  TrainErr_no7, myModel_no7 = FindBestCombo_Model(df_train_no7, logy_no7) #train everyone else model

Best Combo:  1  Params:  ['invoice_id']  Err:  0.557969876249
Best Combo:  12  Params:  ['invoice_id', 'uplog1p_quant1']  Err:  0.557849261932
Best Combo:  14  Params:  ['invoice_id', 'is_day6']  Err:  0.557827684602
Best Combo:  15  Params:  ['invoice_id', 'hr']  Err:  0.557055701861
Best Combo:  17  Params:  ['stock_id', 'unit_price_log1p']  Err:  0.500776934454
Best Combo:  63  Params:  ['stock_id', 'unit_price_log1p', 'uplog1p_quant1']  Err:  0.500290695388
Best Combo:  130  Params:  ['stock_id', 'customer_id', 'unit_price_log1p', 'is_day6']  Err:  0.499158143675
fin
Best Combo:  1  Params:  ['invoice_id']  Err:  0.718939068669
Best Combo:  10  Params:  ['invoice_id', 'customer_id']  Err:  0.709790467302
Best Combo:  22  Params:  ['customer_id', 'unit_price_log1p']  Err:  0.592290890834
Best Combo:  73  Params:  ['customer_id', 'unit_price_log1p', 'uplog1p_quant1']  Err:  0.591829907902
fin


## Observations

In [139]:
display(df_Track_7.sort_values(by=['err']))
display(df_Track_no7.sort_values(by=['err']))

Unnamed: 0,ComboID,err,Preds
0,130,0.499158,"stock_id,customer_id,unit_price_log1p,is_day6"
0,139,0.500087,"stock_id,unit_price_log1p,uplog1p_quant1,is_day6"
0,63,0.500291,"stock_id,unit_price_log1p,uplog1p_quant1"
0,199,0.500752,"stock_id,customer_id,unit_price_log1p,uplog1p_..."
0,17,0.500777,"stock_id,unit_price_log1p"
0,58,0.501359,"stock_id,customer_id,unit_price_log1p"
0,65,0.501796,"stock_id,unit_price_log1p,is_day6"
0,128,0.501928,"stock_id,customer_id,unit_price_log1p,uplog1p_..."
0,198,0.527774,"stock_id,customer_id,unit_price_log1p,uplog1p_..."
0,208,0.527956,"stock_id,unit_price_log1p,uplog1p_quant1,day_o..."


Unnamed: 0,ComboID,err,Preds
0,73,0.591830,"customer_id,unit_price_log1p,uplog1p_quant1"
0,22,0.592291,"customer_id,unit_price_log1p"
0,149,0.597164,"customer_id,unit_price_log1p,uplog1p_quant1,is..."
0,75,0.598475,"customer_id,unit_price_log1p,is_day6"
0,235,0.600095,"invoice_id,customer_id,unit_price_log1p,uplog1..."
0,253,0.600310,"invoice_id,customer_id,unit_price_log1p,uplog1..."
0,187,0.602442,"invoice_id,customer_id,unit_price_log1p,day_of..."
0,237,0.603920,"invoice_id,customer_id,unit_price_log1p,day_of..."
0,246,0.603951,"customer_id,unit_price_log1p,uplog1p_quant1,da..."
0,213,0.604117,"customer_id,unit_price_log1p,uplog1p_quant1,da..."


# Train models using best found predictor combo

In [15]:
# Use separate models to diferentiate between customer 7 and everyone else.
from sklearn.ensemble import RandomForestRegressor
#best predictors:
ls_mypredictors_7 = ['stock_id', 'unit_price_log1p', 'is_day6']
ls_mypredictors_no7 = ['customer_id', 'unit_price_log1p', 'uplog1p_quant1'] 

#split training set into a subset of training and validation data:
train_X_7, val_X_7, train_y_7, val_y_7 = train_test_split(df_train_7[ls_mypredictors_7], logy_7, random_state = 0) #split training data into a test and train part
train_X_no7, val_X_no7, train_y_no7, val_y_no7 = train_test_split(df_train_no7[ls_mypredictors_no7], logy_no7, random_state = 0) #split training data into a test and train part

#define and fit models:
myModel_7 = RandomForestRegressor(n_estimators=1000, max_leaf_nodes=15000, max_features = 'auto',n_jobs = -1)
myModel_7.fit(train_X_7, train_y_7)
myModel_no7 = RandomForestRegressor(n_estimators=1000, max_leaf_nodes=15000, max_features = 'auto',n_jobs = -1)
myModel_no7.fit(train_X_no7, train_y_no7)
    
#make predictions using validation data:    
# df_train_7_pred = df_train_7.copy(deep = True) #copy test dataframe
val_X_7 = val_X_7.assign(pred_quantity = np.expm1(myModel_7.predict(val_X_7[ls_mypredictors_7])) )#predict and transform from log to "normal" Y value
# df_train_no7_pred = df_train_no7.copy(deep = True) #copy test dataframe
val_X_no7 = val_X_no7.assign(pred_quantity = np.expm1(myModel_no7.predict(val_X_no7[ls_mypredictors_no7]))) #predict and transform from log to "normal" Y value

#merge results
df_preds = pd.concat([val_X_7, val_X_no7]) #concatenate predictions into 1 dataframe
df_preds = pd.DataFrame (df_preds['pred_quantity'])
df_preds.sort_index(inplace=True)

df_preds['pred_quantity'] = round(df_preds['pred_quantity'],0) #round to nearest whole value

df_y = np.expm1(pd.concat([val_y_7,val_y_no7])) #concat y vals into 1 dataframe
df_y.sort_index(inplace=True)

print ('RMSLE score: ', rmsle(df_y, df_preds['pred_quantity']))

RMSLE score:  0.5682113765398783


In [162]:
print (myModel_7.feature_importances_)
print (myModel_no7.feature_importances_)

[ 0.64557884  0.34338557  0.01103559]
[ 0.62476342  0.37523658  0.        ]


# Submit Model's Predictions

## First, output model's predictions for test data set:

In [124]:
df_test_7_pred = df_test_7.copy(deep = True) #copy test dataframe    
df_test_7_pred['pred_quantity'] = np.expm1(myModel_7.predict(df_test_7[ls_mypredictors_7])) #predict and transform from log to "normal" Y value

df_test_no7_pred = df_test_no7.copy(deep = True) #copy test dataframe
df_test_no7_pred['pred_quantity'] = np.expm1(myModel_no7.predict(df_test_no7[ls_mypredictors_no7])) #predict and transform from log to "normal" Y value

df_preds = pd.concat([df_test_7_pred, df_test_no7_pred]) #concatenate predictions into 1 dataframe
df_preds['pred_quantity'] = round(df_preds['pred_quantity'],0) #round to nearest whole value

# # We will look at the predicted prices to ensure we have something sensible.
display(df_preds)

Unnamed: 0,id,date,time,invoice_id,stock_id,customer_id,country,description,unit_price,day_of_week,unit_price_log1p,uplog1p_quant1,invoice_id_log1p,pred_quantity
11,196540,2011-11-16,10:18,30,29,7,unspecified,circus parade lunch box,1.95,2,1.081805,1,3.433987,2.0
20,72496,2011-01-12,9:26,57,55,7,united kingdom,s/4 black mini rose candle in bowl,1.66,2,0.978326,1,4.060443,1.0
29,62221,2011-06-02,17:26,79,75,7,united kingdom,edwardian parasol natural,12.46,3,2.599722,1,4.382027,1.0
30,298410,2011-06-21,17:08,82,17,7,united kingdom,set of 60 i love london cake cases,1.25,1,0.810930,1,4.418841,2.0
33,163787,2011-03-22,16:55,92,87,7,united kingdom,hanging heart zinc t-light holder,2.08,1,1.124930,1,4.532599,2.0
36,207595,2011-06-10,14:01,99,93,7,united kingdom,charlotte bag suki design,2.46,4,1.241269,1,4.605170,76.0
38,215524,2011-03-09,16:32,107,101,7,united kingdom,red retrospot small milk jug,4.96,2,1.785070,1,4.682131,1.0
48,452939,2010-12-21,15:20,158,147,7,united kingdom,childs breakfast set circus parade,16.98,1,2.889260,1,5.068904,1.0
50,88914,2011-06-28,11:45,163,112,7,united kingdom,suki shoulder bag,4.13,1,1.635106,1,5.099866,2.0
58,425955,2011-08-31,15:32,196,178,7,united kingdom,red metal box top secret,16.63,2,2.869602,1,5.283204,2.0


## Next, submit predicted values

In [None]:
my_submission = pd.DataFrame({'Id': df_preds.id, 'quantity': df_preds.pred_quantity})
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)