In [1]:
#standard packages
import pandas as pd
import numpy as np
#package for loop load time bar
from tqdm import tqdm
#statistical and machine learning packages
import scipy as sp
from sklearn.preprocessing import OneHotEncoder

#import scoring for machine learning
from sklearn import metrics
from sklearn.metrics import roc_curve, auc,precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# packages for plotting 
import matplotlib.pyplot as plt
import seaborn as sns
#command to have graphs display in notebook
%matplotlib inline

In [2]:
#loading user order information
instacart_file=pd.read_csv('Capstone Project 1/Data/orders.csv')
df_orders=pd.DataFrame(instacart_file,)
df_orders.head(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [3]:
df_orders.nunique()

order_id                  3421083
user_id                    206209
eval_set                        3
order_number                  100
order_dow                       7
order_hour_of_day              24
days_since_prior_order         31
dtype: int64

In [4]:
#capture the last order information for each user
g = df_orders.groupby('user_id')
data_p1=g.last()
data_p1['order_from_last']=1

In [5]:
#capture the second to last order information for each user
data_p2=g.nth(-2)
data_p2['order_from_last']=2

In [6]:
#capture the third to last order information for each user
data_p3=g.nth(-3)
data_p3['order_from_last']=3

In [7]:
#capture the fourth to last order information for each user
data_p4=g.nth(-4)
data_p4['order_from_last']=4

In [8]:
#join all information into one data set
data_set=pd.concat([data_p1,data_p2,data_p3,data_p4])

In [9]:
data_set=data_set.reset_index()

In [10]:
#loading product reorder information
instacart_file2=pd.read_csv('Capstone Project 1/Data/order_products__prior.csv')
df_prod_orders=pd.DataFrame(instacart_file2,)
df_prod_orders.head()



Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


#### playing around with one hot for prod

In [11]:
#df_prod_orders.pivot(index='order_id',columns='product_id', values='reordered')

#########

In [12]:
#loading information for product names
instacart_products=pd.read_csv('Capstone Project 1/Data/products.csv')
df_prod=pd.DataFrame(instacart_products,)
df_prod.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [13]:
df_prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49688 entries, 0 to 49687
Data columns (total 4 columns):
product_id       49688 non-null int64
product_name     49688 non-null object
aisle_id         49688 non-null int64
department_id    49688 non-null int64
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [14]:
#merge dataframes to get user_id with product_id and reorder in same dataframe
#use inner to get the intersection in order to preserve test set
df_rf_data=pd.merge(df_prod_orders,data_set, how= 'right',left_on="order_id", right_on='order_id')
df_rf_data.head()


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,days_since_prior_order,eval_set,order_dow,order_from_last,order_hour_of_day,order_number
0,7,34050.0,1.0,0.0,142903,30.0,prior,2,3,14,11
1,7,46802.0,2.0,0.0,142903,30.0,prior,2,3,14,11
2,14,20392.0,1.0,1.0,18194,3.0,prior,3,4,15,49
3,14,27845.0,2.0,1.0,18194,3.0,prior,3,4,15,49
4,14,162.0,3.0,1.0,18194,3.0,prior,3,4,15,49


In [15]:
#does not have reorder information will have to use on test set
# will have to create reorder column to use in scoring
extra_data=df_rf_data[df_rf_data['order_from_last']==1]

In [16]:
df_rf_data.nunique()

order_id                  824836
product_id                 47702
add_to_cart_order            121
reordered                      2
user_id                   206209
days_since_prior_order        31
eval_set                       3
order_dow                      7
order_from_last                4
order_hour_of_day             24
order_number                 100
dtype: int64

In [17]:
#designate a training to consist of previous orders 4 and add column to track 
df_rf_train=df_rf_data[ (df_rf_data['order_from_last']==4 )]
df_rf_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,days_since_prior_order,eval_set,order_dow,order_from_last,order_hour_of_day,order_number
2,14,20392.0,1.0,1.0,18194,3.0,prior,3,4,15,49
3,14,27845.0,2.0,1.0,18194,3.0,prior,3,4,15,49
4,14,162.0,3.0,1.0,18194,3.0,prior,3,4,15,49
5,14,2452.0,4.0,1.0,18194,3.0,prior,3,4,15,49
6,14,8575.0,5.0,1.0,18194,3.0,prior,3,4,15,49


In [18]:
#creat validation set to track data
df_rf_validate=df_rf_data[(df_rf_data['order_from_last']==3) ]
df_rf_validate.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,days_since_prior_order,eval_set,order_dow,order_from_last,order_hour_of_day,order_number
0,7,34050.0,1.0,0.0,142903,30.0,prior,2,3,14,11
1,7,46802.0,2.0,0.0,142903,30.0,prior,2,3,14,11
13,16,9755.0,1.0,1.0,174840,13.0,prior,3,3,12,18
14,16,25466.0,2.0,0.0,174840,13.0,prior,3,3,12,18
15,16,45437.0,3.0,0.0,174840,13.0,prior,3,3,12,18


In [19]:
# creat test set and label
df_rf_test=df_rf_data[(df_rf_data['order_from_last']==2)]
df_rf_test.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,days_since_prior_order,eval_set,order_dow,order_from_last,order_hour_of_day,order_number
30,25,9755.0,1.0,1.0,59897,25.0,prior,6,2,10,19
31,25,31487.0,2.0,0.0,59897,25.0,prior,6,2,10,19
32,25,37510.0,3.0,1.0,59897,25.0,prior,6,2,10,19
33,25,14576.0,4.0,1.0,59897,25.0,prior,6,2,10,19
34,25,22105.0,5.0,0.0,59897,25.0,prior,6,2,10,19


In [20]:
print(df_rf_test.size)
print(df_rf_validate.size)
print(df_rf_train.size)


23537668
23264186
23039357


In [21]:
#capture feature names and make a list
features=df_rf_data[['add_to_cart_order','days_since_prior_order','order_dow','order_from_last','order_hour_of_day','order_number']]
feature_list = list(features.columns)
features=features.columns
features

Index([u'add_to_cart_order', u'days_since_prior_order', u'order_dow',
       u'order_from_last', u'order_hour_of_day', u'order_number'],
      dtype='object')

In [22]:
feature_list

['add_to_cart_order',
 'days_since_prior_order',
 'order_dow',
 'order_from_last',
 'order_hour_of_day',
 'order_number']

In [23]:
#convert target column into array
y = pd.factorize(df_rf_train['reordered'])[0]
y

array([0, 0, 0, ..., 1, 0, 0])

In [24]:
#convert target column into array
y_test = pd.factorize(df_rf_test['reordered'])[0]
y_test

array([0, 1, 0, ..., 1, 1, 1])

In [27]:
#seed random generator
np.random.seed(42)

In [28]:
#create classifier
clf = RandomForestClassifier(n_jobs=2, random_state=42)

In [29]:
df_rf_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094487 entries, 2 to 6349193
Data columns (total 11 columns):
order_id                  int64
product_id                float64
add_to_cart_order         float64
reordered                 float64
user_id                   int64
days_since_prior_order    float64
eval_set                  object
order_dow                 int64
order_from_last           int64
order_hour_of_day         int64
order_number              int64
dtypes: float64(4), int64(6), object(1)
memory usage: 191.8+ MB


In [30]:
#check data for NAN
df_rf_train.isnull().any()

order_id                  False
product_id                False
add_to_cart_order         False
reordered                 False
user_id                   False
days_since_prior_order     True
eval_set                  False
order_dow                 False
order_from_last           False
order_hour_of_day         False
order_number              False
dtype: bool

In [31]:
#fill in Nan with zero
df_rf_train=df_rf_train.fillna(0)

In [32]:
df_rf_train.isnull().any()

order_id                  False
product_id                False
add_to_cart_order         False
reordered                 False
user_id                   False
days_since_prior_order    False
eval_set                  False
order_dow                 False
order_from_last           False
order_hour_of_day         False
order_number              False
dtype: bool

In [33]:
y.shape

(2094487,)

In [34]:
###

## Concern:

I've been trying to make a good decision on how to handle the data. If I want to one hot encode all the product_ids then I need to reduce the customer base for memory limit uses. My reason for thinking this is to compare the against the single vector decomposition I started with. I'm thinking now, I should disregard that line of evaluation and build on this random forest with better feature columns. I would then have to adjust the SVM to be similiar.

Another issue with data wrangling is deciding how to preserve the time component of order history of the customer. I originally sought to segregate the data by breaking it into chunks by order: the test data the last order the validation set the customer's 2nd previous order, and the training data the customer's 3rd previous order (you see that above). Below you see my final decision, lumping the 3rd and 4th previous orders together into the training set, spliting on it to cross validate. Then testing on the 2nd to previous order as the test set. (The last order for each customer did not all contain labels to test against, hence I choose to move a step further down the customer's order history timeline.) 
 
This is one reason my data wrangling paper has been delayed. I've been trying to decide how to set up the RF, and SVM to handle it. 

## Question:

Does my approach at cross-validation voilate any principles? Do you see anything wrong here? Can I do it better?


The after this section only cross validates on the train set which above we made customer orders 4 peroids(or for orders from the original last customer order). We did this in an attempt to preserve the time history of customer orders. Maybe we should not. Maybe we should lump the previous 4th and 3rd orders together into one set then use the train_test_split module to cross validate. The set aside test data is the previous 2nd order, which we will reserve for testing the tuned parameters. I've tried to distinguish the spliting of the training data into a validation set by using xval and yval. 

In [35]:
#set the training data to just the 4th and 3rd previous orders
data_set_train=pd.concat([data_p3,data_p4])

In [36]:
#reset index 
data_set_train=data_set.reset_index()

In [37]:
#add reorder information to training data
rf_train=pd.merge(df_prod_orders,data_set_train, how= 'inner',left_on="order_id", right_on='order_id')
rf_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,index,user_id,days_since_prior_order,eval_set,order_dow,order_from_last,order_hour_of_day,order_number
0,7,34050,1,0,555320,142903,30.0,prior,2,3,14,11
1,7,46802,2,0,555320,142903,30.0,prior,2,3,14,11
2,14,20392,1,1,636820,18194,3.0,prior,3,4,15,49
3,14,27845,2,1,636820,18194,3.0,prior,3,4,15,49
4,14,162,3,1,636820,18194,3.0,prior,3,4,15,49


In [38]:
#fill in Nan with zero
rf_train=df_rf_train.fillna(0)

In [39]:
#check to make sure NaN is taken care of 
rf_train.isnull().any()

order_id                  False
product_id                False
add_to_cart_order         False
reordered                 False
user_id                   False
days_since_prior_order    False
eval_set                  False
order_dow                 False
order_from_last           False
order_hour_of_day         False
order_number              False
dtype: bool

Data is now in the format wrangling is complete. We shall set up the data for use in sklearn. We will need to choose the features to consider and separate the labels. 

In [40]:
#capture feature names and make a list
features=rf_train[['add_to_cart_order','days_since_prior_order','order_dow','order_from_last','order_hour_of_day','order_number']]
feature_list = list(features.columns)
features=features.columns
features

Index([u'add_to_cart_order', u'days_since_prior_order', u'order_dow',
       u'order_from_last', u'order_hour_of_day', u'order_number'],
      dtype='object')

In [41]:
#convert target column into array for training set
ytrain_rf = pd.factorize(rf_train['reordered'])[0]
ytrain_rf

array([0, 0, 0, ..., 1, 0, 0])

In [46]:
#convert target column into array for test set 
ytest_rf = pd.factorize(df_rf_test['reordered'])[0]
ytest_rf

array([0, 1, 0, ..., 1, 1, 1])

In [48]:
#see if the training data has the proper balance of reorders 
print ('% reorders in training set {:2f}'.format(float(ytrain_rf.sum())/ytrain_rf.size))
print ('% reorders in test set {:2f}'.format(float(ytest_rf.sum())/ytest_rf.size))

% reorders in training set 0.487340
% reorders in test set 0.413684


## Question
Should I be concerned about the difference here in imbalance here? The EDA of all the data showed that reorders occured at 60% overall. Here in the training set, which is all the 3rd and 4th previous orders for customers the reorder rate is 50 and the test set is 40%. 

In [49]:
#make a training set for sklearn features
xtrain_rf = rf_train[features]

In [50]:
#make a test set for sklearn features
xtest_rf = df_rf_test[features]

We have the data set up for sklearn lets do some machine learning. First will shall tune the parameters of the random forest classifier. Using the best parameters from cross validation, retrain the model on the training data and predict on the test data.

In [51]:
#know lets tune the parameters for the random forest

#set parametets to tune in Random Grid Search (# trees in forest and # of features per tree)
parameters={'n_estimators':range(1,30),'max_features':['auto',1,2,3,4,5]}

#create an instance of the random forest classifier
rf=RandomForestClassifier()

#randomize grid search for parameters with cross validation and refit on entire training set
#spread across two jobs and seed a random state
rs = RandomizedSearchCV(rf, param_distributions=parameters,cv=5,refit=True,n_jobs=2,random_state=42,scoring='f1')

#fit grid search on random forest over grid
rs.fit(xtrain_rf,ytrain_rf)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 'max_features': ['auto', 1, 2, 3, 4, 5]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

## Question: 
How do you pick the appropriate range for # of trees? I understand the range how to pick the range of max_features. Should I also tune the gini purity or the minimum leaf node parameter? (When do you do that? When the model is overfitting, I assume.)

## Concern:
Need to expand range on number of trees since best result was at the boundry of test ranges. (Will it always take the max here? Is it overfitting? )

In [54]:
#save best parameters to use on test set
best_n = rs.best_params_['n_estimators'] 
best_features=rs.best_params_['max_features'] 

#print best parameter results
print("the best number of trees is {} with a maximum number of features of {}".format(best_n,best_features))
print("the best f1 score with these is {}".format(rs.best_score_))

the best number of trees is 29 with a maximum number of features of 4
the best f1 score with these is 0.646630425313


Parameters are tuned. We have 29 trees in the forest each using 4 out of the 6 features to learn. (need to expand in future). Next we will retrain on the training data with these parameters and predict on the test data. 

In [55]:
#create classifier from best parameters out of grid search
clf=rs.best_estimator_
#fit classifier to validation set
clf.fit(xtrain_rf,ytrain_rf)

#get the prediction set for the validation 
preds_train=clf.predict(xtrain_rf)

#make some predictions
preds=clf.predict(xtest_rf)


#print the accuracy score for the training data
training_accuracy = clf.score( xtrain_rf , ytrain_rf )
print("Accuracy on training data: {:0.2f}".format(training_accuracy))

#print the accuracy score for the test data
test_accuracy = clf.score(xtest_rf , ytest_rf)
print("Accuracy on test data: {:0.2f}".format(test_accuracy))

#print training f1 score
fscore_train = metrics.f1_score(ytrain_rf, preds_train)
print("F1 Score on train data: {:0.4f}".format(fscore_train))

#print testing f1 score
fscore = metrics.f1_score(ytest_rf, preds,pos_label=1)
print("F1 Score on validation (train-test set) data: {:0.4f}".format(fscore))


print (classification_report(ytest_rf,preds))

Accuracy on training data: 0.86
Accuracy on test data: 0.61
F1 Score on train data: 0.8542
F1 Score on validation (train-test set) data: 0.5025


NameError: name 'classification_report' is not defined

## Concern:
The Kaggle competition best score was around .409. I have my output slightly different. How can I be doing so much better with such little feature engineering?

Model is overfitting on training data (.86 compared to .61). Could look at tuning the gini-index (or leaf node min). 

In [None]:
#look at confusion matrix
confusion_matrix(ytest_rf,preds)

Next lets take a look at which features the model used for prediciton the most. 

In [None]:
#make a list of feature and feature importance
list(zip(rf_train[features], clf.feature_importances_))

In [None]:
#Capture model feature importance 
importances=clf.feature_importances_

In [None]:
#plot the variable importances
# Set the style
plt.style.use('fivethirtyeight')
# list of x locations for plotting
x_values = list(range(len(importances)))
# Make a bar chart
plt.bar(x_values, importances, orientation = 'vertical')
# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');
plt.tight_layout

Now lets look at the ROC and evaluations of our model

In [None]:
#predict the probabilities for each observation in training set
pred_train_prob=clf.predict_proba(xtrain_rf)[:,1]

#predict the probabilities for each observation in test set
pred_prob=clf.predict_proba(xtest_rf)[:,1]

#get false positive rate, true positive rate, and thresholds for training data
xfpr,xtpr,xthres=roc_curve(ytrain_rf,pred_train_prob)

#get false positive rate, true positive rate, and thresholds for test data
fpr,tpr,thres=roc_curve(ytest_rf,pred_prob)

#plot base line at .5 probability
plt.plot([0,1],[0,1],'k--')
#plot model train and test
plt.plot(fpr,tpr,label='testing data')
plt.plot(xfpr,xtpr, 'g-',label='training data')
#add labels
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title('ROC Curve')
plt.legend(loc=4)
plt.show()

In [None]:

xprec_score,xrecall_score,xthresholds=precision_recall_curve(ytrain_rf, pred_train_prob)

prec_score,recall_score,thresholds=precision_recall_curve(ytest_rf, pred_prob)

plt.plot([1,0],[0,1],'k--')
plt.plot(recall_score,prec_score,label='testing data')
plt.plot(xrecall_score,xprec_score,'g-',label='training data')
#plt.plot(xfpr,xtpr, 'g-',label='training data')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title('Precision-Recall Curve')
plt.legend(loc=3)
plt.show()

In [None]:
roc_auc_score(ytest_rf, pred_prob)

In [None]:
cv_auc_scores=cross_val_score(clf, xtrain_rf,ytrain_rf, cv=5, scoring='roc_auc')
print(cv_auc_scores)

# can ignore stuff below. 
working on the how to split data differently.

In [None]:
#### below is spliting data in 

##### below if I need to reduce the data size

rf_small_train=df_rf_train.sample(750)

rf_small_table=rf_small_train.pivot(index='user_id', columns='product_id', values='reordered')

rf_small_table=rf_small_table.replace(0, 1, regex=True)
rf_small_table=rf_small_table.fillna(0)

rf_small_train=rf_small_train.set_index('user_id')

rf_small_train.head()

rf_small_sample=pd.merge(rf_small_train,rf_small_table, how= 'inner',left_index=True, right_index=True)

rf_small_sample.isnull().any()

rf_small_sample['reordered'].size

small_features=rf_small_sample.drop(['reordered','eval_set'],axis=1)
sm_feature_list = list(small_features.columns)
sm_features=small_features.columns
len(sm_feature_list)

small_features['order_id'].size

#convert target column into array
target= pd.factorize(rf_small_sample['reordered'])[0]

small_features.head()

small_features.size

clf.fit(small_features, target)

small_features.nunique()

small_features=small_features.reset_index()
small_users=pd.DataFrame(small_features['user_id'])

small_users.nunique()

small_products=pd.DataFrame(small_features['product_id'])
small_products.head()

## work on test data for predict
should i just do a train test split

need to one hotencode here for predict to be the same as model features

rf_small_sample=pd.merge(df_rf_test,small_users, how= 'inner',on='user_id')

rf_small_sample=pd.merge(rf_small_sample,small_products, how= 'inner',on='product_id')

rf_small_sample.head()

rf_small_sample.isnull().any()

rf_small_sample=rf_small_sample.reset_index('user_id')

rf_small_sample.product_id.size

rf_small_sample.nunique()

test_small_table=rf_small_sample.pivot(index='user_id',columns='product_id', values='reordered')

test_small_table=test_small_table.replace(0, 1, regex=True)
test_small_table=rf_small_table.fillna(0)

test_small_table.head()

#rf_small_train=rf_small_train.set_index('user_id')

#join one hot encode back with other features
df_rf_test

In [None]:
#get data from test set on user_ids
#small_test=df_rf_test.merge(small_users, how='inner', on='user_id')

sm_test_features=small_test.drop(['reordered','eval_set'],axis=1)
sm_test_features.columns

small_test[small_test['user_id']==71]

sm_preds=clf.predict(sm_test_features)

clf.predict_proba(df_rf_test[features])[0:10]

#make a table of the predictions vs. the actual reorders
pd.crosstab(df_rf_test['reordered'], preds, rownames=['Actual reorder'], colnames=['Predicted reorder'])

#make a list of feature and feature importance
list(zip(df_rf_train[features], clf.feature_importances_))

importances=clf.feature_importances_

#plot the variable importances
# Set the style
plt.style.use('fivethirtyeight')
# list of x locations for plotting
x_values = list(range(len(importances)))
# Make a bar chart
plt.bar(x_values, importances, orientation = 'vertical')
# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');
plt.tight_layout