In [1]:
#Importing useful libraries
import pandas as pd
import numpy as np
import datetime
import chart_studio.plotly as py
import plotly.graph_objs as go 
from plotly.offline import init_notebook_mode,iplot,plot
init_notebook_mode(connected=True) 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
aisles=pd.read_csv('aisles.csv')
departments=pd.read_csv('departments.csv')
products=pd.read_csv('products.csv')
order_products_prior=pd.read_csv('order_products__prior.csv')
order_products_train=pd.read_csv('order_products__train.csv')
orders=pd.read_csv('orders.csv')

In [3]:
### FEATURE ENGINEERING

In [4]:
### Predicting which previously purchased products will be in a user’s next order

Building Features on prior data

In [5]:
order_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [6]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [7]:
#merge order_products_prior and orders dataframe

prior_df = pd.merge(order_products_prior, orders, how="inner", on="order_id")

In [8]:
prior_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [9]:
df_agg=prior_df.groupby('user_id').aggregate({'order_number':'max'}).reset_index()
df_agg

Unnamed: 0,user_id,order_number
0,1,10
1,2,14
2,3,12
3,4,5
4,5,4
...,...,...
206204,206205,3
206205,206206,67
206206,206207,16
206207,206208,49


In [10]:
df_latest= pd.merge(prior_df, df_agg, how="inner", on=['user_id', 'order_number'])
df_latest = df_latest[['user_id', 'product_id', 'reordered']]
df_latest.columns=["user_id", "product_id", "reordered_latest"]


In [11]:
df_latest.head()

Unnamed: 0,user_id,product_id,reordered_latest
0,59897,9755,1
1,59897,31487,0
2,59897,37510,1
3,59897,14576,1
4,59897,22105,0


In [12]:
df_1 = prior_df.groupby(["user_id","product_id"])["reordered"].aggregate(['count','sum','mean']).reset_index()
df_1.columns = ["user_id", "product_id", 'reordered_count', 'reordered_sum',"reordered_rate"]
df_1.head()

Unnamed: 0,user_id,product_id,reordered_count,reordered_sum,reordered_rate
0,1,196,10,9,0.9
1,1,10258,9,8,0.888889
2,1,10326,1,0,0.0
3,1,12427,10,9,0.9
4,1,13032,3,2,0.666667


In [13]:
df_merge = pd.merge(df_latest, df_1,how="left", on=["user_id","product_id"])

In [14]:
df_merge.head()

Unnamed: 0,user_id,product_id,reordered_latest,reordered_count,reordered_sum,reordered_rate
0,59897,9755,1,14,13,0.928571
1,59897,31487,0,1,0,0.0
2,59897,37510,1,2,1,0.5
3,59897,14576,1,13,12,0.923077
4,59897,22105,0,1,0,0.0


In [15]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


Adding additional information of aisle_id and department_id to df_merge

In [16]:
order_products_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


**finding unique order_id in order_products_train dataset**

In [17]:
train_df=pd.DataFrame(order_products_train['order_id'].unique())
train_df.columns=['order_id']
train_df

Unnamed: 0,order_id
0,1
1,36
2,38
3,96
4,98
...,...
131204,3421049
131205,3421056
131206,3421058
131207,3421063


In [18]:
#Joining train_df with orders to get user_id corresponding to order_id

train_df = pd.merge(train_df, orders, how="inner", on="order_id")
train_df.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,112108,train,4,4,10,9.0
1,36,79431,train,23,6,18,30.0
2,38,42756,train,6,6,16,24.0
3,96,17227,train,7,6,20,30.0
4,98,56463,train,41,3,8,14.0


In [19]:
#merging the train_df and df_merge to get all the features

train_df = pd.merge(train_df, df_merge, how="inner", on="user_id")
train_df.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,reordered_latest,reordered_count,reordered_sum,reordered_rate
0,1,112108,train,4,4,10,9.0,14947,1,3,2,0.666667
1,1,112108,train,4,4,10,9.0,5707,1,2,1,0.5
2,1,112108,train,4,4,10,9.0,44632,0,1,0,0.0
3,1,112108,train,4,4,10,9.0,30881,1,2,1,0.5
4,1,112108,train,4,4,10,9.0,43633,1,2,1,0.5


In [20]:
train_df.drop(['eval_set','order_number'], axis=1,inplace=True)

In [21]:
train_df.head()

Unnamed: 0,order_id,user_id,order_dow,order_hour_of_day,days_since_prior_order,product_id,reordered_latest,reordered_count,reordered_sum,reordered_rate
0,1,112108,4,10,9.0,14947,1,3,2,0.666667
1,1,112108,4,10,9.0,5707,1,2,1,0.5
2,1,112108,4,10,9.0,44632,0,1,0,0.0
3,1,112108,4,10,9.0,30881,1,2,1,0.5
4,1,112108,4,10,9.0,43633,1,2,1,0.5


In [22]:
#merging train_df with products dataframe
training= pd.merge(train_df, products, how="inner", on="product_id")
training.head()

Unnamed: 0,order_id,user_id,order_dow,order_hour_of_day,days_since_prior_order,product_id,reordered_latest,reordered_count,reordered_sum,reordered_rate,product_name,aisle_id,department_id
0,1,112108,4,10,9.0,14947,1,3,2,0.666667,Pure Sparkling Water,115,7
1,3740,144433,3,12,14.0,14947,1,4,3,0.75,Pure Sparkling Water,115,7
2,4194,186124,6,15,30.0,14947,1,4,3,0.75,Pure Sparkling Water,115,7
3,5460,40556,0,11,6.0,14947,1,4,3,0.75,Pure Sparkling Water,115,7
4,6485,57716,4,16,9.0,14947,1,5,4,0.8,Pure Sparkling Water,115,7


In [23]:
training.drop(['product_name','order_dow','order_hour_of_day','days_since_prior_order'],axis=1, inplace= True)

In [24]:
training.sort_values(by=['order_id']).head()

Unnamed: 0,order_id,user_id,product_id,reordered_latest,reordered_count,reordered_sum,reordered_rate,aisle_id,department_id
0,1,112108,14947,1,3,2,0.666667,115,7
4416,1,112108,43633,1,2,1,0.5,95,15
1111,1,112108,5707,1,2,1,0.5,3,19
1157,1,112108,44632,0,1,0,0.0,115,7
4390,1,112108,30881,1,2,1,0.5,95,15


In [25]:
#now merging traing_data with order_products_train to get reordered column
order_products_train.drop(['add_to_cart_order'],axis=1,inplace=True)

In [26]:
order_products_train.head()

Unnamed: 0,order_id,product_id,reordered
0,1,49302,1
1,1,11109,1
2,1,10246,0
3,1,49683,0
4,1,43633,1


In [27]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [28]:
train_y_df = pd.merge(order_products_train, orders, how="inner", on="order_id")
train_y_df = train_y_df[["user_id", "product_id", "reordered"]]
train_df = pd.merge(training, train_y_df, how="left", on=["user_id", "product_id"])
train_df["reordered"].fillna(0, inplace=True)

In [29]:
train_df.head()

Unnamed: 0,order_id,user_id,product_id,reordered_latest,reordered_count,reordered_sum,reordered_rate,aisle_id,department_id,reordered
0,1,112108,14947,1,3,2,0.666667,115,7,0.0
1,3740,144433,14947,1,4,3,0.75,115,7,1.0
2,4194,186124,14947,1,4,3,0.75,115,7,1.0
3,5460,40556,14947,1,4,3,0.75,115,7,0.0
4,6485,57716,14947,1,5,4,0.8,115,7,0.0


In [42]:
train_df.drop(['reordered_rate'],axis=1,inplace= True)

In [43]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1358988 entries, 0 to 1358987
Data columns (total 9 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   order_id          1358988 non-null  int64  
 1   user_id           1358988 non-null  int64  
 2   product_id        1358988 non-null  int64  
 3   reordered_latest  1358988 non-null  int64  
 4   reordered_count   1358988 non-null  int64  
 5   reordered_sum     1358988 non-null  int64  
 6   aisle_id          1358988 non-null  int64  
 7   department_id     1358988 non-null  int64  
 8   reordered         1358988 non-null  float64
dtypes: float64(1), int64(8)
memory usage: 103.7 MB


### Prediction and Evaluation

## Train-Test Split

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
X = train_df.drop('reordered',axis=1)
y = train_df['reordered']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

### Training a logistic regression model

In [46]:
from sklearn.linear_model import LogisticRegression

In [47]:
logmodel= LogisticRegression()
logmodel.fit(X_train,y_train)

LogisticRegression()

In [48]:
y_predicted=logmodel.predict(X_test)

In [49]:
from sklearn.metrics import classification_report

In [50]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

         0.0       0.72      1.00      0.84    294209
         1.0       1.00      0.00      0.00    113488

    accuracy                           0.72    407697
   macro avg       0.86      0.50      0.42    407697
weighted avg       0.80      0.72      0.60    407697

