In [1]:
import pandas as pd
import numpy as np

Let's do some data wrangling. We will need to set up matrices to execute an SVD, $R = U \Sigma V^T$. Normally this is associated with recommenders based on ratings. Here, we will be using the reorder proportions in place of ratings to predict future reorders. $R$ will be the user reorder matrix. $U$ is the user product feature matrix. $\Sigma$ is the singular value matrix. $V^T$ is the product feature matrix.

In [2]:
# Read the per-order table (one row per order, keyed by order_id / user_id).
instacart_file = pd.read_csv('Data/orders.csv')
# Wrap the parsed CSV in a DataFrame under the name used by the rest of the notebook.
df_orders = pd.DataFrame(data=instacart_file)
df_orders.head()


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [3]:
# Read the order line items: one row per (order_id, product_id) with a 0/1 reordered flag.
instacart_file2 = pd.read_csv('Data/order_products__prior.csv')
# Wrap the parsed CSV in a DataFrame under the name used by the rest of the notebook.
df_prod_orders = pd.DataFrame(data=instacart_file2)
df_prod_orders.head()


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [4]:
# Read the product lookup table (product_id -> product_name, aisle_id, department_id).
instacart_products = pd.read_csv('Data/products.csv')
# Wrap the parsed CSV in a DataFrame under the name used by the rest of the notebook.
df_prod = pd.DataFrame(data=instacart_products)
df_prod.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [5]:
# Attach each order's user_id (and the other order columns) to its line items so
# that user_id, product_id and the reordered flag live in one dataframe.
# NOTE(review): the outer join keeps rows that exist on only one side; this is
# presumably why product_id / reordered show up as floats afterwards (NaN for
# orders without line items) -- confirm whether those rows are needed later.
df_user_order_prod = df_prod_orders.merge(df_orders, how='outer', on='order_id')


In [6]:
# Preview the first rows of the merged order / line-item dataframe.
df_user_order_prod.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120.0,1.0,1.0,202279,prior,3,5,9,8.0
1,2,28985.0,2.0,1.0,202279,prior,3,5,9,8.0
2,2,9327.0,3.0,0.0,202279,prior,3,5,9,8.0
3,2,45918.0,4.0,1.0,202279,prior,3,5,9,8.0
4,2,30035.0,5.0,0.0,202279,prior,3,5,9,8.0


We will want to get reorder rates for user by product.

In [7]:
# Number of line-item rows per (user_id, product_id) pair, i.e. how many times
# each user purchased each product.
user_products_total = (
    df_user_order_prod
    .groupby(by=['user_id', 'product_id'])
    .size()
)
user_products_total.head()

user_id  product_id
1        196.0         10
         10258.0        9
         10326.0        1
         12427.0       10
         13032.0        3
dtype: int64

In [8]:
# Sum the reordered flag per (user_id, product_id) pair to get how many of each
# user's purchases of a product were reorders.
user_item_reorders = (
    df_user_order_prod
    .groupby(['user_id', 'product_id'])['reordered']
    .sum()
)
user_item_reorders.head()

user_id  product_id
1        196.0         9.0
         10258.0       8.0
         10326.0       0.0
         12427.0       9.0
         13032.0       2.0
Name: reordered, dtype: float64

In [9]:
#calculate reorder rate for user by product: reorders divided by total purchases,
#with both series aligned on their shared (user_id, product_id) index
user_item_reorder_rate=user_item_reorders/user_products_total
#the series is intentionally left unnamed; its column name is assigned when it is
#moved into a dataframe. (A previous Series.rename(columns=...) call was removed:
#`columns` is not a valid argument for a Series and the result was never stored.)
user_item_reorder_rate.head()

user_id  product_id
1        196.0         0.900000
         10258.0       0.888889
         10326.0       0.000000
         12427.0       0.900000
         13032.0       0.666667
dtype: float64

In [10]:
# Turn each per-(user_id, product_id) series into a one-column dataframe with a
# descriptive column name; the MultiIndex is carried over unchanged.
df_upr = user_products_total.to_frame('prod_order_count')
df_ur = user_item_reorder_rate.to_frame('prod_reorder_rate')
# Plain-text preview of both frames.
print(df_upr.head())
print(df_ur.head())

                    prod_order_count
user_id product_id                  
1       196.0                     10
        10258.0                    9
        10326.0                    1
        12427.0                   10
        13032.0                    3
                    prod_reorder_rate
user_id product_id                   
1       196.0                0.900000
        10258.0              0.888889
        10326.0              0.000000
        12427.0              0.900000
        13032.0              0.666667


In [11]:
# Place the purchase-count and reorder-rate columns side by side in a single
# dataframe, matched on the (user_id, product_id) index.
df_yes = pd.concat(objs=[df_upr, df_ur], axis='columns')


In [12]:
# Preview the combined per-(user_id, product_id) counts and reorder rates.
df_yes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,prod_order_count,prod_reorder_rate
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,196.0,10,0.9
1,10258.0,9,0.888889
1,10326.0,1,0.0
1,12427.0,10,0.9
1,13032.0,3,0.666667


We want the products to be the columns, the user_ids to be the rows, and the reorder rates to be the values. After we normalize it, this will be R, the user reorder matrix for the SVD.

In [None]:
# Reshape from long to wide: flatten the (user_id, product_id) index back into
# columns, then pivot so each row is a user_id, each column a product_id, and
# each cell that pair's reorder rate (NaN where no such pair exists).
df_reorders = (
    df_yes
    .reset_index()
    .pivot(index='user_id', columns='product_id', values='prod_reorder_rate')
)
df_reorders.head()

product_id,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,49679.0,49680.0,49681.0,49682.0,49683.0,49684.0,49685.0,49686.0,49687.0,49688.0
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,0.0,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [None]:
#fill NaN with 0 and keep the result: fillna returns a new dataframe and leaves
#df_reorders itself untouched, so the filled matrix is stored under its own name
df_reorders_filled=df_reorders.fillna(0)
#preview only the first rows instead of rendering the whole wide matrix
df_reorders_filled.head()