In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import scipy as sp

Lets do some data wraggling. We will need to set up martrices to execute a SVD, R=UΣV^T. Normally this is associated with a recommender based on ratings. Here, we will be using the reorder proportions inplace of ratings to predict future reorders. R will be the user reorder matrix. U is user product feature matrix.  Σ is the singular value matrix. V^T is the product feature matrix. 

In [2]:
#loading user order information
instacart_file=pd.read_csv('Data/orders.csv')
df_orders=pd.DataFrame(instacart_file,)
df_orders.head(15)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


Note that the column 'eval_set' breaks down the data into three sets. Details in readme file. What is important to note is that reorders is not provided for the test set (loading into the next dataframe). We there will not be using that set of data. We will add data from the prior set into the train set for are data set. The augmented data set will then be feed into scikit learn model to segregate data into a new train set,cross-validation set, and test set. 

In [3]:
#build data set 
user_order_max=df_orders['order_number'].groupby(df_orders['user_id']).max()
user_order_max.head()

user_id
1    11
2    15
3    13
4     6
5     5
Name: order_number, dtype: int64

In [4]:
user_order_max.size

206209

In [5]:
df_orders.nunique()

order_id                  3421083
user_id                    206209
eval_set                        3
order_number                  100
order_dow                       7
order_hour_of_day              24
days_since_prior_order         31
dtype: int64

In [6]:
#capture the last order information for each user
g = df_orders.groupby('user_id')
data_p1=g.last()

In [7]:
#capture the second to last order information for each user
data_p2=g.nth(-2)

In [8]:
#capture the third to last order information for each user
data_p3=g.nth(-3)

In [9]:
#capture the fourth to last order information for each user
data_p4=g.nth(-4)

In [10]:
#join all information into one data set
data_set=pd.concat([data_p1,data_p2,data_p3,data_p4])

In [11]:
data_set.groupby(['user_id','order_number','eval_set','order_id']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,days_since_prior_order,order_dow,order_hour_of_day
user_id,order_number,eval_set,order_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,8,prior,3108588,1,1,1
1,9,prior,2295261,1,1,1
1,10,prior,2550362,1,1,1
1,11,train,1187899,1,1,1
2,12,prior,3186735,1,1,1
2,13,prior,3268552,1,1,1
2,14,prior,839880,1,1,1
2,15,train,1492625,1,1,1
3,10,prior,676467,1,1,1
3,11,prior,521107,1,1,1


In [12]:
#remove test set from data
data_set=data_set[data_set.eval_set != 'test']

In [13]:
#set aside new test set
test_set=data_set[data_set.eval_set == 'train']
test_set.nunique()

days_since_prior_order        31
eval_set                       1
order_dow                      7
order_hour_of_day             24
order_id                  131209
order_number                  97
dtype: int64

In [14]:
#assign training set
train_set=data_set[data_set.eval_set == 'prior']
train_set.reset_index(inplace=True)
train_set.nunique()

user_id                   206209
days_since_prior_order        31
eval_set                       1
order_dow                      7
order_hour_of_day             24
order_id                  618627
order_number                  99
dtype: int64

This set is still a bit to big so lets sample a more reasonable portion of users. We will go with 75,ooo users instead. 

In [15]:
train_set=train_set.sample(750)

In [16]:
train_set.reset_index(inplace=True)
train_set.head()

Unnamed: 0,index,user_id,days_since_prior_order,eval_set,order_dow,order_hour_of_day,order_id,order_number
0,246074,39866,4.0,prior,6,9,1118335,10
1,425792,13375,6.0,prior,6,17,11986,3
2,431696,19279,30.0,prior,1,14,2789668,3
3,20326,20327,30.0,prior,4,13,1611634,6
4,516149,103732,5.0,prior,3,12,52616,47


In [17]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 8 columns):
index                     750 non-null int64
user_id                   750 non-null int64
days_since_prior_order    719 non-null float64
eval_set                  750 non-null object
order_dow                 750 non-null int64
order_hour_of_day         750 non-null int64
order_id                  750 non-null int64
order_number              750 non-null int64
dtypes: float64(1), int64(6), object(1)
memory usage: 46.9+ KB


In [18]:
train_set.nunique()

index                     750
user_id                   750
days_since_prior_order     31
eval_set                    1
order_dow                   7
order_hour_of_day          23
order_id                  750
order_number               74
dtype: int64

# SVD
Now that the data is set up. Lets set up the mechanics for SVD. In short we will need to wraggle our data into dataframes to feed into scripy model. Recall the basic set up is   R=UΣV^T. R we want to be user_id as the index, product_id as the column and reorder rates for users by product as the value.  

In [19]:
#loading product reorder information
instacart_file2=pd.read_csv('Data/order_products__prior.csv')
df_prod_orders=pd.DataFrame(instacart_file2,)
df_prod_orders.head()


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [20]:
#loading information for product names
instacart_products=pd.read_csv('Data/products.csv')
df_prod=pd.DataFrame(instacart_products,)
df_prod.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [21]:
#merge dataframes to get user_id with product_id and reorder in same dataframe
#use inner to get the intersection in order to preserve test set
df_user_order_prod=pd.merge(df_prod_orders,train_set, how= 'inner',left_on="order_id", right_on='order_id')


In [22]:
df_user_order_prod.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,index,user_id,days_since_prior_order,eval_set,order_dow,order_hour_of_day,order_number
0,9371,5364,1,0,427571,15154,20.0,prior,6,9,12
1,9371,18135,2,0,427571,15154,20.0,prior,6,9,12
2,9371,14897,3,0,427571,15154,20.0,prior,6,9,12
3,9371,17634,4,0,427571,15154,20.0,prior,6,9,12
4,9371,30551,5,0,427571,15154,20.0,prior,6,9,12


In [23]:
df_user_order_prod.nunique()

order_id                   750
product_id                3885
add_to_cart_order           45
reordered                    2
index                      750
user_id                    750
days_since_prior_order      31
eval_set                     1
order_dow                    7
order_hour_of_day           23
order_number                74
dtype: int64

We will want to get reorder rates for user by product.

In [24]:
#count number of product purchases by user
user_products_total=df_user_order_prod.groupby(['user_id','product_id']).size().sort_values(ascending=False)
user_products_total.head()

user_id  product_id
205460   45007         1
64695    10481         1
65302    24964         1
         25801         1
         31506         1
dtype: int64

In [25]:
#count number of reorders for user by product
user_item_reorders=df_user_order_prod['reordered'].groupby([df_user_order_prod['user_id'],df_user_order_prod['product_id']]).sum().sort_values(ascending=False)
user_item_reorders.head()

user_id  product_id
205460   45007         1
87717    32864         1
         12572         1
         20995         1
         22713         1
Name: reordered, dtype: int64

In [26]:
#calculate reorder rate for user by product
user_item_reorder_rate=user_item_reorders/user_products_total
user_item_reorder_rate.rename(columns={0:'reorder_rate'})
user_item_reorder_rate.head()

user_id  product_id
85       8277          1.0
         12703         0.0
         13176         1.0
         21137         0.0
         21903         0.0
dtype: float64

In [27]:
#move series into dataframe and rename columns
df_upr=pd.DataFrame(user_products_total,columns=['prod_order_count'])
df_ur=pd.DataFrame(user_item_reorder_rate,columns=['prod_reorder_rate'])
print(df_upr.head())
print(df_ur.head())
#pd.merge(df_upr.reset_index(), df_ur.reset_index(), on=['user_id'], how='inner').set_index(['user_id','product_id'])

                    prod_order_count
user_id product_id                  
205460  45007                      1
64695   10481                      1
65302   24964                      1
        25801                      1
        31506                      1
                    prod_reorder_rate
user_id product_id                   
85      8277                      1.0
        12703                     0.0
        13176                     1.0
        21137                     0.0
        21903                     0.0


In [28]:
#join into singe dataframe
df_rates=pd.concat([df_upr, df_ur], axis=1)


In [29]:
df_rates.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,prod_order_count,prod_reorder_rate
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1
85,8277,1,1.0
85,12703,1,0.0
85,13176,1,1.0
85,21137,1,0.0
85,21903,1,0.0
85,23165,1,0.0
85,32177,1,0.0
85,34243,1,0.0
85,35547,1,0.0
85,40146,1,1.0


We want products to be the columns, user_id the rows, and the values to be the reorder rate. This will be R, user reorder matrix, for SVD. (after we normalize)

In [30]:
#reset dataframe in order to pivot product_id to columns, user_id to index, and reorder rate to values.
df_reorders=df_rates.reset_index().pivot(index='user_id', columns='product_id', values='prod_reorder_rate')
df_reorders.head()

product_id,34,37,45,54,79,95,117,120,130,148,...,49543,49544,49552,49556,49585,49598,49610,49621,49667,49683
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
85,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,
431,,,,,,,,,,,...,,,,,,,,,,
1026,,,,,,,,,,,...,,,,,,,,,,
1105,,,,,,,,,,,...,,,,,,,,,,


In [31]:
#fill NaN with 0 
df_reorders=df_reorders.fillna(0)

In [32]:
df_reorders.info

<bound method DataFrame.info of product_id  34     37     45     54     79     95     117    120    130    \
user_id                                                                     
85            0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
97            0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
431           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1026          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1105          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1154          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1244          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1278          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1319          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1706          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2066          0.0    0.0    0.0    0.0    0.

In [33]:
df_reorders.head()

product_id,34,37,45,54,79,95,117,120,130,148,...,49543,49544,49552,49556,49585,49598,49610,49621,49667,49683
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
df_reorders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 750 entries, 85 to 205460
Columns: 3885 entries, 34 to 49683
dtypes: float64(3885)
memory usage: 22.2 MB


We will turn that dataframe into a matrix, normalize, optimize the parameters, and make some predictions. 

In [35]:
#normalize reorders in order to feed into scipy
reorders= df_reorders.as_matrix()
reorder_mean = np.mean(reorders, axis = 1)
reordered_normalized = reorders - reorder_mean.reshape(-1, 1)

In [36]:
#break down reorder matrix (R) into unitary matices 
# k picked at random will need to cross validate later
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(reordered_normalized, k = 10)

In [37]:
#make sigma a diagonal matrix for muliplication next
sigma = np.diag(sigma)

In [38]:
#multiple U by sigma then by V^t add means back in to get reconstruction of orginal matrix 
#then convert matrix to dataframe assigning columns and index
reconstructed_reorders = np.dot(np.dot(U, sigma), Vt) + reorder_mean.reshape(-1, 1)
df_predictions = pd.DataFrame(reconstructed_reorders, columns = df_reorders.columns,index=df_reorders.index)

In [39]:
print(df_predictions.index)

Int64Index([    85,     97,    431,   1026,   1105,   1154,   1244,   1278,
              1319,   1706,
            ...
            202716, 203226, 203252, 203683, 203764, 204010, 204274, 204618,
            204964, 205460],
           dtype='int64', name=u'user_id', length=750)


In [59]:
#save user list
users=list(df_predictions.index)

In [40]:
df_predictions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 750 entries, 85 to 205460
Columns: 3885 entries, 34 to 49683
dtypes: float64(3885)
memory usage: 22.2 MB


In [61]:
#look  to see how predictions look
df_predictions.loc[users[0]].sort_values(ascending=False)

product_id
13176    0.912247
8277     0.196048
27845    0.116438
45007    0.110566
4920     0.088915
21903    0.084724
17794    0.069233
49683    0.068495
48679    0.066439
27521    0.065577
21137    0.054928
30489    0.051219
45066    0.048072
22035    0.046895
28204    0.045984
19660    0.045758
31506    0.043732
40723    0.042623
5134     0.042409
5077     0.040837
5479     0.040620
19208    0.038833
30391    0.038830
27966    0.037336
22025    0.037318
43961    0.037290
49235    0.036395
31343    0.035694
44479    0.034493
33129    0.034377
           ...   
1559    -0.018788
10351   -0.018977
4605    -0.019152
26800   -0.020132
21295   -0.020453
29447   -0.020466
11123   -0.020716
24561   -0.020791
27336   -0.020795
42265   -0.021687
36735   -0.022613
24964   -0.022915
5456    -0.023272
16759   -0.024094
47626   -0.024693
38293   -0.025006
40545   -0.026232
46667   -0.027203
37158   -0.028713
1360    -0.029015
30639   -0.029848
32850   -0.030019
24852   -0.031630
27344   -0.033604

In [63]:
#pick only high predictons
prediction_thresholds=df_predictions.loc[users[0]].apply(lambda x: x if x > 0.05 else None)
prediction_thresholds.dropna().sort_values(ascending=False)

product_id
13176    0.912247
8277     0.196048
27845    0.116438
45007    0.110566
4920     0.088915
21903    0.084724
17794    0.069233
49683    0.068495
48679    0.066439
27521    0.065577
21137    0.054928
30489    0.051219
Name: 85, dtype: float64

In [47]:
def predicted_reorders(predictions_df, user_id, df_prod, df_rates, threshold=0.0):
    """ Function takes 5 parameters: prediction dataframe, user to predict reorders for, product dataframe,
        reorders rate per user dataframe, threshold from prediction dataframe to reach to make
        recommendation. 
        
        Function returns two:  user_purchased: items users has purchased in last three orders,
        prediction: items from user_purchased list that surpass prediction rate 
        
        note: threshold default is zero (returns product that SVD show to have any positive cosine similiarity). 
        Higher threshold settings will return more likely reorders.
    """
    
    # for user get products that surpass prediction theshold
    prediction_thresholds=df_predictions.loc[user_id].apply(lambda x: x if x > threshold else None)
    user_predictions = prediction_thresholds.dropna().sort_values(ascending=False)
    
    # dataframe for items user has purchased previously (last three orders) with name and reorder rate
    df_rates=df_rates.reset_index()
    user_data = df_rates[df_rates.user_id == (user_id)]
    user_purchased = (user_data.merge(df_prod, how = 'left', left_on = 'product_id', right_on = 'product_id').
                     sort_values(['prod_reorder_rate'], ascending=False))

    # Predict reorders by returning items from from previous purchases that surpass prediction rate
    prediction = user_purchased.merge(pd.DataFrame(user_predictions).reset_index(), how = 'inner',left_on = 'product_id',
               right_on = 'product_id')
    
    #format prediction dataframe to see prediction rate and sort by rate
    prediction=prediction.rename(columns = {user_id: 'Prediction'}).sort_values('Prediction', ascending = False)
    
    return user_purchased, prediction



In [64]:
user_purchased, prediction = predicted_reorders(df_predictions,users[0], df_prod, df_rates,.05)

In [65]:
user_purchased


Unnamed: 0,user_id,product_id,prod_order_count,prod_reorder_rate,product_name,aisle_id,department_id
0,85,8277,1,1.0,Apple Honeycrisp Organic,24,4
2,85,13176,1,1.0,Bag of Organic Bananas,24,4
9,85,40146,1,1.0,Almond Milk Hazelnut Creamer,91,16
1,85,12703,1,0.0,Organic Mung-Bean Sprouts,123,4
3,85,21137,1,0.0,Organic Strawberries,24,4
4,85,21903,1,0.0,Organic Baby Spinach,123,4
5,85,23165,1,0.0,Organic Leek,83,4
6,85,32177,1,0.0,Blood Oranges,24,4
7,85,34243,1,0.0,Organic Baby Broccoli,83,4
8,85,35547,1,0.0,Organic Baby Kale,83,4


In [66]:
prediction

Unnamed: 0,user_id,product_id,prod_order_count,prod_reorder_rate,product_name,aisle_id,department_id,Prediction
1,85,13176,1,1.0,Bag of Organic Bananas,24,4,0.912247
0,85,8277,1,1.0,Apple Honeycrisp Organic,24,4,0.196048
4,85,45007,1,0.0,Organic Zucchini,83,4,0.110566
3,85,21903,1,0.0,Organic Baby Spinach,123,4,0.084724
2,85,21137,1,0.0,Organic Strawberries,24,4,0.054928
