# Food Review Dataset Preprocessing

In [2]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [3]:
import os
import sys

In [45]:
# Load the three interaction splits. usecols=[3,4,5] selects columns by position;
# the head() output of these frames shows the selected columns are rating, u and i.
# NOTE(review): positional selection relies on all three CSVs sharing the same column order — confirm.
df_train=pd.read_csv("interactions_train.csv",usecols = [3,4,5])
df_test=pd.read_csv("interactions_test.csv",usecols = [3,4,5])
df_val=pd.read_csv("interactions_validation.csv",usecols = [3,4,5])

In [47]:
df_train.head(),df_test.head(),df_val.head()

(   rating      u       i
 0     5.0  22095   44367
 1     5.0  22095   87844
 2     5.0  24732  138181
 3     4.0  24732   93054
 4     5.0  22095  101723,
    rating   u       i
 0     4.0   2  173538
 1     4.0  16  177847
 2     0.0  26   89896
 3     2.0  45  172637
 4     3.0  52  177935,
    rating   u       i
 0     4.0   5  177317
 1     4.0  23  170785
 2     4.0  31  165555
 3     4.0  44  177453
 4     5.0  45  142367)

In [48]:
# Stack the train, test and validation rows into a single frame (each file's own row labels are kept).
# NOTE(review): df_train is rebound to the combined frame, so re-running this cell without
# re-running the load cell appends the test/validation rows a second time.
df_train = pd.concat([df_train,df_test,df_val],axis=0)
df_train.head()

Unnamed: 0,rating,u,i
0,5.0,22095,44367
1,5.0,22095,87844
2,5.0,24732,138181
3,4.0,24732,93054
4,5.0,22095,101723


In [49]:
len(df_train),len(df_test),len(df_val)


(718379, 12455, 7023)

In [50]:
# The concatenated frame still carries each source file's row labels; replace them
# with a fresh positional index (drop=True discards the old labels instead of keeping them as a column).
df_train.reset_index(inplace=True,drop=True)

In [51]:
# Rename rating / u / i to the descriptive names that filter_triplets looks up ("User_Id", "Recipe_Id").
# Assignment is positional, so it depends on the column order rating, u, i shown by head() earlier.
df_train.columns= ['Rating','User_Id','Recipe_Id']
df_train.head()

Unnamed: 0,Rating,User_Id,Recipe_Id
0,5.0,22095,44367
1,5.0,22095,87844
2,5.0,24732,138181
3,4.0,24732,93054
4,5.0,22095,101723


In [52]:
df_train["Rating"].value_counts()

Rating
5.0    530417
4.0    131846
3.0     27058
0.0     18000
2.0      7336
1.0      3722
Name: count, dtype: int64

In [53]:
df_train["Rating"].describe()

count    718379.000000
mean          4.564495
std           0.972494
min           0.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: Rating, dtype: float64

In [54]:
df_train["Rating"].isnull().sum()

0

## Filtering Criteria

(Based on the Netflix preprocessing)
<br>
1. Only keep rows where the user gave a rating of 5 (the filter below uses `Rating >= 5`).
2. Only keep recipes that were reviewed at least once (or at least a user-provided minimum).
3. Only keep users who reviewed at least five recipes (or at least a user-provided minimum).

In [56]:
# 1. Binarize the data: keep only rows whose Rating is >= 5.
#    The describe() output above reports a maximum of 5.0, so this keeps the 5-star rows only.
df_train=df_train[df_train["Rating"]>=5]

In [57]:
len(df_train)

530417

In [58]:
df_train.head()

Unnamed: 0,Rating,User_Id,Recipe_Id
0,5.0,22095,44367
1,5.0,22095,87844
2,5.0,24732,138181
4,5.0,22095,101723
5,5.0,22095,134551


In [59]:
def get_count(tp, id):
    """
    Count the rows of `tp` for each distinct value in column `id`.

    Behaves much like `value_counts` on that column: the frame is narrowed
    to the single key column, grouped on it with `as_index=False`, and the
    per-group row counts from `GroupBy.size()` are returned.

    Parameters
    ----------
    tp : pandas.DataFrame
        Table that contains the column named by `id`.
    id : str
        Name of the column to count by.

    Returns
    -------
    The object produced by `GroupBy.size()` for that grouping
    (NOTE(review): its exact type depends on the installed pandas version).
    """
    return (
        tp[[id]]                        # only the key column is needed for counting
        .groupby(id, as_index=False)    # keep the key as a column rather than the index
        .size()
    )

def filter_triplets(tp, min_uc=5, min_sc=0, user_col="User_Id", item_col="Recipe_Id"):
    """
    Filter interaction rows by item popularity, then by user activity.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interaction table with one row per review; must contain the columns
        named by `user_col` and `item_col`.
    min_uc : int, default 5
        Minimum number of rows a user needs in order to be kept.
        A value <= 0 disables the user filter.
    min_sc : int, default 0
        Minimum number of rows an item needs in order to be kept.
        A value <= 0 disables the item filter.
    user_col : str, default "User_Id"
        Name of the user identifier column.
    item_col : str, default "Recipe_Id"
        Name of the item (recipe) identifier column.

    Returns
    -------
    tuple
        `(filtered, usercount, itemcount)`: the filtered frame plus the
        `value_counts()` Series of `user_col` and `item_col`, both computed
        on the filtered frame.

    Notes
    -----
    The two filters run once each, items first and users second. Removing
    users can push some items back below `min_sc`; the function does not
    iterate to a fixed point.
    """
    # Only keep the triplets for items which were rated at least min_sc times
    if min_sc > 0:
        itemcount = tp[item_col].value_counts()
        tp = tp[tp[item_col].isin(itemcount.index[itemcount >= min_sc])]
    # Only keep the triplets for users who have rated at least min_uc items.
    # After doing this, some of the items may have fewer than min_sc ratings,
    # but that should only be a small proportion.
    if min_uc > 0:
        usercount = tp[user_col].value_counts()
        tp = tp[tp[user_col].isin(usercount.index[usercount >= min_uc])]
    # Recompute both counts on the filtered rows so they describe what is returned
    usercount = tp[user_col].value_counts()
    itemcount = tp[item_col].value_counts()
    return tp, usercount, itemcount

In [60]:
# Keep only users with at least 5 of the remaining (Rating >= 5) reviews;
# min_sc=0 leaves the per-recipe filter switched off.
raw_data, user_activity, item_popularity = filter_triplets(df_train, min_uc=5, min_sc=0)

In [61]:
len(user_activity), df_train["User_Id"].nunique(), len(item_popularity), df_train["Recipe_Id"].nunique()

(13353, 24417, 139354, 142227)

In [62]:
# Share of all possible (user, recipe) pairs that actually have a kept review:
# rows / (distinct users * distinct recipes).
# NOTE(review): this ratio is the density (fill rate) of the interaction matrix,
# even though the variable and the message call it "sparsity".
sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d reviewing events from %d users and %d recipes (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 502382 reviewing events from 13353 users and 139354 recipes (sparsity: 0.027%)


In [28]:
# Shuffle the ids of the users that passed filtering (the index of the user count Series).
# Seeding NumPy's global RNG first makes the permutation repeatable across runs.
# NOTE(review): presumably the shuffled order feeds a later per-user split — confirm.
unique_uid = user_activity.index
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,76535,33627,2005-02-15,4.0,5,177317
1,160497,75307,2005-10-24,4.0,23,170785
2,930021,100961,2008-11-30,4.0,31,165555
3,58439,154105,2007-03-24,4.0,44,177453
4,628951,14525,2008-02-16,5.0,45,142367
