In [1]:
import pandas as pd 
import numpy as np

%matplotlib inline

## Order df

In [2]:
# Load the orders table from the local Instacart data directory (path is relative to the notebook).
order = pd.read_csv('data/instacart/orders.csv')

In [3]:
# Discard the columns that are not used later in the notebook; only the
# order/user identifiers and the day-of-week / hour-of-day fields remain.
order = order.drop(columns=['eval_set', 'days_since_prior_order', 'order_number'])
order.head()

Unnamed: 0,order_id,user_id,order_dow,order_hour_of_day
0,2539329,1,2,8
1,2398795,1,3,7
2,473747,1,3,12
3,2254736,1,4,7
4,431534,1,4,15


In [4]:
# Number of distinct user_id values in the orders table.
order['user_id'].nunique()

206209

In [5]:
# Number of distinct order_id values in the orders table.
order['order_id'].nunique()

3421083

## Product df

In [6]:
# Load the product catalogue from the local Instacart data directory.
prod = pd.read_csv('data/instacart/products.csv')

In [7]:
# Only product_id and product_name are needed downstream, so the aisle and
# department keys are removed.
prod = prod.drop(columns=['aisle_id', 'department_id'])
prod.head()

Unnamed: 0,product_id,product_name
0,1,Chocolate Sandwich Cookies
1,2,All-Seasons Salt
2,3,Robust Golden Unsweetened Oolong Tea
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...
4,5,Green Chile Anytime Sauce


In [8]:
# Number of distinct product_id values in the catalogue.
prod['product_id'].nunique()

49688

In [9]:
# Number of distinct product_name values; compare with the product_id count
# to see whether names map one-to-one onto ids.
prod['product_name'].nunique()

49688

## Order Products df

In [10]:
# Load the order -> product line items from the "train" order_products file.
df = pd.read_csv('data/instacart/order_products__train.csv')
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [11]:
# Reduce the line items to the (order_id, product_id) pairs used for the merges.
df = df.drop(columns=['add_to_cart_order', 'reordered'])
df.head()

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


In [12]:
# Number of distinct products that appear in at least one order line.
df['product_id'].nunique()

39123

In [13]:
# Number of distinct orders present in the order-lines table.
df['order_id'].nunique()

131209

## Merging

In [14]:
# Attach product_name to every order line through the shared "product_id" key.
# how='left' keeps every row of df. validate='many_to_one' makes pandas raise
# a MergeError if product_id is duplicated in prod, which would otherwise
# silently multiply order lines.
merged_df = pd.merge(df, prod, on='product_id', how='left', validate='many_to_one')
merged_df.head()

Unnamed: 0,order_id,product_id,product_name
0,1,49302,Bulgarian Yogurt
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese
2,1,10246,Organic Celery Hearts
3,1,49683,Cucumber Kirby
4,1,43633,Lightly Smoked Sardines in Olive Oil


In [15]:
# Sanity check: the count of distinct orders after the merge, for comparison
# with the count taken on df before merging.
merged_df['order_id'].nunique()

131209

In [16]:
# Attach user_id and the order timing columns to every order line through the
# shared "order_id" key. how='left' keeps every row of merged_df.
# validate='many_to_one' makes pandas raise a MergeError if order_id is
# duplicated in order, which would otherwise silently multiply order lines.
final_df = pd.merge(merged_df, order, on='order_id', how='left', validate='many_to_one')
final_df.head()

Unnamed: 0,order_id,product_id,product_name,user_id,order_dow,order_hour_of_day
0,1,49302,Bulgarian Yogurt,112108,4,10
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,112108,4,10
2,1,10246,Organic Celery Hearts,112108,4,10
3,1,49683,Cucumber Kirby,112108,4,10
4,1,43633,Lightly Smoked Sardines in Olive Oil,112108,4,10


In [17]:
# Build a "<order_dow>-<order_hour_of_day>" key (e.g. "4-10") identifying the
# weekly time slot of each order line. The parts are not zero-padded, so the
# keys sort lexicographically ("0-10" comes before "0-2").
final_df['time'] = (
    final_df['order_dow']
    .astype(str)
    .str.cat(final_df['order_hour_of_day'].astype(str), sep='-')
)
final_df.head()

Unnamed: 0,order_id,product_id,product_name,user_id,order_dow,order_hour_of_day,time
0,1,49302,Bulgarian Yogurt,112108,4,10,4-10
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,112108,4,10,4-10
2,1,10246,Organic Celery Hearts,112108,4,10,4-10
3,1,49683,Cucumber Kirby,112108,4,10,4-10
4,1,43633,Lightly Smoked Sardines in Olive Oil,112108,4,10,4-10


In [18]:
# The two source columns are now folded into 'time', so they are removed.
final_df = final_df.drop(columns=['order_dow', 'order_hour_of_day'])
final_df.head()

Unnamed: 0,order_id,product_id,product_name,user_id,time
0,1,49302,Bulgarian Yogurt,112108,4-10
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,112108,4-10
2,1,10246,Organic Celery Hearts,112108,4-10
3,1,49683,Cucumber Kirby,112108,4-10
4,1,43633,Lightly Smoked Sardines in Olive Oil,112108,4-10


In [19]:
# (rows, columns) of the fully merged order-line table.
final_df.shape

(1384617, 5)

In [20]:
# Largest order_id present, used to judge the cut-off applied in the next cell.
final_df['order_id'].max()

3421070

In [21]:
# Work on a smaller subset: keep only order lines whose order_id is below 50000.
final_df = final_df.loc[final_df['order_id'].lt(50000)]

In [22]:
# (rows, columns) remaining after the order_id filter.
final_df.shape

(20648, 5)

## Groupby

In [23]:
# Summarise every time slot: how many distinct users ordered in it, how many
# distinct products were bought, and the array of those distinct product names.
# The groupby is computed once, and the columns are assembled from Series that
# share the 'time' index, so they are aligned by label rather than by position.
by_time = final_df.groupby('time')
time_product_df = pd.DataFrame({
    'users_count': by_time['user_id'].nunique(),
    'items_count': by_time['product_name'].nunique(),
    'items': by_time['product_name'].unique(),
})
time_product_df.head()

Unnamed: 0_level_0,users_count,items_count,items
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-0,5,51,"[Vinyl Glove Powder Free, Organic Whipping Cre..."
0-1,2,26,"[Corn De Maiz Blanco Tortillas, Pure Mint With..."
0-10,33,306,"[Sauvignon Blanc, English Seedless Cucumber, C..."
0-11,18,171,"[Granny Smith Apples, Organic Raw Unfiltered A..."
0-12,48,520,"[Clementines, Bag, Banana, Kids! Chewables Pro..."


In [24]:
import string

# Text normaliser used on product names before rule mining.
def preprocess_text(text):
    """Return `text` lowercased with every `string.punctuation` character removed.

    Whitespace is left untouched, so punctuation that sat between two spaces
    leaves both spaces behind.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punctuation)

# Normalise every product name inside each time slot's item collection; each
# cell of 'items' becomes a plain list of cleaned names.
time_product_df['items'] = time_product_df['items'].map(
    lambda names: [preprocess_text(name) for name in names]
)

In [25]:
# Inspect the cleaned item lists (lowercased, punctuation removed).
time_product_df.head()

Unnamed: 0_level_0,users_count,items_count,items
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-0,5,51,"[vinyl glove powder free, organic whipping cre..."
0-1,2,26,"[corn de maiz blanco tortillas, pure mint with..."
0-10,33,306,"[sauvignon blanc, english seedless cucumber, c..."
0-11,18,171,"[granny smith apples, organic raw unfiltered a..."
0-12,48,520,"[clementines bag, banana, kids chewables probi..."


## Training

In [26]:
from apyori import apriori

In [27]:
# from mlxtend.frequent_patterns import apriori

In [28]:
# One "transaction" per time slot: the list of cleaned product names bought in it.
transactions = list(time_product_df['items'])

In [29]:
# Mine pairwise association rules (max_length=2) with apyori.
# The confidence threshold keyword must be spelled `min_confidence`: apyori
# takes its options through **kwargs, so a misspelled name is silently ignored
# and the threshold falls back to the library default.
# NOTE(review): `min_length` is not among apyori's documented options
# (min_support, min_confidence, min_lift, max_length) - confirm it has any effect.
rules = apriori(transactions=transactions, min_support=0.00030, min_confidence=0.01, min_lift=3, min_length=2, max_length=2)

In [30]:
# Materialise the rules returned by apriori into a list so they can be
# indexed and iterated more than once.
results = list(rules)

In [31]:
def inspect(results):
    '''
    Flatten apriori output into rows suitable for a pandas DataFrame.

    Parameters
    ----------
    results : iterable
        Records indexable as ``(items, support, ordered_statistics)``, where
        each ordered statistic is indexable as
        ``(items_base, items_add, confidence, lift)``.

    Returns
    -------
    list of tuple
        One ``(lhs, rhs, support, confidence, lift)`` tuple per record, taken
        from the record's first ordered statistic. Records that carry no
        ordered statistics are skipped.

    Notes
    -----
    A side holding exactly one item is reported as that item unchanged. An
    empty side becomes ``''`` and a multi-item side becomes its items sorted
    and joined with ``', '``, instead of raising IndexError or keeping one
    arbitrary element.
    '''
    def _label(items):
        # Describe one side of a rule without indexing into a possibly empty set.
        items = tuple(items)
        if len(items) == 1:
            return items[0]
        return ', '.join(sorted(str(item) for item in items))

    rows = []
    for result in results:
        ordered_statistics = result[2]
        if not ordered_statistics:
            continue
        first = ordered_statistics[0]
        rows.append((_label(first[0]), _label(first[1]), result[1], first[2], first[3]))
    return rows

In [32]:
# Tabulate the flattened rules: one row per rule with its two items and the
# support / confidence / lift metrics.
resultsinDataFrame = pd.DataFrame(
    data=inspect(results),
    columns=['Item #1', 'Item #2', 'Support', 'Confidence', 'Lift'],
)
resultsinDataFrame.head()

Unnamed: 0,Item #1,Item #2,Support,Confidence,Lift
0,0 fat free organic milk,allergen reducer,0.006803,0.5,73.5
1,0 fat free organic milk,baked rice and corn puffs aged white cheddar,0.006803,0.5,18.375
2,0 fat free organic milk,bartlett pears,0.006803,0.5,5.25
3,0 fat free organic milk,bay leaves,0.006803,0.5,18.375
4,0 fat free organic milk,black bean salsa,0.006803,0.5,73.5
