In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
products_df = pd.read_csv("data\data_processed_gte25\products.csv")
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,water seltzer sparkling water,beverages
2,34,Peanut Butter Cereal,121,14,cereal,breakfast
3,45,European Cucumber,83,4,fresh vegetables,produce
4,79,Wild Albacore Tuna No Salt Added,95,15,canned meat seafood,canned goods


In [3]:
orders_df = pd.read_csv("data\data_processed_gte25\orders.csv")
orders_df.head()

Unnamed: 0,order_id,product_list,add_to_cart_order,order_dow,order_hour_of_day,length
0,2,33120,1,5,9,3
1,2,17794,2,5,9,3
2,2,9327,4,5,9,3
3,3,17668,1,5,17,5
4,3,24838,2,5,17,5


In [4]:
print("Shape of orders dataframe: ", orders_df.shape)
print("Shape of productss dataframe: ", products_df.shape)

Shape of orders dataframe:  (323705, 6)
Shape of productss dataframe:  (2965, 6)


In [5]:
products_df["product_id"]

0           1
1          10
2          34
3          45
4          79
        ...  
2960    49605
2961    49610
2962    49621
2963    49667
2964    49683
Name: product_id, Length: 2965, dtype: int64

In [6]:
products_df["aisle_id"].max()

134

In [7]:
products_df["department_id"].min()

1

In [8]:
products_df["department_id"].nunique()

21

In [9]:
orders_df["product_list"]

0         33120
1         17794
2          9327
3         17668
4         24838
          ...  
323700     7131
323701    25890
323702     7010
323703     3142
323704    18670
Name: product_list, Length: 323705, dtype: int64

In [10]:
# products_list in the orders_df and products_id in the products_df have same values, so renaming
orders_df.rename(columns= {'product_list': 'product_id'}, inplace=True)
orders_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,order_dow,order_hour_of_day,length
0,2,33120,1,5,9,3
1,2,17794,2,5,9,3
2,2,9327,4,5,9,3
3,3,17668,1,5,17,5
4,3,24838,2,5,17,5


In [11]:
# Merge the dataframes based on the common column "product_id"
merged_df = pd.merge(orders_df, products_df, on='product_id', how='left')

In [12]:
# Create the "label" column
merged_df['label'] = 0

In [13]:
merged_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,order_dow,order_hour_of_day,length,product_name,aisle_id,department_id,aisle,department,label
0,2,33120,1,5,9,3,Organic Egg Whites,86,16,eggs,dairy eggs,0
1,2,17794,2,5,9,3,Carrots,83,4,fresh vegetables,produce,0
2,2,9327,4,5,9,3,Garlic Powder,104,13,spices seasonings,pantry,0
3,3,17668,1,5,17,5,Unsweetened Chocolate Almond Breeze Almond Milk,91,16,soy lactosefree,dairy eggs,0
4,3,24838,2,5,17,5,Unsweetened Almondmilk,91,16,soy lactosefree,dairy eggs,0


In [14]:
df1 = merged_df[['product_id', 'order_dow', 'order_hour_of_day', 'product_name', 'aisle_id', 'department_id']].copy()
df1

Unnamed: 0,product_id,order_dow,order_hour_of_day,product_name,aisle_id,department_id
0,33120,5,9,Organic Egg Whites,86,16
1,17794,5,9,Carrots,83,4
2,9327,5,9,Garlic Powder,104,13
3,17668,5,17,Unsweetened Chocolate Almond Breeze Almond Milk,91,16
4,24838,5,17,Unsweetened Almondmilk,91,16
...,...,...,...,...,...,...
323700,7131,4,16,Light Semisoft Cheese,21,16
323701,25890,0,13,Boneless Skinless Chicken Breasts,49,12
323702,7010,0,13,Organic White Basmati Rice,63,9
323703,3142,0,13,Organic Short Grain Brown Rice,63,9


In [15]:
df1["aisle_id"].max()

134

In [16]:
# Assuming your DataFrame is named 'df' and the column containing product IDs is 'product_id'

# Create a dictionary mapping unique product IDs to consecutive integers
unique_ids = {prod_id: i for i, prod_id in enumerate(df1['product_id'].unique())}
unique_aisle = {a_id: j for j, a_id in enumerate(df1['aisle_id'].unique())}

# Add a new column 'unique_product_id' to the DataFrame with the assigned unique IDs
df1['unique_product_id'] = df1['product_id'].map(unique_ids)+1
df1['unique_aisle_id'] = df1['aisle_id'].map(unique_aisle)+1

# Verify the updated DataFrame
df1

Unnamed: 0,product_id,order_dow,order_hour_of_day,product_name,aisle_id,department_id,unique_product_id,unique_aisle_id
0,33120,5,9,Organic Egg Whites,86,16,1,1
1,17794,5,9,Carrots,83,4,2,2
2,9327,5,9,Garlic Powder,104,13,3,3
3,17668,5,17,Unsweetened Chocolate Almond Breeze Almond Milk,91,16,4,4
4,24838,5,17,Unsweetened Almondmilk,91,16,5,4
...,...,...,...,...,...,...,...,...
323700,7131,4,16,Light Semisoft Cheese,21,16,934,36
323701,25890,0,13,Boneless Skinless Chicken Breasts,49,12,68,38
323702,7010,0,13,Organic White Basmati Rice,63,9,316,41
323703,3142,0,13,Organic Short Grain Brown Rice,63,9,464,41


In [17]:
df2 = df1[['unique_product_id', 'order_dow', 'order_hour_of_day', 'unique_aisle_id', 'department_id','product_name']].copy()
df2

Unnamed: 0,unique_product_id,order_dow,order_hour_of_day,unique_aisle_id,department_id,product_name
0,1,5,9,1,16,Organic Egg Whites
1,2,5,9,2,4,Carrots
2,3,5,9,3,13,Garlic Powder
3,4,5,17,4,16,Unsweetened Chocolate Almond Breeze Almond Milk
4,5,5,17,4,16,Unsweetened Almondmilk
...,...,...,...,...,...,...
323700,934,4,16,36,16,Light Semisoft Cheese
323701,68,0,13,38,12,Boneless Skinless Chicken Breasts
323702,316,0,13,41,9,Organic White Basmati Rice
323703,464,0,13,41,9,Organic Short Grain Brown Rice


In [18]:
df2.rename(columns= {'unique_product_id': 'product_id'}, inplace=True)
df2.head()

Unnamed: 0,product_id,order_dow,order_hour_of_day,unique_aisle_id,department_id,product_name
0,1,5,9,1,16,Organic Egg Whites
1,2,5,9,2,4,Carrots
2,3,5,9,3,13,Garlic Powder
3,4,5,17,4,16,Unsweetened Chocolate Almond Breeze Almond Milk
4,5,5,17,4,16,Unsweetened Almondmilk


In [19]:
df2.rename(columns= {'unique_aisle_id': 'aisle_id'}, inplace=True)
df2.head()

Unnamed: 0,product_id,order_dow,order_hour_of_day,aisle_id,department_id,product_name
0,1,5,9,1,16,Organic Egg Whites
1,2,5,9,2,4,Carrots
2,3,5,9,3,13,Garlic Powder
3,4,5,17,4,16,Unsweetened Chocolate Almond Breeze Almond Milk
4,5,5,17,4,16,Unsweetened Almondmilk


In [20]:
# Creating a dictionary where product_id is key and aisle_id and department_id is values.
dictionary = {}

for index, row in df2.iterrows():
    key = row['product_id']
    value1 = row['aisle_id']
    value2 = row['department_id']
    dictionary[key] = [value1, value2]

print(dictionary)

{1: [1, 16], 2: [2, 4], 3: [3, 13], 4: [4, 16], 5: [4, 16], 6: [5, 4], 7: [2, 4], 8: [5, 4], 9: [6, 17], 10: [2, 4], 11: [7, 1], 12: [8, 7], 13: [9, 14], 14: [5, 4], 15: [2, 4], 16: [10, 4], 17: [2, 4], 18: [11, 4], 19: [12, 19], 20: [13, 15], 21: [14, 1], 22: [15, 13], 23: [16, 16], 24: [17, 7], 25: [12, 19], 26: [18, 7], 27: [8, 7], 28: [18, 7], 29: [19, 7], 30: [20, 1], 31: [21, 14], 32: [7, 1], 33: [22, 16], 34: [23, 7], 35: [13, 15], 36: [15, 13], 37: [19, 7], 38: [19, 7], 39: [24, 19], 40: [2, 4], 41: [7, 1], 42: [25, 16], 43: [26, 13], 44: [27, 18], 45: [5, 4], 46: [2, 4], 47: [28, 19], 48: [11, 4], 49: [29, 7], 50: [11, 4], 51: [11, 4], 52: [23, 7], 53: [2, 4], 54: [30, 3], 55: [31, 20], 56: [22, 16], 57: [11, 4], 58: [32, 16], 59: [33, 9], 60: [2, 4], 61: [5, 4], 62: [34, 1], 63: [34, 1], 64: [35, 12], 65: [36, 16], 66: [37, 1], 67: [7, 1], 68: [38, 12], 69: [4, 16], 70: [39, 17], 71: [5, 4], 72: [7, 1], 73: [11, 4], 74: [22, 16], 75: [16, 16], 76: [25, 16], 77: [26, 13], 78: 

In [21]:
# Save the dictionary as a pickle file
with open('dictionary.pkl', 'wb') as file:
    pickle.dump(dictionary, file)

print(f"Dictionary saved as 'dictionary.pkl'.")

Dictionary saved as 'dictionary.pkl'.


In [22]:
# Creating a dictionary where product_id is key and product_name is the value.
dictionary2 = {}

for index, row in df2.iterrows():
    key = row['product_id']
    value = row['product_name']
    dictionary2[key] = [value]

print(dictionary2)

# Save the dictionary as a pickle file
with open('prod_dict.pkl', 'wb') as file1:
    pickle.dump(dictionary2, file1)

print(f"Product dictionary saved as 'prod_dict.pkl'.")

{1: ['Organic Egg Whites'], 2: ['Carrots'], 3: ['Garlic Powder'], 4: ['Unsweetened Chocolate Almond Breeze Almond Milk'], 5: ['Unsweetened Almondmilk'], 6: ['Lemons'], 7: ['Organic Ginger Root'], 8: ['Organic Baby Spinach'], 9: ['One Ply Choose A Size Big Roll Paper Towel Rolls'], 10: ['Artichokes'], 11: ['Pineapple Chunks'], 12: ['Orange Juice'], 13: ['Fruit & Nutty Almonds Raisins Cranberries Pecans Granola'], 14: ['Baby Spinach'], 15: ['Green Beans'], 16: ['Organic Cilantro'], 17: ['Organic Butterhead (Boston, Butter, Bibb) Lettuce'], 18: ['Organic Strawberries'], 19: ['Tortilla Strips Restaurant Style'], 20: ['Traditional Refried Beans'], 21: ['Teriyaki & Pineapple Chicken Meatballs'], 22: ['Extra Virgin Olive Oil'], 23: ['French Vanilla Coffee Creamer'], 24: ['Ruby Red Grapefruit Juice'], 25: ['Original Potato Chips'], 26: ['Soda'], 27: ['Lemonade'], 28: ['Diet Tonic Water'], 29: ['Sparkling Natural Mineral Water'], 30: ['Organic Mini Homestyle Waffles'], 31: ['Organic Mesa Sunris

In [23]:
df2.drop(['product_name'], axis=1, inplace=True)

In [24]:
# creating positive and negative sampling
import random
from tqdm import tqdm

df_preprocessed = pd.DataFrame()

for i in range(0, 7):
    for j in range(0, 24):
        filtered = df2[(df2['order_dow'] == i) & (df2['order_hour_of_day'] == j)]
        no_s = filtered.shape[0]
        filtered['target'] = 1
        df_preprocessed = pd.concat([df_preprocessed, filtered], ignore_index=True, axis=0)
        # print(filtered)

        filtered_keys =  set(filtered['product_id'])
        # print(len(filtered_keys))
        # print(filtered_keys)

        filtered_dict = {key: value for key, value in dictionary.items() if key not in filtered_keys}
        # print(filtered_dict.keys())
        # print(filtered_dict)


        random_pairs = np.random.choice(list(filtered_dict.keys()), no_s, replace=True)
        # print(random_pairs)
        # random_dict = dict(random_pairs)

        # print(random_dict)

        data = [{'product_id': item,'order_dow': i, 'order_hour_of_day':j ,'aisle_id': dictionary[item][0], 'department_id': dictionary[item][1], 'target':0}
                for item in random_pairs]

        neg_df = pd.DataFrame(data)

        df_preprocessed = pd.concat([df_preprocessed, neg_df], ignore_index=True, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['target'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

In [25]:
df_preprocessed

Unnamed: 0,product_id,order_dow,order_hour_of_day,aisle_id,department_id,target
0,315,0,0,20,1,1
1,103,0,0,11,4,1
2,300,0,0,5,4,1
3,881,0,0,30,3,1
4,50,0,0,11,4,1
...,...,...,...,...,...,...
647405,1773,6,23,58,19,0
647406,1192,6,23,44,16,0
647407,1435,6,23,41,9,0
647408,1877,6,23,17,7,0


In [26]:
df_preprocessed['target'].value_counts()

target
1    323705
0    323705
Name: count, dtype: int64

In [27]:
df_preprocessed['product_id'].nunique()

2965

In [28]:
df_preprocessed.to_csv('data\Preprocessed_instacart.csv', index=False)

## Checking out the most bought things on a saturday at 2 pm to validate the predictions

In [36]:
# selected_rows = merged_df[(merged_df["order_dow"] == 6) & (merged_df["order_hour_of_day"] == 14)]
# print(selected_rows.head(10))

     order_id  product_id  add_to_cart_order  order_dow  order_hour_of_day  \
307        99       38200                  1          6                 14   
308        99       35176                  3          6                 14   
309        99       46584                  5          6                 14   
729       232        8193                  1          6                 14   
730       232       17794                  2          6                 14   
731       232       44293                  3          6                 14   
732       232       39055                  4          6                 14   
733       232       21137                  5          6                 14   
867       278       15269                  1          6                 14   
868       278        3880                  2          6                 14   

     length                                     product_name  aisle_id  \
307       3                                      Apple Juice       