Import packages

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from utility import to_pickled_df
import random
import datetime

**create sorted data**

In [2]:
# Load the raw transaction log and give its five columns explicit names.
data_directory = 'DATA'
raw_csv_path = os.path.join(data_directory, "HM_new.csv")
df = pd.read_csv(raw_csv_path, header=0)
df.columns = ['transid', 'timestamp', 'user_id', 'behavior', 'item_id']
df.head()

Unnamed: 0,transid,timestamp,user_id,behavior,item_id
0,0,2018/9/20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,1,663713001
1,1,2018/9/20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,1,541518023
2,2,2018/9/20,00007d2de826758b65a93dd24ce629ed66842531df6699...,1,505221004
3,3,2018/9/20,00007d2de826758b65a93dd24ce629ed66842531df6699...,1,685687003
4,4,2018/9/20,00007d2de826758b65a93dd24ce629ed66842531df6699...,1,685687004


In [3]:
### remove the transid column; it is not used in any later cell
df = df.drop(columns=['transid'])

In [5]:
# Number of distinct items, then number of distinct users.
for id_column in ('item_id', 'user_id'):
    print(len(df[id_column].unique()))

29574
222025


Add no interaction data

In [6]:
## sample 3000 items and users to generate a "not buy" dataset
# A dedicated, seeded generator makes the sampled negatives identical on every
# Restart & Run All, without touching the global `random` state.
negative_sampler = random.Random(42)
item_ids = negative_sampler.sample(list(df['item_id'].unique()), 3000)
user_ids = negative_sampler.sample(list(df['user_id'].unique()), 3000)

In [7]:
## Adding negative feedback (0 ranking) for instances of no interaction between items and users
# Build the user x item cross product column-wise instead of materialising one
# Python list per pair. Row order matches the nested loop it replaces:
# items vary slowest (repeat), users cycle fastest (tile).
n_sampled_items = len(item_ids)
n_sampled_users = len(user_ids)

# Convert prepared data into a dataframe
not_buy = pd.DataFrame({
    # dtype=object keeps the user ids as Python strings rather than a
    # fixed-width unicode array, which would be far larger in memory.
    "user_id": np.tile(np.asarray(user_ids, dtype=object), n_sampled_items),
    "item_id": np.repeat(np.asarray(item_ids), n_sampled_users),
    "behavior": 0,
})

not_buy.head()

Unnamed: 0,user_id,item_id,behavior
0,8338f8e22e4ca7e697dcf01ccac2172453f9dc1ca0cd6f...,631291001,0
1,259cba85a45a0776c93f3d21c21d2f131bed8bf8afdfd0...,631291001,0
2,fb29458005f3b676ba3b502c8bc7fe54ca4275235d856f...,631291001,0
3,7850b8c83511e9238109ce424aafee3cc725838c57ffbe...,631291001,0
4,012a200b2105592aea9e0e0f6e64326c374987c64270fc...,631291001,0


In [8]:
## Merge the positive and negative feedback into one single master dataframe
# The outer join keeps every sampled pair and every real transaction.
# Missing values (including `timestamp` on rows that come only from `not_buy`)
# are filled with 0, and the surviving `behavior_y` column is the one taken
# from the transaction data.
merged_feedback = pd.merge(not_buy, df, on=['user_id', 'item_id'], how='outer')
df = (
    merged_feedback
    .fillna(0)
    .drop('behavior_x', axis=1)
    # Cleaning up the column names
    .rename(columns={'behavior_y': 'behavior'})
)
df.head()

Unnamed: 0,user_id,item_id,timestamp,behavior
0,8338f8e22e4ca7e697dcf01ccac2172453f9dc1ca0cd6f...,631291001,0,0.0
1,259cba85a45a0776c93f3d21c21d2f131bed8bf8afdfd0...,631291001,0,0.0
2,fb29458005f3b676ba3b502c8bc7fe54ca4275235d856f...,631291001,0,0.0
3,7850b8c83511e9238109ce424aafee3cc725838c57ffbe...,631291001,0,0.0
4,012a200b2105592aea9e0e0f6e64326c374987c64270fc...,631291001,0,0.0


In [16]:
## Count positive (1) and negative (0) feedback rows after the merge
df.loc[:, 'behavior'].value_counts()

0.0    8998609
1.0    1048575
Name: behavior, dtype: int64

In [9]:
# Convert the parsed dates to Unix epoch seconds (float).
# Dividing the offset from the epoch by a one-second Timedelta does not depend on
# the platform's default integer width or on the datetime resolution, both of
# which `.astype(int) / 10**9` silently relies on.
parsed_dates = pd.to_datetime(df.timestamp, format="%Y/%m/%d")
df["timestamp"] = (parsed_dates - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

In [11]:
# From here on each user is treated as one session.
df = df.rename({'user_id': 'session_id'}, axis='columns')
df.head()

Unnamed: 0,session_id,item_id,timestamp,behavior
0,8338f8e22e4ca7e697dcf01ccac2172453f9dc1ca0cd6f...,631291001,0.0,0.0
1,259cba85a45a0776c93f3d21c21d2f131bed8bf8afdfd0...,631291001,0.0,0.0
2,fb29458005f3b676ba3b502c8bc7fe54ca4275235d856f...,631291001,0.0,0.0
3,7850b8c83511e9238109ce424aafee3cc725838c57ffbe...,631291001,0.0,0.0
4,012a200b2105592aea9e0e0f6e64326c374987c64270fc...,631291001,0.0,0.0


In [12]:
########## keep only users (sessions) with more than 2 interactions
interactions_per_session = df.groupby('session_id')['item_id'].transform('size')
df = df.loc[interactions_per_session > 2]
########## then keep only items with more than 2 interactions among the remaining rows
interactions_per_item = df.groupby('item_id')['session_id'].transform('size')
df = df.loc[interactions_per_item > 2]
######## transform to ids (item_id is not encoded in this cell)
session_encoder = LabelEncoder()
behavior_encoder = LabelEncoder()
session_codes = session_encoder.fit_transform(df['session_id'])
behavior_codes = behavior_encoder.fit_transform(df['behavior'])
########### the encoded behavior is stored as `is_buy` and replaces `behavior`
df = df.assign(session_id=session_codes, is_buy=behavior_codes).drop(columns='behavior')
########### sorted by user and timestamp
sorted_events = df.sort_values(by=['session_id', 'timestamp'])

In [15]:
# Preview the first rows of the per-session, time-ordered events.
sorted_events.head(5)

Unnamed: 0,session_id,item_id,timestamp,is_buy
9050197,0,583558001,1537488000.0,1
9050199,0,639677008,1537488000.0,1
9050200,0,640244003,1537488000.0,1
9217007,0,521269001,1537834000.0,1
9217008,0,666448006,1537834000.0,1


In [18]:
## Count positive (1) and negative (0) feedback rows after filtering
sorted_events.loc[:, 'is_buy'].value_counts()

0    8998609
1     908514
Name: is_buy, dtype: int64

In [19]:
# read item properties files
# The CSV location can be overridden with the HM_ARTICLES_CSV environment
# variable so the notebook can run on other machines; when the variable is
# unset, the previously hardcoded path is used unchanged.
articles_path = os.environ.get(
    "HM_ARTICLES_CSV",
    "/Users/chloeliu/Desktop/Duke/2023 sem1/aipi RL/final project/H&M Data/articles.csv",
)
articles = pd.read_csv(articles_path)
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [20]:
# keep only the article id and its product type number (renamed to categoryid in a later cell)
articles = articles.loc[:, ["article_id", "product_type_no"]]

In [21]:
# Attach each event's product type with a left join on the item id, then drop
# the join key that duplicates item_id.
event_with_prop = sorted_events.merge(
    articles,
    how='left',
    left_on=['item_id'],
    right_on=['article_id'],
).drop(columns=['article_id'])
event_with_prop.head()

Unnamed: 0,session_id,item_id,timestamp,is_buy,product_type_no
0,0,583558001,1537488000.0,1,265
1,0,639677008,1537488000.0,1,259
2,0,640244003,1537488000.0,1,275
3,0,521269001,1537834000.0,1,252
4,0,666448006,1537834000.0,1,252


In [22]:
# Count missing values per column after the left join.
event_with_prop.isna().sum()

session_id         0
item_id            0
timestamp          0
is_buy             0
product_type_no    0
dtype: int64

In [23]:
# label encode item_id (the category column is left as-is here)
item_encoder = LabelEncoder()
encoded_item_ids = item_encoder.fit_transform(event_with_prop['item_id'])
event_with_prop['item_id'] = encoded_item_ids
event_with_prop.head()

Unnamed: 0,session_id,item_id,timestamp,is_buy,product_type_no
0,0,8369,1537488000.0,1,265
1,0,15550,1537488000.0,1,259
2,0,15680,1537488000.0,1,275
3,0,3581,1537834000.0,1,252
4,0,19421,1537834000.0,1,252


In [24]:
# product_type_no is the category id used from here on.
event_with_prop = event_with_prop.rename(columns={'product_type_no': 'categoryid'})

In [25]:
# Preview the events with the renamed categoryid column.
event_with_prop.head(5)

Unnamed: 0,session_id,item_id,timestamp,is_buy,categoryid
0,0,8369,1537488000.0,1,265
1,0,15550,1537488000.0,1,259
2,0,15680,1537488000.0,1,275
3,0,3581,1537834000.0,1,252
4,0,19421,1537834000.0,1,252


**Make a dataframe with each item's corresponding one-hot-encoded category id.**

In [35]:
# make a dataframe with each unique item id and its one-hot encoded category id
unique_item_categories = event_with_prop[['item_id', 'categoryid']].drop_duplicates()
item_category = (
    pd.get_dummies(unique_item_categories, columns=['categoryid'])
    # collapse rows that share an item_id into a single indicator row
    .groupby('item_id')
    .sum()
)
item_category.head()

Unnamed: 0_level_0,categoryid_-1,categoryid_49,categoryid_57,categoryid_59,categoryid_60,categoryid_66,categoryid_67,categoryid_68,categoryid_69,categoryid_70,...,categoryid_349,categoryid_351,categoryid_495,categoryid_496,categoryid_499,categoryid_508,categoryid_511,categoryid_512,categoryid_515,categoryid_532
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# save files
# Create the output directory first so the writes do not fail when it is missing.
# NOTE(review): the input CSV is read from 'DATA' while these pickles are written
# to './Data'; the two are the same directory only on a case-insensitive
# filesystem. Confirm which location the downstream scripts expect.
os.makedirs('./Data', exist_ok=True)
event_with_prop.to_pickle('./Data/event_with_prop.pkl')
item_category.to_pickle('./Data/item_category.pkl')

# Important
After saving these files, remember to run `split_data.py` and `replay_buffer.py` to generate the following files:
- data_statis.pkl
- replay_buffer.pkl
- sampled_test.pkl
- sampled_train.pkl
- sampled_val.pkl