<a href="https://colab.research.google.com/github/kashafali8/Deep_Reinforcement_Learning_Recommenders/blob/main/Dataset2_Diginetica/src/Diginetica_Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


1. Mount your Google Drive with all the scripts and point to the folder containing the source data files

In [1]:
# Mount Google Drive at /content/GoogleDrive (uses the Colab-only google.colab package)
from google.colab import drive
drive.mount('/content/GoogleDrive')
# Folder on the mounted drive that holds the Diginetica training CSVs;
# the read_csv calls further down use file names relative to this folder
PROJ_DIR = '/content/GoogleDrive/My Drive/DUKE/AIPI531_DRL/Project/Data/dataset-train-diginetica'
# Change the working directory to PROJ_DIR; this must run after the mount above
%cd $PROJ_DIR
# List the folder contents so the available input files are visible in the output
! ls

Drive already mounted at /content/GoogleDrive; to attempt to forcibly remount, call drive.mount("/content/GoogleDrive", force_remount=True).
/content/GoogleDrive/My Drive/DUKE/AIPI531_DRL/Project/Data/dataset-train-diginetica
product-categories.csv	train-clicks.csv      train-purchases.csv
products.csv		train-item-views.csv


2. Read the CSV files

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the item-view and purchase logs; both files are semicolon-delimited
# and are looked up relative to the current working directory.
views, purchases = (
    pd.read_csv(csv_name, sep=";")
    for csv_name in ("train-item-views.csv", "train-purchases.csv")
)


In [4]:
# Preview the first five rows of the item-view log
views.head(n=5)

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate
0,1,,81766,526309,2016-05-09
1,1,,31331,1031018,2016-05-09
2,1,,32118,243569,2016-05-09
3,1,,9654,75848,2016-05-09
4,1,,32627,1112408,2016-05-09


In [5]:
# Preview the first five rows of the purchase log
purchases.head(n=5)

Unnamed: 0,sessionId,userId,timeframe,eventdate,ordernumber,itemId
0,150,18278.0,17100868,2016-05-06,16421,25911
1,151,,6454547,2016-05-06,16290,175874
2,156,7.0,1721689387,2016-05-27,21173,35324
3,179,,343001,2016-05-09,16924,31233
4,246,34.0,2311046,2016-05-09,16936,34677


3. Preprocess Data

In [6]:
# Events are keyed by user rather than by session, because users overlap
# more between the two datasets.
# Both frames are narrowed to the same three columns.
cols = ["userId", "itemId", "eventdate"]
views = views.loc[:, cols]
purchases = purchases.loc[:, cols]

In [7]:
# Discard every row that has a missing value in any kept column, then cast
# userId to integer (the previews above show it loaded as float / empty).
int_user_id = {"userId": int}
views = views.dropna(axis="index", how="any").astype(int_user_id)
purchases = purchases.dropna(axis="index", how="any").astype(int_user_id)

In [8]:
# Convert eventdate to pandas datetimes in both frames so that rows can be
# ordered chronologically later on.
for frame in (views, purchases):
    frame["eventdate"] = pd.to_datetime(frame["eventdate"])

In [9]:
# Apply one shared column mapping to both frames.
# Note that userId becomes "session_id": users stand in for sessions here.
rename_cols = {
    "userId": "session_id",
    "itemId": "item_id",
    "eventdate": "timestamp",
}
views = views.rename(rename_cols, axis="columns")
purchases = purchases.rename(rename_cols, axis="columns")

In [10]:
# Full outer join of views and purchases on the (session_id, item_id, timestamp)
# triple. indicator=True adds a "_merge" column recording whether each row came
# from views only ("left_only"), purchases only ("right_only"), or both ("both").
# NOTE(review): timestamp looks to be day-resolution in the previews above, so
# repeated (session_id, item_id, timestamp) keys on either side would be
# cross-joined by the merge; confirm that this is acceptable.
merge_keys = ["session_id", "item_id", "timestamp"]
events = views.merge(purchases, how="outer", on=merge_keys, indicator=True)

In [11]:
# Rows that exist only in views ("left_only") were viewed but not purchased -> 0.
# Every other row ("right_only" or "both") appears in purchases -> 1.
viewed_only = events["_merge"].eq("left_only")
events["is_buy"] = np.where(viewed_only, 0, 1)
# The merge indicator is no longer needed once is_buy has been derived.
del events["_merge"]

In [12]:
# Replace the raw item and session ids with LabelEncoder integer codes.
# A separate encoder is fitted per column and kept under its own name.
item_encoder = LabelEncoder()
events["item_id"] = item_encoder.fit_transform(events["item_id"])

session_encoder = LabelEncoder()
events["session_id"] = session_encoder.fit_transform(events["session_id"])


In [13]:
# Order the events by encoded session_id (originally userId) and, within each
# session_id, by timestamp. Both keys are sorted ascending.
sort_keys = ["session_id", "timestamp"]
sorted_events = events.sort_values(sort_keys, ascending=True)


In [14]:
# Destination of the preprocessed event log on the mounted drive
output_path = '/content/GoogleDrive/My Drive/DUKE/AIPI531_DRL/Project/Data/Diginetica.csv'
# Write the sorted events as CSV, leaving out the DataFrame index column
sorted_events.to_csv(path_or_buf=output_path, index=False)

In [15]:
# List the Data folder so the written Diginetica.csv shows up in the output
!ls '/content/GoogleDrive/My Drive/DUKE/AIPI531_DRL/Project/Data/'


dataset-train-diginetica  dataset-train-diginetica.zip	Diginetica.csv
