In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from scipy.sparse import coo_matrix, csr_matrix, save_npz
from scipy.sparse.linalg import svds
import implicit
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle
import ast

# make pandas show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#reading dataset from parquet file
df = pd.read_parquet('../datasets/train.parquet')
df.head()

Unnamed: 0,date,userId,sessionId,pageType,itemId,category,productPrice,oldProductPrice
0,2019-08-05 19:30:37,00172f1d9a71e9a8de0aa34288a6b19b,e8167c23f8ac2f9be979c32380e0fc2b7e94941e917d30...,productDetail,83472aea4051c00d031b01ff42ef73fc,"[""kadın çanta"",""omuz askılı çanta""]",622.0,1220.0
1,2019-08-31 16:53:55,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,category,[],"[""seyahat samsonite"",""laptop çantası""]",,
2,2019-08-31 16:53:29,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,main,[],[],,
3,2019-08-31 16:53:43,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,category,[],"[""seyahat samsonite"",""laptop çantası""]",,
4,2019-08-31 16:54:13,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,productDetail,d6afa22ab475d41e7dc9b721f3f795ad,"[""seyahat samsonite"",""laptop çantası""]",389.0,389.0


# DATA ANALYSIS

## Null Kontrolu

In [3]:
df.userId.nunique()

30133

In [4]:
len(df)

691842

In [5]:
# null check
df.isnull().sum()

date                    0
userId                  0
sessionId               0
pageType                0
itemId                  0
category                0
productPrice       473857
oldProductPrice    473857
dtype: int64

In [6]:
df.pageType.value_counts()

pageType
category         344349
productDetail    287731
main              40367
cart              18790
success             605
Name: count, dtype: int64


productPrice ile oldProductPrice beraber bir sekildemi nulllar onu kontrol edecegim

In [7]:
# check if productPrice and oldProductPrice are null at the same time always (row wise)
productPrice_null_oldProductPrice_not_null = len(df[(df['productPrice'].isnull()) & (df['oldProductPrice'].notnull())])

# oldProductPrice null iken productPrice'ın null olmadığı durumlar
oldProductPrice_null_productPrice_not_null = len(df[(df['oldProductPrice'].isnull()) & (df['productPrice'].notnull())])

# İki durumun toplamı
total = productPrice_null_oldProductPrice_not_null + oldProductPrice_null_productPrice_not_null
print(total)


0


Evet productPrice ile oldProductPrice beraber bir sekilde nullar, yani biri null iken digeri dolu olmuyor. Bunlari doldurmak icin bir yol bulmamiz gerekiyor.

In [8]:
# pageType'a gore null check
print("productPrice Null Counts")
for pageType in df.pageType.unique():
    print(pageType, df[df.pageType == pageType]['productPrice'].isnull().sum())

productPrice Null Counts
productDetail 69746
category 344349
main 40367
cart 18790
success 605


fiyatlar category, main, cart kisminda null gelmesi normal oyle olmasi gerekiyor zaten ama productDetail kisminda null olmamasi gerekiyordu bunu kurtalabilir miyiz diye bakacagim.

Bunun icin urunun fiyati acaba bir onceki sessionlarda varmiydi ona bakacagim. Eger varsa onu kullanacagim yoksa null birakacagim.

In [9]:
# pageType'a gore null check
print("productPrice Null Counts")
for pageType in df.pageType.unique():
    print(pageType, df[df.pageType == pageType]['productPrice'].isnull().sum())

productPrice Null Counts
productDetail 69746
category 344349
main 40367
cart 18790
success 605


In [10]:

df['date'] = pd.to_datetime(df['date'])
# Veri setini itemId ve date bazında sıralama
df = df.sort_values(by=['itemId', 'date'])

# En yakın dolu değerle doldurma fonksiyonu
def fill_missing_with_nearest(df, column):
    dataframe = df.copy()
    for idx, row in dataframe.iterrows():
        if pd.isna(row[column]):
            # Aynı itemId içindeki dolu değerleri bul
            non_na_values = dataframe[~dataframe[column].isna()]
            if not non_na_values.empty:
                # En yakın tarihi bul
                nearest_idx = (non_na_values['date'] - row['date']).abs().idxmin()
                dataframe.at[idx, column] = dataframe.at[nearest_idx, column]
    return dataframe

# Her itemId için eksik değerleri doldurma
df = df.groupby('itemId').apply(lambda group: fill_missing_with_nearest(group, 'productPrice')).reset_index(drop=True)
df = df.groupby('itemId').apply(lambda group: fill_missing_with_nearest(group, 'oldProductPrice')).reset_index(drop=True)


  df = df.groupby('itemId').apply(lambda group: fill_missing_with_nearest(group, 'productPrice')).reset_index(drop=True)
  df = df.groupby('itemId').apply(lambda group: fill_missing_with_nearest(group, 'oldProductPrice')).reset_index(drop=True)


tekrardan null kontrolu yapiyorum

In [11]:
# pageType'a gore null check
print("productPrice Null Counts")
for pageType in df.pageType.unique():
    print(pageType, df[df.pageType == pageType]['productPrice'].isnull().sum())

productPrice Null Counts
productDetail 2
cart 18790
success 605
category 344349
main 40367


productDetail'da 2 tane null var bu sayi 700.000 satirlik veride ihmal edilebilir oldugu icin dropluyorum

In [12]:
# pageType==productDetail ise ve null varsa dropla
df = df[~((df.pageType == 'productDetail') & (df.productPrice.isnull()))]

In [13]:
# pageType'a gore null check
print("productPrice Null Counts")
for pageType in df.pageType.unique():
    print(pageType, df[df.pageType == pageType]['productPrice'].isnull().sum())

productPrice Null Counts
productDetail 0
cart 18790
success 605
category 344349
main 40367


pagetype== productDetail haricinde zaten urunun fiyati null gelecekti modelin fiyat konusundaki agirligi ne olursa olsun fiyat 0 ile doldurursam carpim 0 olacagindan 0 ile doldurabilirim diye dusundum ancak success kismini doldurabilirim fakat orada bazen birden fazla urun oluyor ve bunun icin product price kismini listeye cevirmem gerekiyor, 605/700.000 satirlik veri icin bunu yapmak mantikli gelmedi o sebeple 0 ile doldurmayi tercih ettim.


In [14]:
df['productPrice'] = df['productPrice'].fillna(0)
df['oldProductPrice'] = df['oldProductPrice'].fillna(0)

In [15]:
df.isnull().sum()

date               0
userId             0
sessionId          0
pageType           0
itemId             0
category           0
productPrice       0
oldProductPrice    0
dtype: int64

artik null verimiz yok

In [16]:
df.head()

Unnamed: 0,date,userId,sessionId,pageType,itemId,category,productPrice,oldProductPrice
0,2019-08-03 21:12:27,9c88dc8a43a48e2fcc648467971a17c2,1c50b964c487bfbc17945c636ec26c9189bca2739b2ea5...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0
1,2019-08-04 18:27:44,2d0686f5e78772987a66197305a22925,b4d9b6249354494000a847615eeb0cd8e99165a6e3ff0e...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0
2,2019-08-04 21:33:40,9c88dc8a43a48e2fcc648467971a17c2,5ebb9403eba9e14b4cb5184da53542ad27e680da2bd687...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0
3,2019-08-05 06:34:26,4c59bebe04fad7eae9c5235acdb3201a,687a6d09b1dcaf07dd20a47901cb46f7d938ee097e45bf...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0
4,2019-08-05 06:34:30,4c59bebe04fad7eae9c5235acdb3201a,687a6d09b1dcaf07dd20a47901cb46f7d938ee097e45bf...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0


# duplicate kontrolu

In [17]:
# duplicate check
df.duplicated().sum()


15426

In [18]:
df = df.drop_duplicates()

In [19]:
len(df)

676414

In [20]:
df.head()

Unnamed: 0,date,userId,sessionId,pageType,itemId,category,productPrice,oldProductPrice
0,2019-08-03 21:12:27,9c88dc8a43a48e2fcc648467971a17c2,1c50b964c487bfbc17945c636ec26c9189bca2739b2ea5...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0
1,2019-08-04 18:27:44,2d0686f5e78772987a66197305a22925,b4d9b6249354494000a847615eeb0cd8e99165a6e3ff0e...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0
2,2019-08-04 21:33:40,9c88dc8a43a48e2fcc648467971a17c2,5ebb9403eba9e14b4cb5184da53542ad27e680da2bd687...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0
3,2019-08-05 06:34:26,4c59bebe04fad7eae9c5235acdb3201a,687a6d09b1dcaf07dd20a47901cb46f7d938ee097e45bf...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0
4,2019-08-05 06:34:30,4c59bebe04fad7eae9c5235acdb3201a,687a6d09b1dcaf07dd20a47901cb46f7d938ee097e45bf...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[""kadın ayakkabi"",""günlük ayakkabı""]",169.0,558.0


kac adet user var

In [44]:
df.userId.nunique()

30133

hangi urunler kac kere satilmis 

In [21]:
# Filter the DataFrame to get rows where pageType is 'success'
success_df = df[df['pageType'] == 'success']

# Count the occurrences of each itemId
item_success_count = success_df['itemId'].value_counts().reset_index()

# Rename the columns for clarity
item_success_count.columns = ['itemId', 'success_count']

# Display the result
print("Items that went to success and their counts:")
item_success_count.head(10)

Items that went to success and their counts:


Unnamed: 0,itemId,success_count
0,[],62
1,f317686d80c6f08ed1c76ed22a434ec1,7
2,dd9531a916d3260dc61df5f400e2792b,4
3,ab75332a5a6eb7ef0b50563f5e4421c0,4
4,783c654fc65aeabd3c0ea3d5950b73a3,4
5,"[""fd38ce64a2d0f3d3072dc0c0a408d9be"",""a11bb2426...",3
6,"[""2e9a0492b2a2add658430c1f146764ce"",""d0fb2ac5f...",3
7,002aec8f0aa690203bd0626ff08f50d4,3
8,97de4ec3265f46c53e24d2f6e490b191,3
9,"[""08dd068ebe6a70780b1555646a6a9ca4"",""add9cce0a...",2


bazi urunler success'de bos gelmis bunlari ilgili sessiondan sepete eklenen urunlerle doldurabilip bir varsayim yapilabilir.

In [22]:
# Create a copy of the DataFrame to work on
df_copy = df.copy()

# Create a mapping of sessionId to itemId for 'cart' pageType
cart_items = df_copy[df_copy['pageType'] == 'cart'][['sessionId', 'itemId']].drop_duplicates()

# Create a dictionary for quick lookup
cart_item_dict = cart_items.set_index('sessionId')['itemId'].to_dict()

# Fill empty itemIds in 'success' pageType with corresponding 'cart' itemIds
def fill_item_id(row):
    if row['pageType'] == 'success' and row['itemId'] == '[]':
        return cart_item_dict.get(row['sessionId'], row['itemId'])
    return row['itemId']

df_copy['itemId'] = df_copy.apply(fill_item_id, axis=1)

# Filter the DataFrame to get rows where pageType is 'success'
success_df = df_copy[df_copy['pageType'] == 'success']

# Count the occurrences of each itemId
item_success_count = success_df['itemId'].value_counts().reset_index()

# Rename the columns for clarity
item_success_count.columns = ['itemId', 'success_count']

# Display the result
print("Items that went to success and their counts:")
item_success_count.head(10)

Items that went to success and their counts:


Unnamed: 0,itemId,success_count
0,[],61
1,f317686d80c6f08ed1c76ed22a434ec1,7
2,dd9531a916d3260dc61df5f400e2792b,4
3,ab75332a5a6eb7ef0b50563f5e4421c0,4
4,783c654fc65aeabd3c0ea3d5950b73a3,4
5,"[""fd38ce64a2d0f3d3072dc0c0a408d9be"",""a11bb2426...",3
6,"[""2e9a0492b2a2add658430c1f146764ce"",""d0fb2ac5f...",3
7,002aec8f0aa690203bd0626ff08f50d4,3
8,97de4ec3265f46c53e24d2f6e490b191,3
9,"[""08dd068ebe6a70780b1555646a6a9ca4"",""add9cce0a...",2


In [23]:
len(df[(df.itemId == '[]') & (df.pageType == 'success')])

62

In [24]:
len(df_copy[(df_copy.itemId == '[]') & (df_copy.pageType == 'success')])

61

1 tane success gelen satirin urunlerini bu varsayimla doldurmus olduk, success score icin cok onemli bir etkilesim oldugundan bu onemliydi. Ama sadece 1 tane pek isimize yaramaz o sebeple productDetail ile doldurmaya calisacagim

In [25]:
# Create a mapping of sessionId to itemId for 'productDetail' pageType
productDetail_items = df_copy[df_copy['pageType'] == 'productDetail'][['sessionId', 'itemId']].drop_duplicates()

# Create a dictionary for quick lookup
productDetail_item_dict = productDetail_items.set_index('sessionId')['itemId'].to_dict()

# Fill empty itemIds in 'success' pageType with corresponding 'productDetail' itemIds
def fill_item_id(row):
    if row['pageType'] == 'success' and row['itemId'] == '[]':
        return productDetail_item_dict.get(row['sessionId'], row['itemId'])
    return row['itemId']

df_copy['itemId'] = df_copy.apply(fill_item_id, axis=1)

# Filter the DataFrame to get rows where pageType is 'success'
success_df = df_copy[df_copy['pageType'] == 'success']

# Count the occurrences of each itemId
item_success_count = success_df['itemId'].value_counts().reset_index()

# Rename the columns for clarity
item_success_count.columns = ['itemId', 'success_count']

# Display the result
print("Items that went to success and their counts:")
item_success_count.head(10)

Items that went to success and their counts:


Unnamed: 0,itemId,success_count
0,[],50
1,f317686d80c6f08ed1c76ed22a434ec1,7
2,dd9531a916d3260dc61df5f400e2792b,4
3,ab75332a5a6eb7ef0b50563f5e4421c0,4
4,783c654fc65aeabd3c0ea3d5950b73a3,4
5,"[""fd38ce64a2d0f3d3072dc0c0a408d9be"",""a11bb2426...",3
6,"[""2e9a0492b2a2add658430c1f146764ce"",""d0fb2ac5f...",3
7,002aec8f0aa690203bd0626ff08f50d4,3
8,97de4ec3265f46c53e24d2f6e490b191,3
9,4f6e4a3d25267ff13fe3ee26279929d2,2


In [26]:
len(df[(df.itemId == '[]') & (df.pageType == 'success')])

62

In [27]:
len(df_copy[(df_copy.itemId == '[]') & (df_copy.pageType == 'success')])

50

totalde 12 satiri doldurmus olduk, 12/700.000 veri icin cok onemli bir etkilesim olmayacaktir ama yinede doldurmus olduk.

In [28]:
df = df_copy

In [29]:
#  inceleme yapabilmek icin sessionid bazli gruplayalim ve listeleri olusturalim
#date	userId	sessionId	pageType	itemId	category	productPrice	oldProductPrice

session_df = df.groupby('sessionId').agg({'date': list, 'userId': 'first', 'pageType': list, 'itemId': list, 'category': list, 'productPrice': list, 'oldProductPrice': list}).reset_index()
session_df.head()

Unnamed: 0,sessionId,date,userId,pageType,itemId,category,productPrice,oldProductPrice
0,000105bc68252a56fdddca8281efb9c5066fb07657a26f...,"[2019-08-17 08:55:01, 2019-08-17 08:55:12, 201...",ccddb7b61a1be6563442a0de6396ffe8,"[main, category, category]","[[], [], []]","[[], [""erkek ayakkabı""], [""erkek ayakkabı""]]","[0.0, 0.0, 0.0]","[0.0, 0.0, 0.0]"
1,00021ca56bfe2fb4f850ee1aff30f965de667af0499788...,"[2019-08-21 07:51:21, 2019-08-21 07:50:25, 201...",7670b27dcd2805736b5efb8e2ef06917,"[productDetail, productDetail, cart, productDe...","[0cfe73fe7d6e828ecdf0a34067e599cc, 5f82f9ecd32...","[[""seyahat american tourister"",""kabin boy vali...","[629.3, 681.8, 0.0, 751.8, 751.8]","[899.0, 974.0, 0.0, 1074.0, 1074.0]"
2,00028c4adf9526f8a8fc10dadc179e1422e322f05b9f9f...,"[2019-08-06 12:45:34, 2019-08-06 12:46:01, 201...",e6dab6690693ed4030012fd960646dc7,"[productDetail, productDetail, productDetail, ...","[01efa5eae8ef33c210a36ce0ee6f7b6d, 01efa5eae8e...","[[""kadın çanta"",""omuz askılı çanta""], [""kadın ...","[199.0, 199.0, 999.0, 999.0, 999.0, 999.0, 999...","[398.0, 398.0, 2220.0, 2220.0, 2220.0, 2220.0,..."
3,00030ec7df8cd2df5c9da141ab35498cafeaf42390672b...,"[2019-08-21 14:24:45, 2019-08-21 14:25:14, 201...",a7be251452218a849cbe05567de885d4,"[productDetail, productDetail, productDetail]","[0ec629815359e346a6794b909308c553, 0ec62981535...","[[""seyahat samsonite"",""çocuk""], [""seyahat sams...","[899.0, 899.0, 899.0]","[899.0, 899.0, 899.0]"
4,000420538c99fc4aa9e053540fa6ba96218c77274d8073...,"[2019-08-02 13:24:55, 2019-08-02 13:24:51, 201...",906bd8519f5e2244ab542823621940ce,"[productDetail, productDetail, productDetail]","[5b463ff68bf262af4eaa07efa3e7c2d6, b15b19b2ff0...","[[""seyahat samsonite"",""kadın çanta""], [""seyaha...","[699.0, 699.0, 699.0]","[699.0, 699.0, 699.0]"


simdi pagetype success olanlarin categorilerini sayalim, boylelikle hangi categoriler daha cok satilmis bir fikir edinebiliriz.

In [30]:
# Convert string representation of lists to actual lists

# Filter rows where 'success' is in 'pageType'
session_df['has_success'] = session_df['pageType'].apply(lambda x: 'success' in x)

# Only keep rows with success in pageType
success_sessions = session_df[session_df['has_success']]

# Explode the 'category' list to count each category separately
exploded_categories = success_sessions.explode('category')

# Count the occurrences of each category
category_success_count = exploded_categories['category'].value_counts().reset_index()

# Rename the columns for clarity
category_success_count.columns = ['category', 'success_count']

# Display the result
print("Categories that went to success and their counts:")
category_success_count.head(20)



Categories that went to success and their counts:


Unnamed: 0,category,success_count
0,[],3828
1,"[""kadın ayakkabi"",""sandalet""]",1936
2,"[""kadın ayakkabi"",""günlük ayakkabı""]",1762
3,"[""erkek ayakkabı"",""günlük ayakkabı""]",990
4,"[""kadın ayakkabi""]",861
5,"[""kadın ayakkabi"",""spor ayakkabı""]",571
6,"[""kadın çanta"",""omuz askılı çanta""]",533
7,"[""kadın aksesuar"",""deri cüzdan""]",406
8,"[""kadın ayakkabi"",""klasik ayakkabı""]",404
9,"[""kadın ayakkabi"",""topuklu ayakkabı""]",335


# OTHER PREPROCESSING

Category kolonunu listeye ceviriyorum

In [31]:
df['category'] = df['category'].apply(ast.literal_eval)
df.head()

Unnamed: 0,date,userId,sessionId,pageType,itemId,category,productPrice,oldProductPrice
0,2019-08-03 21:12:27,9c88dc8a43a48e2fcc648467971a17c2,1c50b964c487bfbc17945c636ec26c9189bca2739b2ea5...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",169.0,558.0
1,2019-08-04 18:27:44,2d0686f5e78772987a66197305a22925,b4d9b6249354494000a847615eeb0cd8e99165a6e3ff0e...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",169.0,558.0
2,2019-08-04 21:33:40,9c88dc8a43a48e2fcc648467971a17c2,5ebb9403eba9e14b4cb5184da53542ad27e680da2bd687...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",169.0,558.0
3,2019-08-05 06:34:26,4c59bebe04fad7eae9c5235acdb3201a,687a6d09b1dcaf07dd20a47901cb46f7d938ee097e45bf...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",169.0,558.0
4,2019-08-05 06:34:30,4c59bebe04fad7eae9c5235acdb3201a,687a6d09b1dcaf07dd20a47901cb46f7d938ee097e45bf...,productDetail,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",169.0,558.0


itemid-userid pair dataframe'i olusturuyorum

In [32]:
itemid_userid_df = df.copy()
itemid_userid_df = itemid_userid_df[['userId', 'itemId', 'category']]
itemid_userid_df['score'] = 0
itemid_userid_df.head()

Unnamed: 0,userId,itemId,category,score
0,9c88dc8a43a48e2fcc648467971a17c2,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0
1,2d0686f5e78772987a66197305a22925,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0
2,9c88dc8a43a48e2fcc648467971a17c2,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0
3,4c59bebe04fad7eae9c5235acdb3201a,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0
4,4c59bebe04fad7eae9c5235acdb3201a,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0


In [33]:


# itemId sütununu kontrol etme ve düzeltme
def process_item_id(item):
    if isinstance(item, str):
        if item.startswith('[') and item.endswith(']'):
            return literal_eval(item)
        elif item == '[]':
            return []
        else:
            return [item]
    return item

itemid_userid_df['itemId'] = itemid_userid_df['itemId'].apply(process_item_id)

# Boş itemId'leri içeren satırları filtreleme
itemid_userid_df = itemid_userid_df[itemid_userid_df['itemId'].apply(lambda x: len(x) > 0)]

# Çoklu itemId'leri ayrı satırlara genişletme
itemid_userid_df = itemid_userid_df.explode('itemId').reset_index(drop=True)

# Duplikateleri kaldırma
itemid_userid_df = itemid_userid_df.drop_duplicates(subset=['userId', 'itemId']).reset_index(drop=True)

# Sonucu kontrol etme
itemid_userid_df.head()

Unnamed: 0,userId,itemId,category,score
0,9c88dc8a43a48e2fcc648467971a17c2,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0
1,2d0686f5e78772987a66197305a22925,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0
2,4c59bebe04fad7eae9c5235acdb3201a,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0
3,95027d9638ccbe419a8dba9481539309,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0
4,7743a7f20fe4aac9e884ddd6b90d4243,0000e475c4e76a1f4a615f33bbba9e79,"[kadın ayakkabi, günlük ayakkabı]",0


simdi skorlari hesaplayacagim

ilk olarak pageType==category iken dogal olarak itemid gelmiyor fakat category geliyor, bu yuzden eger item ile ilgili category incelendiyse skor ekliyorum

In [34]:
df_only_category = df[df['pageType'] == 'category']
df_only_category.head()

Unnamed: 0,date,userId,sessionId,pageType,itemId,category,productPrice,oldProductPrice
196673,2019-08-01 00:00:06,107b0f3ae1fab10dcfcd4b98500dbc00,f23f9095d156d45e52d7cf69b7e2d6b7f8300834ffea57...,category,[],[erkek ayakkabı],0.0,0.0
196674,2019-08-01 00:00:12,b16b4503a784977a0d6029b1eb856ea4,dbbaf92d5b77b2595e20990deb08e1354ee146980ce1bb...,category,[],"[erkek ayakkabı, klasik ayakkabı]",0.0,0.0
196675,2019-08-01 00:01:35,ab50c365857d28ac346e457e78d0300d,341e7ca1327eaa3a94ce37c0e59119ba803628ff827ecb...,category,[],[derigiyim genel],0.0,0.0
196676,2019-08-01 00:03:06,34281575a7d7587d802f9e366a1c547e,2a6f40420b82deffded691561467ac159fca0347076d85...,category,[],"[kadın ayakkabi, sandalet]",0.0,0.0
196677,2019-08-01 00:03:39,7afbbbce279f4a4e58a540680572d380,91a06e81ed0831fc0d49a397de6f1a101e761daba8d154...,category,[],[seyahat samsonite],0.0,0.0


In [35]:
# Her kullanıcı ve kategori çiftinin kaç kez tekrarlandığını hesaplayalım
user_category_counts = df_only_category.explode('category').groupby(['userId', 'category']).size().reset_index(name='count')

# itemid_userid_df ile user_category_counts'ı birleştir
itemid_userid_df = itemid_userid_df.explode('category')
itemid_userid_df = itemid_userid_df.merge(user_category_counts, on=['userId', 'category'], how='left')

# Skorları güncelleme
itemid_userid_df['count'] = itemid_userid_df['count'].fillna(0)
itemid_userid_df['score'] += itemid_userid_df['count'].astype(int)

# Gereksiz sütunları kaldırma ve itemId'leri tekrar gruplama
itemid_userid_df = itemid_userid_df.drop(columns=['category', 'count']).drop_duplicates()
itemid_userid_df = itemid_userid_df.groupby(['userId', 'itemId']).sum().reset_index()

# Güncellenmiş veri çerçevesini gösterelim

itemid_userid_df.head()


Unnamed: 0,userId,itemId,score
0,0001d86ea81e6eef12cebaa1dcbdadc2,3fe466cbc67f4352be350f0c46bf2c2c,0
1,000a53fe09a2a3decd11b6b30d703b9c,1b9c9f89b863877545687dd4f2e60153,0
2,000a53fe09a2a3decd11b6b30d703b9c,1d5d28877bfd3f288be22468599e93f8,0
3,000a53fe09a2a3decd11b6b30d703b9c,6581b8a02f7c68b4a8bf794c16a0ac32,0
4,000a53fe09a2a3decd11b6b30d703b9c,7ce2ecaccdd217cb97c864701620461c,0


kullanicinin urune direk etkilesimlerini skorlandiriyorum

In [36]:
# pageType'lara göre skorları arttırma
page_type_scores = {
    'productDetail': 20,
    'cart': 100,
    'success': 500
}

# df'yi itemid_userid_df ile birleştirip pageType'a göre skorları güncelleyelim
df_filtered = df[df['pageType'].isin(page_type_scores.keys())]
df_filtered = df_filtered.merge(itemid_userid_df, on=['userId', 'itemId'], how='inner')

for page_type, score in page_type_scores.items():
    itemid_userid_df.loc[itemid_userid_df.index.isin(df_filtered[df_filtered['pageType'] == page_type].index), 'score'] += score

# Güncellenmiş veri çerçevesini gösterelim

itemid_userid_df.head()

Unnamed: 0,userId,itemId,score
0,0001d86ea81e6eef12cebaa1dcbdadc2,3fe466cbc67f4352be350f0c46bf2c2c,20
1,000a53fe09a2a3decd11b6b30d703b9c,1b9c9f89b863877545687dd4f2e60153,20
2,000a53fe09a2a3decd11b6b30d703b9c,1d5d28877bfd3f288be22468599e93f8,20
3,000a53fe09a2a3decd11b6b30d703b9c,6581b8a02f7c68b4a8bf794c16a0ac32,20
4,000a53fe09a2a3decd11b6b30d703b9c,7ce2ecaccdd217cb97c864701620461c,20


# MODEL SELECTION

In [37]:

def precision_at_k(r, k):
    """Precision at k"""
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

def recall_at_k(r, k, all_pos_items):
    """Recall at k"""
    r = np.asarray(r)[:k] != 0
    return np.sum(r) / len(all_pos_items)

def average_precision(r, k):
    """Average precision at k"""
    r = np.asarray(r)[:k] != 0
    out = [precision_at_k(r, i + 1) for i in range(k) if r[i]]
    if not out:
        return 0.
    return np.mean(out)

def mean_average_precision(rs, k):
    """Mean average precision at k"""
    return np.mean([average_precision(r, k) for r in rs])

def ndcg_at_k(r, k, method=1):
    """Normalized discounted cumulative gain (NDCG) at k"""
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            dcg = r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            dcg = np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
        idcg = np.sum(1. / np.log2(np.arange(2, r.size + 2)))
        return dcg / idcg
    return 0.


## SVD

In [38]:

# Label encoder
user_le = LabelEncoder()
item_le = LabelEncoder()

# Kullanıcı ve item id'leri sayısal değerlere dönüştürme
itemid_userid_df['userId_le'] = user_le.fit_transform(itemid_userid_df['userId'])
itemid_userid_df['itemId_le'] = item_le.fit_transform(itemid_userid_df['itemId'])

# Train-test split
train_df, test_df = train_test_split(itemid_userid_df, test_size=0.2, random_state=42)

# Tüm kullanıcı ve öğe setini almak
all_users = np.union1d(train_df['userId_le'], test_df['userId_le'])
all_items = np.union1d(train_df['itemId_le'], test_df['itemId_le'])

# Kullanıcı ve öğe sayıları
num_users = len(all_users)
num_items = len(all_items)

# Kullanıcı ve öğe haritalama
user_map = {user: i for i, user in enumerate(all_users)}
item_map = {item: i for i, item in enumerate(all_items)}

# Train seti için matris oluşturma (veri tipi float)
train_row = train_df['userId_le'].map(user_map).values
train_col = train_df['itemId_le'].map(item_map).values
train_data = train_df['score'].values.astype(np.float64)
train_matrix = coo_matrix((train_data, (train_row, train_col)), shape=(num_users, num_items)).tocsr()

# Test seti için matris oluşturma (veri tipi float)
test_row = test_df['userId_le'].map(user_map).values
test_col = test_df['itemId_le'].map(item_map).values
test_data = test_df['score'].values.astype(np.float64)
test_matrix = coo_matrix((test_data, (test_row, test_col)), shape=(num_users, num_items)).tocsr()

# SVD ile matrisi ayrıştırma
u, s, vt = svds(train_matrix, k=50)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

# Modeli değerlendirme fonksiyonu
def evaluate_svd_model(predictions, test_matrix, k=10):
    user_count = test_matrix.shape[0]
    precision, recall, map_, ndcg = 0, 0, 0, 0

    for user_id in range(user_count):
        # Get true items for the user
        true_items = test_matrix[user_id].indices
        if len(true_items) == 0:
            continue

        # Get predicted items for the user
        user_ratings = predictions[user_id]
        top_items = np.argsort(-user_ratings)[:k]

        # Calculate relevance scores
        relevance = np.in1d(top_items, true_items).astype(int)

        # Calculate metrics
        precision += precision_at_k(relevance, k)
        recall += recall_at_k(relevance, k, true_items)
        map_ += average_precision(relevance, k)
        ndcg += ndcg_at_k(relevance, k)

    user_count_with_recommendations = np.sum([len(test_matrix[user_id].indices) > 0 for user_id in range(user_count)])
    return {
        'precision': precision / user_count_with_recommendations,
        'recall': recall / user_count_with_recommendations,
        'map': map_ / user_count_with_recommendations,
        'ndcg': ndcg / user_count_with_recommendations
    }

# Modeli değerlendirme
results = evaluate_svd_model(X_pred, test_matrix, k=10)
print("SVD Evaluation Results:", results)


SVD Evaluation Results: {'precision': 0.013488333053550093, 'recall': 0.06716848745775168, 'map': 0.041420813004663944, 'ndcg': 0.014327353267304754}


## BayesianPersonalizedRanking

In [39]:


# Label encoder
user_le = LabelEncoder()
item_le = LabelEncoder()

# Kullanıcı ve item id'leri sayısal değerlere dönüştürme
itemid_userid_df['userId_le'] = user_le.fit_transform(itemid_userid_df['userId'])
itemid_userid_df['itemId_le'] = item_le.fit_transform(itemid_userid_df['itemId'])

# Train-test split
train_df, test_df = train_test_split(itemid_userid_df, test_size=0.2, random_state=42)

# Tüm kullanıcı ve öğe setini almak
all_users = np.union1d(train_df['userId_le'], test_df['userId_le'])
all_items = np.union1d(train_df['itemId_le'], test_df['itemId_le'])

# Kullanıcı ve öğe sayıları
num_users = len(all_users)
num_items = len(all_items)

# Kullanıcı ve öğe haritalama
user_map = {user: i for i, user in enumerate(all_users)}
item_map = {item: i for i, item in enumerate(all_items)}

# Train seti için matris oluşturma (veri tipi float)
train_row = train_df['userId_le'].map(user_map).values
train_col = train_df['itemId_le'].map(item_map).values
train_data = train_df['score'].values.astype(np.float64)
train_matrix = coo_matrix((train_data, (train_row, train_col)), shape=(num_users, num_items)).tocsr()

# Test seti için matris oluşturma (veri tipi float)
test_row = test_df['userId_le'].map(user_map).values
test_col = test_df['itemId_le'].map(item_map).values
test_data = test_df['score'].values.astype(np.float64)
test_matrix = coo_matrix((test_data, (test_row, test_col)), shape=(num_users, num_items)).tocsr()

# BPR modelini oluşturma ve eğitme
model = implicit.bpr.BayesianPersonalizedRanking(factors=50, learning_rate=0.01, regularization=0.1, iterations=100)
model.fit(train_matrix)

# Modeli değerlendirme fonksiyonu
def evaluate_bpr_model(model, train_matrix, test_matrix, k=10):
    user_count = train_matrix.shape[0]
    precision, recall, map_, ndcg = 0, 0, 0, 0

    # Get predictions for all users
    user_items = model.recommend_all(train_matrix, N=k)

    for user_id in range(user_count):
        # Get true items for the user
        if user_id >= test_matrix.shape[0]:
            continue
        true_items = test_matrix[user_id].indices
        if len(true_items) == 0:
            continue

        # Get predicted items for the user
        predicted_items = user_items[user_id]

        # Calculate relevance scores
        relevance = np.in1d(predicted_items, true_items).astype(int)

        # Calculate metrics
        precision += precision_at_k(relevance, k)
        recall += recall_at_k(relevance, k, true_items)
        map_ += average_precision(relevance, k)
        ndcg += ndcg_at_k(relevance, k)

    user_count_with_recommendations = np.sum([len(test_matrix[user_id].indices) > 0 for user_id in range(user_count)])
    return {
        'precision': precision / user_count_with_recommendations,
        'recall': recall / user_count_with_recommendations,
        'map': map_ / user_count_with_recommendations,
        'ndcg': ndcg / user_count_with_recommendations
    }

# Modeli değerlendirme
results = evaluate_bpr_model(model, train_matrix, test_matrix, k=10)
print("BPR Evaluation Results:", results)


100%|██████████| 100/100 [00:02<00:00, 43.37it/s, train_auc=53.41%, skipped=1.29%]


BPR Evaluation Results: {'precision': 0.0011583011583011559, 'recall': 0.00422579747113909, 'map': 0.0035490792478370128, 'ndcg': 0.0012310090238375506}


## AlternatingLeastSquares

In [40]:


# Label encoder
user_le = LabelEncoder()
item_le = LabelEncoder()

# Kullanıcı ve item id'leri sayısal değerlere dönüştürme
itemid_userid_df['userId_le'] = user_le.fit_transform(itemid_userid_df['userId'])
itemid_userid_df['itemId_le'] = item_le.fit_transform(itemid_userid_df['itemId'])

# Train-test split
train_df, test_df = train_test_split(itemid_userid_df, test_size=0.2, random_state=42)

# Tüm kullanıcı ve öğe setini almak
all_users = np.union1d(train_df['userId_le'], test_df['userId_le'])
all_items = np.union1d(train_df['itemId_le'], test_df['itemId_le'])

# Kullanıcı ve öğe sayıları
num_users = len(all_users)
num_items = len(all_items)

# Kullanıcı ve öğe haritalama
user_map = {user: i for i, user in enumerate(all_users)}
item_map = {item: i for i, item in enumerate(all_items)}

# Train seti için matris oluşturma
train_row = train_df['userId_le'].map(user_map).values
train_col = train_df['itemId_le'].map(item_map).values
train_data = train_df['score'].values
train_matrix = coo_matrix((train_data, (train_row, train_col)), shape=(num_users, num_items)).tocsr()

# Test seti için matris oluşturma
test_row = test_df['userId_le'].map(user_map).values
test_col = test_df['itemId_le'].map(item_map).values
test_data = test_df['score'].values
test_matrix = coo_matrix((test_data, (test_row, test_col)), shape=(num_users, num_items)).tocsr()

# ALS modelini oluşturma ve eğitme
model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20, calculate_training_loss=True)
model.fit(train_matrix)


def evaluate_model(model, train_matrix, test_matrix, k=10):
    """Evaluate the model"""
    user_count = train_matrix.shape[0]
    precision, recall, map_, ndcg = 0, 0, 0, 0

    # Get predictions for all users
    user_items = model.recommend_all(train_matrix, N=k)
    
    for user_id in range(user_count):
        # Get true items for the user
        if user_id >= test_matrix.shape[0]:
            continue
        true_items = test_matrix[user_id].indices
        if len(true_items) == 0:
            continue

        # Get predicted items for the user
        predicted_items = user_items[user_id]

        # Calculate relevance scores
        relevance = np.in1d(predicted_items, true_items).astype(int)

        # Calculate metrics
        precision += precision_at_k(relevance, k)
        recall += recall_at_k(relevance, k, true_items)
        map_ += average_precision(relevance, k)
        ndcg += ndcg_at_k(relevance, k)

    user_count_with_recommendations = np.sum([len(test_matrix[user_id].indices) > 0 for user_id in range(user_count)])
    return {
        'precision': precision / user_count_with_recommendations,
        'recall': recall / user_count_with_recommendations,
        'map': map_ / user_count_with_recommendations,
        'ndcg': ndcg / user_count_with_recommendations
    }


# Modeli değerlendirme
results = evaluate_model(model, train_matrix, test_matrix, k=10)
print("Evaluation Results:", results)


  check_blas_config()
100%|██████████| 20/20 [00:12<00:00,  1.57it/s, loss=0.00407]


Evaluation Results: {'precision': 0.05930837669968601, 'recall': 0.302869757935464, 'map': 0.2246219849269978, 'ndcg': 0.07560038347051563}


### AlternatingLeastSquares diger modellere nazaran daha iyi sonuc verdigi icin simdi bu modeli tum veri ile kullanip kaydedecegim.

In [41]:



# Label encoder
user_le = LabelEncoder()
item_le = LabelEncoder()

# Kullanıcı ve item id'leri sayısal değerlere dönüştürme
itemid_userid_df['userId_le'] = user_le.fit_transform(itemid_userid_df['userId'])
itemid_userid_df['itemId_le'] = item_le.fit_transform(itemid_userid_df['itemId'])


# Train seti için matris oluşturma
train_matrix = coo_matrix((itemid_userid_df['score'].values, (itemid_userid_df['userId_le'].values, itemid_userid_df['itemId_le'].values)))
train_matrix_csr = train_matrix.tocsr()

# ALS modelini oluşturma ve eğitme
model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20, calculate_training_loss=True)
model.fit(train_matrix_csr)




100%|██████████| 20/20 [00:13<00:00,  1.48it/s, loss=0.00479]


In [42]:
user_id = "7670b27dcd2805736b5efb8e2ef06917"
user_id_le = user_le.transform([user_id])[0]
recommendations = model.recommend(user_id_le, train_matrix_csr[user_id_le], N=10)
print(recommendations)

(array([ 2694,  1829, 10005,  7681,    81,    57,  8547,   346,  4964,
        3484], dtype=int32), array([0.81407154, 0.7833661 , 0.7687343 , 0.7582731 , 0.72621644,
       0.68803245, 0.6803852 , 0.6726128 , 0.6705618 , 0.6308    ],
      dtype=float32))


In [43]:
# save model

with open('../models/implicit_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# save label encoders
with open('../models/user_le.pkl', 'wb') as f:
    pickle.dump(user_le, f)

with open('../models/item_le.pkl', 'wb') as f:
    pickle.dump(item_le, f)

# save train matrix

save_npz('../models/train_matrix.npz', train_matrix_csr)


