In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from scipy.sparse import coo_matrix, csr_matrix, save_npz
from scipy.sparse.linalg import svds
import implicit
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle
import ast

# make pandas show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#reading dataset from parquet file
df = pd.read_parquet('../datasets/train.parquet')
df.head()

Unnamed: 0,date,userId,sessionId,pageType,itemId,category,productPrice,oldProductPrice
0,2019-08-05 19:30:37,00172f1d9a71e9a8de0aa34288a6b19b,e8167c23f8ac2f9be979c32380e0fc2b7e94941e917d30...,productDetail,83472aea4051c00d031b01ff42ef73fc,"[""kadın çanta"",""omuz askılı çanta""]",622.0,1220.0
1,2019-08-31 16:53:55,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,category,[],"[""seyahat samsonite"",""laptop çantası""]",,
2,2019-08-31 16:53:29,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,main,[],[],,
3,2019-08-31 16:53:43,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,category,[],"[""seyahat samsonite"",""laptop çantası""]",,
4,2019-08-31 16:54:13,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,productDetail,d6afa22ab475d41e7dc9b721f3f795ad,"[""seyahat samsonite"",""laptop çantası""]",389.0,389.0


Category kolonunu listeye ceviriyorum

In [3]:
df['category'] = df['category'].apply(ast.literal_eval)
df.head()

Unnamed: 0,date,userId,sessionId,pageType,itemId,category,productPrice,oldProductPrice
0,2019-08-05 19:30:37,00172f1d9a71e9a8de0aa34288a6b19b,e8167c23f8ac2f9be979c32380e0fc2b7e94941e917d30...,productDetail,83472aea4051c00d031b01ff42ef73fc,"[kadın çanta, omuz askılı çanta]",622.0,1220.0
1,2019-08-31 16:53:55,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,category,[],"[seyahat samsonite, laptop çantası]",,
2,2019-08-31 16:53:29,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,main,[],[],,
3,2019-08-31 16:53:43,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,category,[],"[seyahat samsonite, laptop çantası]",,
4,2019-08-31 16:54:13,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,productDetail,d6afa22ab475d41e7dc9b721f3f795ad,"[seyahat samsonite, laptop çantası]",389.0,389.0


itemid-userid pair dataframe'i olusturuyorum

In [4]:
itemid_userid_df = df.copy()
itemid_userid_df = itemid_userid_df[['userId', 'itemId', 'category']]
itemid_userid_df['score'] = 0
itemid_userid_df.head()

Unnamed: 0,userId,itemId,category,score
0,00172f1d9a71e9a8de0aa34288a6b19b,83472aea4051c00d031b01ff42ef73fc,"[kadın çanta, omuz askılı çanta]",0
1,00172f1d9a71e9a8de0aa34288a6b19b,[],"[seyahat samsonite, laptop çantası]",0
2,00172f1d9a71e9a8de0aa34288a6b19b,[],[],0
3,00172f1d9a71e9a8de0aa34288a6b19b,[],"[seyahat samsonite, laptop çantası]",0
4,00172f1d9a71e9a8de0aa34288a6b19b,d6afa22ab475d41e7dc9b721f3f795ad,"[seyahat samsonite, laptop çantası]",0


In [5]:


# itemId sütununu kontrol etme ve düzeltme
def process_item_id(item):
    if isinstance(item, str):
        if item.startswith('[') and item.endswith(']'):
            return literal_eval(item)
        elif item == '[]':
            return []
        else:
            return [item]
    return item

itemid_userid_df['itemId'] = itemid_userid_df['itemId'].apply(process_item_id)

# Boş itemId'leri içeren satırları filtreleme
itemid_userid_df = itemid_userid_df[itemid_userid_df['itemId'].apply(lambda x: len(x) > 0)]

# Çoklu itemId'leri ayrı satırlara genişletme
itemid_userid_df = itemid_userid_df.explode('itemId').reset_index(drop=True)

# Duplikateleri kaldırma
itemid_userid_df = itemid_userid_df.drop_duplicates(subset=['userId', 'itemId']).reset_index(drop=True)

# Sonucu kontrol etme
itemid_userid_df.head()

Unnamed: 0,userId,itemId,category,score
0,00172f1d9a71e9a8de0aa34288a6b19b,83472aea4051c00d031b01ff42ef73fc,"[kadın çanta, omuz askılı çanta]",0
1,00172f1d9a71e9a8de0aa34288a6b19b,d6afa22ab475d41e7dc9b721f3f795ad,"[seyahat samsonite, laptop çantası]",0
2,02912533de5da26ffac47a2cbb31d2f3,1d84ddc6c6224402a845c0b5c684335b,"[erkek ayakkabı, spor ayakkabı]",0
3,02912533de5da26ffac47a2cbb31d2f3,2a411dd5f3ffb793a160235d5eb4a881,"[erkek ayakkabı, günlük ayakkabı]",0
4,02912533de5da26ffac47a2cbb31d2f3,9197e3dfdf3da36a2f55c5bc9300528e,"[erkek ayakkabı, günlük ayakkabı]",0


simdi skorlari hesaplayacagim

ilk olarak pageType==category iken dogal olarak itemid gelmiyor fakat category geliyor, bu yuzden eger item ile ilgili category incelendiyse skor ekliyorum

In [6]:
df_only_category = df[df['pageType'] == 'category']
df_only_category.head()

Unnamed: 0,date,userId,sessionId,pageType,itemId,category,productPrice,oldProductPrice
1,2019-08-31 16:53:55,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,category,[],"[seyahat samsonite, laptop çantası]",,
3,2019-08-31 16:53:43,00172f1d9a71e9a8de0aa34288a6b19b,c7f54acdf56e2d7539ffa59107b9017c2a8164495df909...,category,[],"[seyahat samsonite, laptop çantası]",,
5,2019-08-21 12:59:24,02912533de5da26ffac47a2cbb31d2f3,9b5910ad23389740691e1e26c2507debd44715861cd57f...,category,[],"[erkek ayakkabı, günlük ayakkabı]",,
6,2019-08-21 12:58:17,02912533de5da26ffac47a2cbb31d2f3,9b5910ad23389740691e1e26c2507debd44715861cd57f...,category,[],[erkek ayakkabı],,
7,2019-08-21 12:59:57,02912533de5da26ffac47a2cbb31d2f3,9b5910ad23389740691e1e26c2507debd44715861cd57f...,category,[],"[erkek ayakkabı, günlük ayakkabı]",,


In [7]:
# Her kullanıcı ve kategori çiftinin kaç kez tekrarlandığını hesaplayalım
user_category_counts = df_only_category.explode('category').groupby(['userId', 'category']).size().reset_index(name='count')

# itemid_userid_df ile user_category_counts'ı birleştir
itemid_userid_df = itemid_userid_df.explode('category')
itemid_userid_df = itemid_userid_df.merge(user_category_counts, on=['userId', 'category'], how='left')

# Skorları güncelleme
itemid_userid_df['count'] = itemid_userid_df['count'].fillna(0)
itemid_userid_df['score'] += itemid_userid_df['count'].astype(int)

# Gereksiz sütunları kaldırma ve itemId'leri tekrar gruplama
itemid_userid_df = itemid_userid_df.drop(columns=['category', 'count']).drop_duplicates()
itemid_userid_df = itemid_userid_df.groupby(['userId', 'itemId']).sum().reset_index()

# Güncellenmiş veri çerçevesini gösterelim

itemid_userid_df.head()


Unnamed: 0,userId,itemId,score
0,0001d86ea81e6eef12cebaa1dcbdadc2,3fe466cbc67f4352be350f0c46bf2c2c,0
1,000a53fe09a2a3decd11b6b30d703b9c,1b9c9f89b863877545687dd4f2e60153,0
2,000a53fe09a2a3decd11b6b30d703b9c,1d5d28877bfd3f288be22468599e93f8,0
3,000a53fe09a2a3decd11b6b30d703b9c,6581b8a02f7c68b4a8bf794c16a0ac32,0
4,000a53fe09a2a3decd11b6b30d703b9c,7ce2ecaccdd217cb97c864701620461c,0


kullanicinin urune direk etkilesimlerini skorlandiriyorum

In [8]:
# pageType'lara göre skorları arttırma
page_type_scores = {
    'productDetail': 20,
    'cart': 100,
    'success': 500
}

# df'yi itemid_userid_df ile birleştirip pageType'a göre skorları güncelleyelim
df_filtered = df[df['pageType'].isin(page_type_scores.keys())]
df_filtered = df_filtered.merge(itemid_userid_df, on=['userId', 'itemId'], how='inner')

for page_type, score in page_type_scores.items():
    itemid_userid_df.loc[itemid_userid_df.index.isin(df_filtered[df_filtered['pageType'] == page_type].index), 'score'] += score

# Güncellenmiş veri çerçevesini gösterelim

itemid_userid_df.head()

Unnamed: 0,userId,itemId,score
0,0001d86ea81e6eef12cebaa1dcbdadc2,3fe466cbc67f4352be350f0c46bf2c2c,20
1,000a53fe09a2a3decd11b6b30d703b9c,1b9c9f89b863877545687dd4f2e60153,20
2,000a53fe09a2a3decd11b6b30d703b9c,1d5d28877bfd3f288be22468599e93f8,20
3,000a53fe09a2a3decd11b6b30d703b9c,6581b8a02f7c68b4a8bf794c16a0ac32,20
4,000a53fe09a2a3decd11b6b30d703b9c,7ce2ecaccdd217cb97c864701620461c,20


# MODEL SELECTION

In [9]:

def precision_at_k(r, k):
    """Precision at k"""
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

def recall_at_k(r, k, all_pos_items):
    """Recall at k"""
    r = np.asarray(r)[:k] != 0
    return np.sum(r) / len(all_pos_items)

def average_precision(r, k):
    """Average precision at k"""
    r = np.asarray(r)[:k] != 0
    out = [precision_at_k(r, i + 1) for i in range(k) if r[i]]
    if not out:
        return 0.
    return np.mean(out)

def mean_average_precision(rs, k):
    """Mean average precision at k"""
    return np.mean([average_precision(r, k) for r in rs])

def ndcg_at_k(r, k, method=1):
    """Normalized discounted cumulative gain (NDCG) at k"""
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            dcg = r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            dcg = np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
        idcg = np.sum(1. / np.log2(np.arange(2, r.size + 2)))
        return dcg / idcg
    return 0.


## SVD

In [10]:

# Label encoder
user_le = LabelEncoder()
item_le = LabelEncoder()

# Kullanıcı ve item id'leri sayısal değerlere dönüştürme
itemid_userid_df['userId_le'] = user_le.fit_transform(itemid_userid_df['userId'])
itemid_userid_df['itemId_le'] = item_le.fit_transform(itemid_userid_df['itemId'])

# Train-test split
train_df, test_df = train_test_split(itemid_userid_df, test_size=0.2, random_state=42)

# Tüm kullanıcı ve öğe setini almak
all_users = np.union1d(train_df['userId_le'], test_df['userId_le'])
all_items = np.union1d(train_df['itemId_le'], test_df['itemId_le'])

# Kullanıcı ve öğe sayıları
num_users = len(all_users)
num_items = len(all_items)

# Kullanıcı ve öğe haritalama
user_map = {user: i for i, user in enumerate(all_users)}
item_map = {item: i for i, item in enumerate(all_items)}

# Train seti için matris oluşturma (veri tipi float)
train_row = train_df['userId_le'].map(user_map).values
train_col = train_df['itemId_le'].map(item_map).values
train_data = train_df['score'].values.astype(np.float64)
train_matrix = coo_matrix((train_data, (train_row, train_col)), shape=(num_users, num_items)).tocsr()

# Test seti için matris oluşturma (veri tipi float)
test_row = test_df['userId_le'].map(user_map).values
test_col = test_df['itemId_le'].map(item_map).values
test_data = test_df['score'].values.astype(np.float64)
test_matrix = coo_matrix((test_data, (test_row, test_col)), shape=(num_users, num_items)).tocsr()

# SVD ile matrisi ayrıştırma
u, s, vt = svds(train_matrix, k=50)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

# Modeli değerlendirme fonksiyonu
def evaluate_svd_model(predictions, test_matrix, k=10):
    user_count = test_matrix.shape[0]
    precision, recall, map_, ndcg = 0, 0, 0, 0

    for user_id in range(user_count):
        # Get true items for the user
        true_items = test_matrix[user_id].indices
        if len(true_items) == 0:
            continue

        # Get predicted items for the user
        user_ratings = predictions[user_id]
        top_items = np.argsort(-user_ratings)[:k]

        # Calculate relevance scores
        relevance = np.in1d(top_items, true_items).astype(int)

        # Calculate metrics
        precision += precision_at_k(relevance, k)
        recall += recall_at_k(relevance, k, true_items)
        map_ += average_precision(relevance, k)
        ndcg += ndcg_at_k(relevance, k)

    user_count_with_recommendations = np.sum([len(test_matrix[user_id].indices) > 0 for user_id in range(user_count)])
    return {
        'precision': precision / user_count_with_recommendations,
        'recall': recall / user_count_with_recommendations,
        'map': map_ / user_count_with_recommendations,
        'ndcg': ndcg / user_count_with_recommendations
    }

# Modeli değerlendirme
results = evaluate_svd_model(X_pred, test_matrix, k=10)
print("SVD Evaluation Results:", results)


SVD Evaluation Results: {'precision': 0.013435700575815395, 'recall': 0.06733554794309383, 'map': 0.03886342291791674, 'ndcg': 0.013893162069084267}


## BayesianPersonalizedRanking

In [11]:


# Label encoder
user_le = LabelEncoder()
item_le = LabelEncoder()

# Kullanıcı ve item id'leri sayısal değerlere dönüştürme
itemid_userid_df['userId_le'] = user_le.fit_transform(itemid_userid_df['userId'])
itemid_userid_df['itemId_le'] = item_le.fit_transform(itemid_userid_df['itemId'])

# Train-test split
train_df, test_df = train_test_split(itemid_userid_df, test_size=0.2, random_state=42)

# Tüm kullanıcı ve öğe setini almak
all_users = np.union1d(train_df['userId_le'], test_df['userId_le'])
all_items = np.union1d(train_df['itemId_le'], test_df['itemId_le'])

# Kullanıcı ve öğe sayıları
num_users = len(all_users)
num_items = len(all_items)

# Kullanıcı ve öğe haritalama
user_map = {user: i for i, user in enumerate(all_users)}
item_map = {item: i for i, item in enumerate(all_items)}

# Train seti için matris oluşturma (veri tipi float)
train_row = train_df['userId_le'].map(user_map).values
train_col = train_df['itemId_le'].map(item_map).values
train_data = train_df['score'].values.astype(np.float64)
train_matrix = coo_matrix((train_data, (train_row, train_col)), shape=(num_users, num_items)).tocsr()

# Test seti için matris oluşturma (veri tipi float)
test_row = test_df['userId_le'].map(user_map).values
test_col = test_df['itemId_le'].map(item_map).values
test_data = test_df['score'].values.astype(np.float64)
test_matrix = coo_matrix((test_data, (test_row, test_col)), shape=(num_users, num_items)).tocsr()

# BPR modelini oluşturma ve eğitme
model = implicit.bpr.BayesianPersonalizedRanking(factors=50, learning_rate=0.01, regularization=0.1, iterations=100)
model.fit(train_matrix)

# Modeli değerlendirme fonksiyonu
def evaluate_bpr_model(model, train_matrix, test_matrix, k=10):
    user_count = train_matrix.shape[0]
    precision, recall, map_, ndcg = 0, 0, 0, 0

    # Get predictions for all users
    user_items = model.recommend_all(train_matrix, N=k)

    for user_id in range(user_count):
        # Get true items for the user
        if user_id >= test_matrix.shape[0]:
            continue
        true_items = test_matrix[user_id].indices
        if len(true_items) == 0:
            continue

        # Get predicted items for the user
        predicted_items = user_items[user_id]

        # Calculate relevance scores
        relevance = np.in1d(predicted_items, true_items).astype(int)

        # Calculate metrics
        precision += precision_at_k(relevance, k)
        recall += recall_at_k(relevance, k, true_items)
        map_ += average_precision(relevance, k)
        ndcg += ndcg_at_k(relevance, k)

    user_count_with_recommendations = np.sum([len(test_matrix[user_id].indices) > 0 for user_id in range(user_count)])
    return {
        'precision': precision / user_count_with_recommendations,
        'recall': recall / user_count_with_recommendations,
        'map': map_ / user_count_with_recommendations,
        'ndcg': ndcg / user_count_with_recommendations
    }

# Modeli değerlendirme
results = evaluate_bpr_model(model, train_matrix, test_matrix, k=10)
print("BPR Evaluation Results:", results)


100%|██████████| 100/100 [00:02<00:00, 41.48it/s, train_auc=54.02%, skipped=1.31%]


BPR Evaluation Results: {'precision': 0.0008011349411666512, 'recall': 0.0023122500250603, 'map': 0.0022317992817867637, 'ndcg': 0.0008265384174353878}


## AlternatingLeastSquares

In [12]:


# Label encoder
user_le = LabelEncoder()
item_le = LabelEncoder()

# Kullanıcı ve item id'leri sayısal değerlere dönüştürme
itemid_userid_df['userId_le'] = user_le.fit_transform(itemid_userid_df['userId'])
itemid_userid_df['itemId_le'] = item_le.fit_transform(itemid_userid_df['itemId'])

# Train-test split
train_df, test_df = train_test_split(itemid_userid_df, test_size=0.2, random_state=42)

# Tüm kullanıcı ve öğe setini almak
all_users = np.union1d(train_df['userId_le'], test_df['userId_le'])
all_items = np.union1d(train_df['itemId_le'], test_df['itemId_le'])

# Kullanıcı ve öğe sayıları
num_users = len(all_users)
num_items = len(all_items)

# Kullanıcı ve öğe haritalama
user_map = {user: i for i, user in enumerate(all_users)}
item_map = {item: i for i, item in enumerate(all_items)}

# Train seti için matris oluşturma
train_row = train_df['userId_le'].map(user_map).values
train_col = train_df['itemId_le'].map(item_map).values
train_data = train_df['score'].values
train_matrix = coo_matrix((train_data, (train_row, train_col)), shape=(num_users, num_items)).tocsr()

# Test seti için matris oluşturma
test_row = test_df['userId_le'].map(user_map).values
test_col = test_df['itemId_le'].map(item_map).values
test_data = test_df['score'].values
test_matrix = coo_matrix((test_data, (test_row, test_col)), shape=(num_users, num_items)).tocsr()

# ALS modelini oluşturma ve eğitme
model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20, calculate_training_loss=True)
model.fit(train_matrix)


def evaluate_model(model, train_matrix, test_matrix, k=10):
    """Evaluate the model"""
    user_count = train_matrix.shape[0]
    precision, recall, map_, ndcg = 0, 0, 0, 0

    # Get predictions for all users
    user_items = model.recommend_all(train_matrix, N=k)
    
    for user_id in range(user_count):
        # Get true items for the user
        if user_id >= test_matrix.shape[0]:
            continue
        true_items = test_matrix[user_id].indices
        if len(true_items) == 0:
            continue

        # Get predicted items for the user
        predicted_items = user_items[user_id]

        # Calculate relevance scores
        relevance = np.in1d(predicted_items, true_items).astype(int)

        # Calculate metrics
        precision += precision_at_k(relevance, k)
        recall += recall_at_k(relevance, k, true_items)
        map_ += average_precision(relevance, k)
        ndcg += ndcg_at_k(relevance, k)

    user_count_with_recommendations = np.sum([len(test_matrix[user_id].indices) > 0 for user_id in range(user_count)])
    return {
        'precision': precision / user_count_with_recommendations,
        'recall': recall / user_count_with_recommendations,
        'map': map_ / user_count_with_recommendations,
        'ndcg': ndcg / user_count_with_recommendations
    }


# Modeli değerlendirme
results = evaluate_model(model, train_matrix, test_matrix, k=10)
print("Evaluation Results:", results)


  check_blas_config()
100%|██████████| 20/20 [00:09<00:00,  2.03it/s, loss=0.00407]


Evaluation Results: {'precision': 0.058958524576487974, 'recall': 0.3031938682443331, 'map': 0.22205922749762624, 'ndcg': 0.0750239684267244}


### AlternatingLeastSquares diger modellere nazaran daha iyi sonuc verdigi icin simdi bu modeli tum veri ile kullanip kaydedecegim.

In [13]:



# Label encoder
user_le = LabelEncoder()
item_le = LabelEncoder()

# Kullanıcı ve item id'leri sayısal değerlere dönüştürme
itemid_userid_df['userId_le'] = user_le.fit_transform(itemid_userid_df['userId'])
itemid_userid_df['itemId_le'] = item_le.fit_transform(itemid_userid_df['itemId'])


# Train seti için matris oluşturma
train_matrix = coo_matrix((itemid_userid_df['score'].values, (itemid_userid_df['userId_le'].values, itemid_userid_df['itemId_le'].values)))
train_matrix_csr = train_matrix.tocsr()

# ALS modelini oluşturma ve eğitme
model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20, calculate_training_loss=True)
model.fit(train_matrix_csr)




100%|██████████| 20/20 [00:10<00:00,  1.82it/s, loss=0.00478]


In [14]:
user_id = "7670b27dcd2805736b5efb8e2ef06917"
user_id_le = user_le.transform([user_id])[0]
recommendations = model.recommend(user_id_le, train_matrix_csr[user_id_le], N=10)
print(recommendations)

(array([ 2694,  1829,  7683, 10007,    81,  8549,    57,   346,  4965,
        5753], dtype=int32), array([0.84656054, 0.786134  , 0.77511185, 0.76166   , 0.74773   ,
       0.7383635 , 0.68076134, 0.67899585, 0.6456708 , 0.6448418 ],
      dtype=float32))


In [15]:
# save model

with open('../models/implicit_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# save label encoders
with open('../models/user_le.pkl', 'wb') as f:
    pickle.dump(user_le, f)

with open('../models/item_le.pkl', 'wb') as f:
    pickle.dump(item_le, f)

# save train matrix

save_npz('../models/train_matrix.npz', train_matrix_csr)


