# Recommendation System

In [1]:
import pandas as pd
import numpy as np

# Load the interaction data used to build the model and the evaluation ground truth.
# NOTE(review): `test` is read from the same './train.csv' as `train`, so every metric
# computed against `test` below is measured on the training data itself. If a held-out
# file exists (e.g. './test.csv' — TODO confirm), this path is probably a copy-paste slip.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./train.csv')

In [2]:
# Both id columns are cast to integers so they can be used as matrix indices later on.
id_dtypes = {'user_id': int, 'product_id': int}
train = train.astype(id_dtypes)
test = test.astype(id_dtypes)

In [12]:
# One row per (user_id, distinct product_id) pair; user_id is the index.
train.groupby('user_id')['product_id'].unique().explode()

user_id
1.0          2953.0
1.0          7656.0
2.0         25774.0
3.0         18177.0
3.0         21364.0
             ...   
99996.0     28478.0
99998.0     16938.0
99998.0     19164.0
99999.0     25959.0
100000.0    10017.0
Name: product_id, Length: 109809, dtype: object

In [29]:
# Sparse-matrix coordinates (COO style): one (user, product) pair per distinct purchase.
# The groupby/unique/explode chain is evaluated once and shared by both arrays
# instead of being recomputed for each of them.
user_product_pairs = train.groupby('user_id').product_id.unique().explode()
col = user_product_pairs.values        # product_id of each pair -> column index
row = user_product_pairs.index.values  # user_id of each pair (index of the exploded Series)

In [4]:
# CSR row pointer: running total of distinct products per user, with a leading 0
# so that row i spans positions row_ind[i]:row_ind[i + 1].
distinct_products_per_user = train.groupby('user_id')['product_id'].nunique()
row_ind = np.concatenate(([0], distinct_products_per_user.cumsum().values))

In [8]:
from scipy.sparse import csr_matrix

# Build the matrix directly from CSR components (data, indices, indptr):
# every stored entry is 1.0, `col` supplies the column indices and `row_ind` the row pointers.
interaction_values = np.ones(len(col))
train_ = csr_matrix((interaction_values, col, row_ind))

In [164]:
# Inspect the column-index array that was passed to csr_matrix.
col

array([2953, 7656, 25774, ..., 19164, 25959, 10017], dtype=object)

In [163]:
# Sanity check: the matrix's stored column indices should match `col`.
train_.indices

array([ 2953,  7656, 25774, ..., 19164, 25959, 10017])

In [165]:
# Inspect the row-pointer array that was passed to csr_matrix.
row_ind

array([     0,      2,      3, ..., 109807, 109808, 109809])

In [145]:
# Sanity check: the matrix's row pointers should match `row_ind`.
train_.indptr

array([     0,      2,      3, ..., 109807, 109808, 109809])

In [149]:
# Sorted group keys of the user_id groupby; the position of a user in this Index
# is that user's row position in train_ (both come from the same groupby ordering).
user_list = train.groupby('user_id').size().index

In [150]:
# Inspect the user ids that have at least one row in train.
user_list

Index([     1,      2,      3,      4,      5,      7,      8,      9,     11,
           12,
       ...
        99985,  99986,  99988,  99989,  99990,  99993,  99996,  99998,  99999,
       100000],
      dtype='int64', name='user_id', length=76918)

In [11]:
# Extract each user's purchase history: user_id -> array of distinct product_ids.
purchases_by_user = test.groupby('user_id')['product_id'].unique()
gt_dict = purchases_by_user.to_dict()

In [131]:
# Product catalogue, indexed by product id so rows can be looked up with products.loc[...].
products = pd.read_csv('./Looker Ecommerce BigQuery Dataset/products.csv', index_col='id')

In [160]:
def ndcg(y_true, y_pred, k=100):
    """Compute binary-relevance NDCG@k for a single user.

    Parameters
    ----------
    y_true : iterable
        Item ids the user actually interacted with (the relevant items).
    y_pred : iterable
        Recommended item ids in ranked order, best first.
    k : int, default 100
        Rank cutoff. Only the first ``k`` entries of ``y_pred`` contribute to
        the DCG, matching the cutoff used for the ideal DCG.

    Returns
    -------
    float
        DCG@k divided by the ideal DCG@k. Returns 0.0 when there are no
        relevant items or ``k`` is not positive, instead of dividing by zero.
    """
    # A set gives constant-time membership tests and ignores duplicate ids.
    relevant = set(y_true)
    if not relevant or k <= 0:
        return 0.0

    # DCG at k: zip with range(k) stops after the first k predictions.
    dcg = sum(
        1.0 / np.log2(rank + 2)
        for rank, item in zip(range(k), y_pred)
        if item in relevant
    )

    # IDCG at k: the best achievable ranking puts every relevant item first.
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(min(len(relevant), k)))

    # NDCG at k
    return dcg / idcg

# ALS with the `implicit` library

In [207]:
#%pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-macosx_11_0_arm64.whl (761 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m761.4/761.4 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2
Note: you may need to restart the kernel to use updated packages.


- pivot 없이 csr 만들 수 있음!

In [10]:
import implicit

# Initialize an implicit-feedback ALS model with 50 latent factors.
# random_state pins the random factor initialisation so that a fresh
# Restart & Run All reproduces the same recommendations and metrics.
model = implicit.als.AlternatingLeastSquares(factors=50, random_state=42)

# Train the model on the sparse user/item matrix (stored values act as confidence weights).
model.fit(train_)

  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()
100%|██████████| 15/15 [00:34<00:00,  2.32s/it]


In [213]:
# Recommend items for one user.
# user_list positions are the row positions of train_, so the position of
# user_id 69611 in user_list is the index the model knows this user by.
# A length-1 index array keeps the call in batch form: ids/score come back 2-D.
user_idx = np.flatnonzero(user_list == 69611)
ids, score = model.recommend(
    user_idx,
    train_[user_idx],
    N=100,
    filter_already_liked_items=True,
)

In [214]:
# Column indices stored in this user's row of train_ (the products already in their history).
train_[user_idx].indices

array([16677, 21384, 19990, 26932, 22804, 19460, 17962, 19040, 16704,
       27392, 21577], dtype=int32)

In [215]:
# user_id: 69611
# Recommendations for this user with product names, scores, and a flag marking
# any recommended item that is already present in the user's row of train_.
recommended_ids = ids[0]
pd.DataFrame({
    "product_id": recommended_ids,
    "product_name": products.loc[recommended_ids]['name'],
    "recommend_score": score[0],
    # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
    "already_buyed": np.isin(recommended_ids, train_[user_idx].indices),
})

Unnamed: 0_level_0,product_id,product_name,recommend_score,already_buyed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
17112,17112,U.S. Polo Assn. Men's Hoody With Big Pony,0.159294,False
22447,22447,Original ratcatcher Moleskin Pants / Pleated L...,0.157544,False
24489,24489,STANCE Black Multi-Colored Polka Dot Mens Casu...,0.157512,False
23775,23775,ililily Slim-fit Varsity Jacket American Baseb...,0.145411,False
24753,24753,Dickies Men's Steel Toe Crew,0.143982,False
...,...,...,...,...
18219,18219,Carhartt Men's Big-Tall Midweight Hooded Sweat...,0.012573,False
22363,22363,Oneill Men's Contact Pant,0.012490,False
19243,19243,Allegra K Mens Deep V Neck Button Closure Casu...,0.012319,False
7793,7793,Bailey 44 Women's Gnarlatious Jacket,0.012240,False


In [220]:
# Load the users table to look up profile attributes of individual users.
users = pd.read_csv('./Looker Ecommerce BigQuery Dataset/users.csv')

In [221]:
# Profile row of the user whose recommendations were inspected above.
users.loc[users['id'] == 69611]

Unnamed: 0,id,first_name,last_name,email,age,gender,state,street_address,postal_code,city,country,latitude,longitude,traffic_source,created_at
34405,69611,John,Holland,johnholland@example.net,52,M,Guangdong,9511 James Track Suite 026,516029,Chengdu,China,23.006373,114.325225,Search,2020-03-09 09:39:00+00:00


In [183]:
# 52-year-old Chinese male; average payment 62.12, minimum 19.98, maximum 139.9, 12 orders.
# Distinct product names this user has in train:
train.loc[train['user_id'] == 69611, 'name'].unique()

array(['Mens Modena Solid Silver French Cuff Dress Shirt',
       'Walls Mens Work Fire Retardant Denim 5 Pocket Jeans FR',
       'DKNY Mens 2 Button Black Check Slim Fit Wool Sport Coat Jacket',
       "Majestic International Men's Patrician Shawl Robe",
       "Oakley Men's Represent Short",
       'Geoffrey Beene Super Soft Cardigan',
       "RetroFit Men's Long Sleeve Pullover Hoodie Sweater White & Blue",
       "Nautica Men's Milano Bold Stripe Sweater",
       "Icebreaker Men's Short Sleeve Crewe Top",
       'Blackwatch tartan plaid check tie cord flannel cotton pants for lounging sleep sports. Unisex relaxed fit',
       "Men's 34 inch Inseam Red Kap Loose Fit Denim Jeans"], dtype=object)

In [227]:
# Look up the train rows for this product name to find its product_id.
train.loc[train['name'] == 'Juicy Couture Womens Stripe Shorts Romper']

Unnamed: 0,user_id,product_id,status,age,gender,account_country,account_traffic_source,account_created_at,User_AvgPrice,User_MinPrice,User_MaxPrice,User_NumOrder,product_NumOrder,product_NumOrderUser,category,name,brand,department,product_name_tk
92456,16931,4140,Processing,66,F,Japan,Search,2024-01-16,72.83482,20.0,198.0,7,4,4,Jumpsuits & Rompers,Juicy Couture Womens Stripe Shorts Romper,Juicy Couture,Women,"['stripe', 'shorts']"


In [228]:
# Find the 100 items most related to product 4140 — the product_id of the
# 'Juicy Couture Womens Stripe Shorts Romper' row shown in the previous cell's output.
ids, score = model.similar_items(4140, N=100)

In [229]:
# Related items with their product names and similarity scores.
similar_items_columns = {
    "product_id": ids,
    "product_name": products.loc[ids]['name'],
    "related_score": score,
}
pd.DataFrame(similar_items_columns)

Unnamed: 0_level_0,product_id,product_name,related_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4140,4140,Juicy Couture Womens Stripe Shorts Romper,1.000000
15858,15858,collection eighteen Women's Super Ruffle Borde...,0.999973
3475,3475,KAMALIKULTURE Women's Long Sleeve Crewneck Max...,0.999486
7293,7293,Hard Tail A-line skirt (black),0.982009
11646,11646,Barely There Women's Barely There Flex To Fit/...,0.976617
...,...,...,...
7189,7189,Adar Mid-Calf Length Drawstring Skirt,0.849163
2528,2528,Champion Women's Seamless Fab And Fun Sports Bra,0.848978
11200,11200,Flexees Women's Decadence V-Neck Molded Slip,0.848604
10913,10913,Signature Lace Low Rise Thong 5-Pack,0.847870


In [158]:
# y_pred: collect the top-100 recommended product ids for every user, stored as a dict
# keyed by user_id. The enumerate position is the user's row in train_, which is the
# index the model expects. Already-seen items are deliberately not filtered out here.
pred_dict = {}
for user_row, user_id in enumerate(user_list):
    recommended, _ = model.recommend(
        user_row,
        train_[user_row],
        N=100,
        filter_already_liked_items=False,
    )
    pred_dict[user_id] = recommended

In [161]:
# Per-user NDCG of the ALS recommendations against the ground truth, in user_list order.
als_ndcg = [ndcg(gt_dict[uid], pred_dict[uid]) for uid in user_list]

In [162]:
# Mean NDCG over all users in user_list.
np.mean(als_ndcg)

0.06157373356329583