# New data prep

In [156]:
import pandas as pd
import numpy as np
#!pip install implisit
import implicit

In [157]:
data = pd.read_csv('transactions_train.csv')
art = pd.read_csv('articles.csv')

user_item=data[['customer_id','article_id']]
# users=data['customer_id']
items=art[['article_id','product_type_no','colour_group_code']]

#берем только тех users, кто сделал более 10 покупок
user_ids_more_10 = data.groupby(['customer_id'])['article_id'].size().pipe(lambda x: x.index[x>10])
final_data = user_item.query('customer_id in @user_ids_more_10')

#считаем, сколько раз был куплен товар
d1=final_data.groupby(['customer_id', 'article_id']).size()
data_to_surp = pd.DataFrame(d1).reset_index()  
data_to_surp.rename(columns={ data_to_surp.columns[2]: "ratings" },inplace=True)

def score(x):
    if x > 4:
        return 3
    elif x > 1:
        return 2
    elif x == 1:
        return 1
    else:
        return 0
    
data_to_surp['new_ratings']=data_to_surp.ratings.apply(score)
data_to_surp.drop('ratings', axis=1, inplace=True)

cut_data=data_to_surp#.loc[:8000000]

reduce train data because of computational cost

In [160]:
data_prepared_init = cut_data.copy().loc[:80000]

data_prepared = data_prepared_init.copy()
data_prepared

Unnamed: 0,customer_id,article_id,new_ratings
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023,1
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006,2
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,1
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,607642008,1
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,625548001,1
...,...,...,...
79996,00d385d381a59688ef1f9fbeef053cdc4a780f65a1f0ed...,672127004,1
79997,00d385d381a59688ef1f9fbeef053cdc4a780f65a1f0ed...,677299002,1
79998,00d385d381a59688ef1f9fbeef053cdc4a780f65a1f0ed...,683020004,1
79999,00d385d381a59688ef1f9fbeef053cdc4a780f65a1f0ed...,684209014,1


## Reindex data

In [161]:
customer_reindex_dict = {}
for i, user_id in enumerate(data_prepared['customer_id'].unique()):
  customer_reindex_dict[user_id] = i

article_reindex_dict = {}
for j, item_id in enumerate(data_prepared['article_id'].unique()):
  article_reindex_dict[item_id] = j

article_reindex_dict_inverse = {}
for j, item_id in enumerate(data_prepared['article_id'].unique()):
  article_reindex_dict_inverse[j] = item_id

In [165]:
for ind in data_prepared.index:
    data_prepared.at[ind, 'customer_id'] = customer_reindex_dict[data_prepared.loc[ind]['customer_id']]
    data_prepared.at[ind, 'article_id'] = article_reindex_dict[data_prepared.loc[ind]['article_id']]

In [168]:
data_prepared

Unnamed: 0,customer_id,article_id,new_ratings
0,0,0,1
1,0,1,2
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
79996,2057,10326,1
79997,2057,31486,1
79998,2057,11770,1
79999,2057,532,1


## Constructing matrix

In [169]:
from scipy.sparse import csr_matrix

m=csr_matrix((data_prepared['new_ratings'].values, (data_prepared['article_id'], data_prepared['customer_id'])))

In [170]:
from implicit.nearest_neighbours import bm25_weight

# weight the matrix, both to reduce impact of users that have purchased the same items thousands of times
# and to reduce the weight given to popular items
items_user_purchase = bm25_weight(m, K1=100, B=0.8)

# get the transpose since the most of the functions in implicit expect (user, item) sparse matrices instead of (item, user)
user_items = items_user_purchase.T.tocsr()

In [178]:
#user_items.shape
#data_prepared['customer_id'].nunique()

2058

## Training algorithms

In [179]:
from implicit.als import AlternatingLeastSquares

model_als = AlternatingLeastSquares(factors=64, regularization=0.05)
model_als.fit(user_items)

  0%|          | 0/15 [00:00<?, ?it/s]

In [180]:
from implicit.lmf import LogisticMatrixFactorization

model_lmf = LogisticMatrixFactorization(factors=64, regularization=0.05)
model_lmf.fit(user_items)

  0%|          | 0/30 [00:00<?, ?it/s]

In [181]:
from implicit.bpr import BayesianPersonalizedRanking 

model_bpr = BayesianPersonalizedRanking(factors=64, regularization=0.05)
model_bpr.fit(user_items)

  0%|          | 0/100 [00:00<?, ?it/s]

In [251]:
from implicit.nearest_neighbours import ItemItemRecommender 

model_knn = ItemItemRecommender(K=20, num_threads=-1)
model_knn.fit(user_items)

  0%|          | 0/31487 [00:00<?, ?it/s]

## Test data loading and predicting

In [252]:
data_test = pd.read_csv('sample_submission.csv')

i_no_drop = list(customer_reindex_dict.keys())

data_test_droped = data_test.copy()
data_test_droped = data_test_droped.drop(index = data_test_droped.query('customer_id != @i_no_drop').index)

for ind in data_test_droped.index:
    data_test_droped.at[ind, 'customer_id'] = customer_reindex_dict[data_test_droped.loc[ind]['customer_id']]

In [253]:
data_test_droped

Unnamed: 0,customer_id,prediction
0,0,0706016001 0706016002 0372860001 0610776002 07...
1,1,0706016001 0706016002 0372860001 0610776002 07...
2,2,0706016001 0706016002 0372860001 0610776002 07...
4,3,0706016001 0706016002 0372860001 0610776002 07...
7,4,0706016001 0706016002 0372860001 0610776002 07...
...,...,...
4385,2053,0706016001 0706016002 0372860001 0610776002 07...
4386,2054,0706016001 0706016002 0372860001 0610776002 07...
4388,2055,0706016001 0706016002 0372860001 0610776002 07...
4389,2056,0706016001 0706016002 0372860001 0610776002 07...


In [254]:
recoms_for_models = []
models = [model_als, model_lmf, model_bpr, model_knn]

for model in models:
    recom = []
    for userid in data_test_droped['customer_id']:

        ids, scores = model.recommend(userid, user_items[userid], N=12, filter_already_liked_items=False)
        id_s=[]
        for id_ in ids:
            id_s.append(article_reindex_dict_inverse[id_])
        recom.append(id_s) 
    recoms_for_models.append(recom)

### For users who are not in train recommend 12 most popular items

In [255]:
popularity = data_prepared_init.groupby('article_id')['new_ratings'].count()

popularity_dict = dict(popularity)
top_pop = sorted(popularity_dict.items(), key=lambda x:x[1], reverse=True)[:12]
top_pop
most_popular = []
for top in top_pop:
    most_popular.append(top[0])

In [256]:
most_popular

[706016001,
 372860001,
 706016002,
 610776002,
 464297007,
 759871002,
 372860002,
 673677002,
 706016003,
 751471001,
 562245046,
 673396002]

# Constructing csv

In [257]:
result_csv = pd.read_csv('sample_submission.csv')

# i = 0,1,2,3
# 0 - AlternatingLeastSquares
# 1 - LogisticMatrixFactorization
# 2 - BayesianPersonalizedRanking
# 3 - KNN

i=3
for ind in result_csv.index:
    user = result_csv.loc[ind]['customer_id']
    
    if user in list(customer_reindex_dict.keys()):
        user_reind = customer_reindex_dict[user]
        
        rec_str=''
        for item in list(recoms_for_models[i][user_reind]):
            rec_str += str(item)+' '
        
        result_csv.at[ind, 'prediction'] = rec_str
        
    else:
        rec_str=''
        for item in most_popular:
            rec_str += str(item)+' '
        
        result_csv.at[ind, 'prediction'] = rec_str

In [258]:
result_csv

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,887593002 859416011 176209023 607642008 890498...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,745006005 631902006 583558001 640251009 639677...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,870304002 851400006 578020002 750424014 852643...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,706016001 372860001 706016002 610776002 464297...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,896152002 698286003 677049001 634249005 707704...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,706016001 372860001 706016002 610776002 464297...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,706016001 372860001 706016002 610776002 464297...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,706016001 372860001 706016002 610776002 464297...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,706016001 372860001 706016002 610776002 464297...


In [259]:
result_csv.to_csv('KNN_20_corrected_new_data_preproc.csv', index=False)