# __Course project on recommendation systems__

### Importing modules and libraries

In [1]:
import pandas as pd
import numpy as np
import os, sys, itertools
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
warnings.filterwarnings('ignore')

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Для работы pyspark с JAVA.JDK и работы самого pyspark
os.environ["JAVA_HOME"] = "C:\Program Files\Java\jdk-19"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "\bin:" + os.environ["PATH"]
os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

# Для работы с pyspark
import pyspark.sql.functions as sf
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

from lightgbm import LGBMClassifier


# Импорт MainRecommender, prefilter_items, precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.metrics import precision_at_k, recall_at_k, reciprocal_rank_at_k, ndcg_at_k, ap_k
from src.recommenders import MainRecommender

### Uploading a train data set. Column processing. Train-validate split. Uploading a test data set.

In [2]:
data = pd.read_csv('../Lesson_2/retail_train.csv')
item_features = pd.read_csv('../Lesson_2/product.csv')
user_features = pd.read_csv('../Lesson_2/hh_demographic.csv')


# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train val split and test
test_size_weeks = 3
data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_val = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]
data_test = pd.read_csv('../Course_Project/retail_test.csv')

data_val = data_val[data_val['item_id'].isin(data_train['item_id'].unique())]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
result = data_val.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Pre-filtering of the training data set

In [4]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, 5000, item_features)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


## PySpark Collaborative Filtering with ALS (building a recommender system that utilises matrix factorisation technique — Alternating Least Squares)

In [5]:
session = (
        SparkSession.builder.config("spark.driver.memory", "4g")
        .config("spark.sql.shuffle.partitions", "100")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .config("spark.driver.host", "localhost")
        .master("local[*]")
        .enableHiveSupport()
        .getOrCreate()
    )

In [6]:
session

In [7]:
spark_data_train=session.createDataFrame(data_train[["user_id","item_id","quantity"]])

In [8]:
spark_data_train= spark_data_train.withColumnRenamed("quantity","relevance")

In [9]:
spark_data_train.show(10)

+-------+-------+---------+
|user_id|item_id|relevance|
+-------+-------+---------+
|   2375|1085983|        1|
|   1364| 999999|        1|
|   1364| 999999|        1|
|   1364| 999999|        1|
|   1364| 937406|        1|
|   1172| 999999|        1|
|   1172| 999999|        1|
|   1172|1000493|        1|
|   1172| 999999|        1|
|   1172|1075214|        1|
+-------+-------+---------+
only showing top 10 rows



In [10]:
model = ALS(
            rank=30,
            userCol="user_id",
            itemCol="item_id",
            ratingCol="relevance",
            maxIter = 10,
            alpha = 1.0,
            regParam = 0.1,
            implicitPrefs=True,
            seed=42,
            coldStartStrategy="drop",
        ).fit(spark_data_train)

In [11]:
model.userFactors.show()

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[-0.083326, -0.39...|
| 20|[0.018956583, -0....|
| 30|[-0.1255144, -0.3...|
| 40|[-0.059964012, -0...|
| 50|[-0.10513571, -0....|
| 60|[-0.12480895, -0....|
| 70|[-0.046732277, -0...|
| 80|[-0.19498536, -0....|
| 90|[-0.06439288, -0....|
|100|[-0.11894906, -0....|
|110|[-0.12321144, -0....|
|120|[-0.108534805, -0...|
|130|[-0.15898412, -0....|
|140|[-0.202417, -0.45...|
|150|[-0.06639387, -0....|
|160|[-0.13902025, -0....|
|170|[-0.07212223, -0....|
|180|[-0.03326662, -0....|
|190|[-0.0904128, -0.4...|
|200|[-0.16828944, -0....|
+---+--------------------+
only showing top 20 rows



In [12]:
recs_als = model.recommendForAllUsers(6)

In [13]:
recs_als.count()

2497

In [14]:
recs_als.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      4|[{999999, 1.47515...|
|      7|[{999999, 1.80323...|
|      8|[{999999, 1.77219...|
|     23|[{999999, 1.78213...|
|     26|[{999999, 1.57035...|
|     27|[{963686, 1.73339...|
|     28|[{999999, 1.77852...|
|     31|[{999999, 1.74920...|
|     34|[{999999, 1.35184...|
|     39|[{999999, 1.74899...|
|     44|[{999999, 1.52944...|
|     49|[{999999, 1.77759...|
|     51|[{999999, 1.69728...|
|     53|[{999999, 1.79796...|
|     55|[{999999, 1.64984...|
|     59|[{999999, 1.61712...|
|     63|[{999999, 1.62613...|
|     65|[{999999, 1.80920...|
|     69|[{999999, 1.66812...|
|     76|[{999999, 1.36990...|
+-------+--------------------+
only showing top 20 rows



In [15]:
recs_als=(recs_als.withColumn(
                    "recommendations", sf.explode("recommendations")
                )
                .withColumn("item_id", sf.col("recommendations.item_id"))
                .withColumn(
                    "relevance",
                    sf.col("recommendations.rating").cast(DoubleType()),
                )
                .select("user_id", "item_id", "relevance")
            )

In [16]:
recs_als.show()

+-------+-------+------------------+
|user_id|item_id|         relevance|
+-------+-------+------------------+
|      4| 999999|1.4751536846160889|
|      4|1029743|1.1661087274551392|
|      4|1106523|1.0199055671691895|
|      4|5569230|0.9501182436943054|
|      4|1075368|0.8939072489738464|
|      4| 916122|0.7827808856964111|
|      7| 999999|1.8032310009002686|
|      7|1029743|1.4123601913452148|
|      7|1106523| 1.285283088684082|
|      7| 916122|1.0510060787200928|
|      7|1126899| 0.986077606678009|
|      7| 866211|0.9067664742469788|
|      8| 999999|1.7721980810165405|
|      8|1029743|1.4981247186660767|
|      8|1106523|1.3509777784347534|
|      8| 916122| 1.225075602531433|
|      8|5569230|1.1809536218643188|
|      8| 844179|1.1787968873977661|
|     23| 999999|1.7821383476257324|
|     23|1029743|1.3663817644119263|
+-------+-------+------------------+
only showing top 20 rows



In [17]:
recs_all = pd.concat([recs_als.toPandas().groupby('user_id')['relevance'].unique().reset_index(), 
                        pd.DataFrame(pd.DataFrame(recs_als.toPandas().groupby('user_id')['item_id'].
                                                  unique().reset_index()['item_id'].values.tolist()).
                                     drop(0, axis=1).apply(lambda x: list(x), axis=1), columns=['rec_item'])], 
                       axis=1).reindex(columns=['user_id', 'rec_item', 'relevance'])

In [18]:
recs_all

Unnamed: 0,user_id,rec_item,relevance
0,1,"[856942, 1029743, 940947, 1070820, 5569374]","[1.7541471719741821, 1.4876700639724731, 1.381..."
1,2,"[1029743, 1106523, 5569230, 916122, 844179]","[1.7930527925491333, 1.451933741569519, 1.3498..."
2,3,"[1106523, 1029743, 1044078, 5569230, 844179]","[1.5328656435012817, 1.359671950340271, 1.3574..."
3,4,"[1029743, 1106523, 5569230, 1075368, 916122]","[1.4751536846160889, 1.1661087274551392, 1.019..."
4,5,"[1029743, 1106523, 916122, 5569230, 1126899]","[1.7350863218307495, 1.2427645921707153, 1.060..."
...,...,...,...
2492,2496,"[1029743, 1106523, 844179, 1044078, 916122]","[1.748368263244629, 1.5411715507507324, 1.4686..."
2493,2497,"[1029743, 1106523, 5569230, 899624, 5569471]","[1.7116997241973877, 1.587199091911316, 1.4717..."
2494,2498,"[1029743, 1106523, 916122, 1070820, 5569230]","[1.7717114686965942, 1.3768599033355713, 1.147..."
2495,2499,"[1029743, 1106523, 5569230, 1044078, 916122]","[1.775635838508606, 1.511269211769104, 1.39544..."


In [19]:
result = pd.merge(result, recs_all, on='user_id')

In [20]:
result

Unnamed: 0,user_id,actual,rec_item,relevance
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 1029743, 940947, 1070820, 5569374]","[1.7541471719741821, 1.4876700639724731, 1.381..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1106523, 1029743, 1044078, 5569230, 844179]","[1.5328656435012817, 1.359671950340271, 1.3574..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1026118, 1029743, 854852, 878996, 1106523]","[1.784917950630188, 1.494553565979004, 1.46915..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1029743, 1106523, 916122, 1126899, 866211]","[1.8032310009002686, 1.4123601913452148, 1.285..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1029743, 1106523, 916122, 5569230, 844179]","[1.7721980810165405, 1.4981247186660767, 1.350..."
...,...,...,...,...
2034,2496,[6534178],"[1029743, 1106523, 844179, 1044078, 916122]","[1.748368263244629, 1.5411715507507324, 1.4686..."
2035,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[1029743, 1106523, 5569230, 899624, 5569471]","[1.7116997241973877, 1.587199091911316, 1.4717..."
2036,2498,"[834484, 901776, 914190, 958382, 972437, 10398...","[1029743, 1106523, 916122, 1070820, 5569230]","[1.7717114686965942, 1.3768599033355713, 1.147..."
2037,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[1029743, 1106523, 5569230, 1044078, 916122]","[1.775635838508606, 1.511269211769104, 1.39544..."


In [21]:
print(f'map@k = {result.apply(lambda row: ap_k(row["rec_item"], row["actual"], k=5), axis=1).mean()}\n')

map@k = 0.22726826875919523



# $$map@k = 0.227$$

## Recommendation model based on algorithms, collaborative filtering for data sets with implicit feedback with performance optimization. Alternating Least Squares  using TFIDF or BM25 as a distance metric and Item-Item Nearest Neighbour models

In [22]:
%%time
data_train = prefilter_items(data, 5000, item_features)
recommender = MainRecommender(data_train)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: total: 12.3 s
Wall time: 6.49 s


### Validation and Grid Search

In [23]:
bm25_user_item_matrix = bm25_weight(recommender.user_item_matrix).tocsr()
tfidf_user_item_matrix = tfidf_weight(recommender.user_item_matrix).tocsr()

In [24]:
def grid_search(params, N=5):
    scores = []
    for factors, regularization, iterations, weight in itertools.product(*params.values()):
        
        recommender.model = AlternatingLeastSquares(factors=factors, 
                                                    regularization=regularization, 
                                                    iterations=iterations, 
                                                    calculate_training_loss=True, 
                                                    num_threads=4,
                                                    random_state=42)
        if weight == 'BM25':
            recommender.model.fit(csr_matrix(bm25_user_item_matrix).tocsr(), show_progress=True)
            result['als_bm25'] = result['user_id'].map(lambda x: recommender.get_als_recommendations(x, N))
            map_k = result.apply(lambda row: ap_k(row['als_bm25'], row['actual'], N), axis=1).mean()
        else:
            recommender.model.fit(csr_matrix(tfidf_user_item_matrix).tocsr(), show_progress=True)
            result['als_tfidf'] = result['user_id'].map(lambda x: recommender.get_als_recommendations(x, N))
            map_k = result.apply(lambda row: ap_k(row['als_tfidf'], row['actual'], N), axis=1).mean()
                
        score = {'factors':        factors,
                 'regularization': regularization,
                 'iterations':     iterations,
                 'weight':         weight,
                 'map@k':          map_k}
        print(score)
        scores.append(score)
    
    return scores

def best_params(scores, all_=False):
    df = pd.DataFrame(scores, index=range(len(scores))).sort_values('map@k', ascending=False)
    if all_:
        return df
    df = df.head(1)
    return df

### Getting the maximum score on the validation sample

In [25]:
%%time

parameters = {'factors':       [10, 50, 100, 150, 200, 350],
              'regularization':[0.05, 0.1, 0.5],
              'iterations':    [1, 5, 13, 15],
              'weight':        ['BM25', 'TFIDF']}

scores = grid_search(parameters, N=5)

  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.05, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.20652280529671369}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.05, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.19803825404610106}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.05, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2187837175085824}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.05, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.17479156449239827}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.05, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.2215792054928885}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.05, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.16434526728788615}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.05, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.2232466895537027}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.05, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.1626777832270721}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.1, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.20716037273173088}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.1, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.1979892103972536}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.1, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.21858754291319246}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.1, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.17302599313388917}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.1, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.22202059833251575}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.1, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.1615007356547327}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.1, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.22403138793526223}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.1, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.1624816086316822}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.5, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.20971064247179963}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.5, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.19303580186365854}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.5, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2211378126532611}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.5, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.1749386954389406}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.5, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.22368808239332988}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.5, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.16213830308974986}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.5, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.2214811181951935}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 10, 'regularization': 0.5, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.16267778322707208}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.05, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.2799901912702299}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.05, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.22780774889651773}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.05, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2666503187837171}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.05, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.2374693477194697}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.05, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.26081412457086767}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.05, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.23732221677292756}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.05, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.25850907307503634}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.05, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.23702795487984277}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.1, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.2821481118195186}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.1, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.22976949485041678}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.1, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.26689553702795454}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.1, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.23683178028445254}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.1, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.25978420794507073}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.1, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.23894065718489418}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.1, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.2601765571358505}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.1, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.23766552231485996}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.5, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.2825404610102985}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.5, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.231338891613536}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.5, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.26532614026483553}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.5, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.2365865620402152}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.5, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.26081412457086783}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.5, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.23751839136831743}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.5, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.26071603727317283}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 50, 'regularization': 0.5, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.23678273663560534}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.05, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.3161353604708182}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.05, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.295340853359489}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.05, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2951937224129469}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.05, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.26640510053947963}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.05, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.27611574301127934}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.05, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.2586562040215786}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.05, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.27493869543894006}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.05, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.26071603727317255}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.1, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.31539970573810605}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.1, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.29342815105443765}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.1, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2900931829328096}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.1, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.26650318783717464}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.1, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.2730259931338884}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.1, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.25983325159391796}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.1, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.2742030407062279}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.1, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.2606669936243251}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.5, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.31152525747915566}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.5, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.29244727807748816}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.5, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.28513977439921484}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.5, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.260519862677783}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.5, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.27351642962236333}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.5, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.25605689063266257}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.5, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.27179990191270176}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 100, 'regularization': 0.5, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.25772437469347664}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.05, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.31922511034820916}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.05, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.3199607650809211}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.05, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2824423737126038}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.05, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.2742520843550753}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.05, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.2693477194703282}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.05, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.2721432074546341}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.05, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.2676311917606664}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.05, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.2690534575772432}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.1, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.32618930848455024}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.1, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.32128494359980314}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.1, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2859735164296218}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.1, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.2746934771947028}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.1, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.26959293771456555}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.1, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.27140755272192196}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.1, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.2690044139283957}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.1, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.2694948504168704}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.5, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.32815105443844983}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.5, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.326679744973025}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.5, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2835213339872481}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.5, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.27145659637076924}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.5, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.27003433055419274}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.5, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.2676311917606666}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.5, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.26924963217263315}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.5, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.2652280529671402}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.05, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.327905836194212}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.05, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.319617459538989}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.05, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2689553702795482}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.05, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.27032859244727775}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.05, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.25404610102991615}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.05, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.2659637076998525}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.05, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.25262383521333936}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.05, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.2665522314860222}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.1, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.32354095144678696}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.1, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.3186365865620395}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.1, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2742520843550754}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.1, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.276164786660127}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.1, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.25899950956351103}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.1, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.2657184894556152}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.1, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.25703776360961234}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.1, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.267042667974497}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.5, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.3307994114762132}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.5, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.3152525747915642}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.5, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.27116233447768456}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.5, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.27106424717998984}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.5, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.26086316821971495}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.5, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.26115743011280007}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.5, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.25782246199117187}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 200, 'regularization': 0.5, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.2608141245708678}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.05, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.32422756253065177}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.05, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.3150564001961744}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.05, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.24678764100049033}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.05, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.2589504659146636}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.05, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.22525747915644923}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.05, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.2406571848945561}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.05, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.22358999509563499}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.05, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.2376164786660124}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.1, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.3281020107896021}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.1, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.3163315350662085}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.1, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.2458067680235408}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.1, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.2610102991662576}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.1, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.2238352133398724}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.1, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.2439431093673365}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.1, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.22241294752329566}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.1, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.2414909269249629}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.5, 'iterations': 1, 'weight': 'BM25', 'map@k': 0.32574791564492334}


  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.5, 'iterations': 1, 'weight': 'TFIDF', 'map@k': 0.3156449239823436}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.5, 'iterations': 5, 'weight': 'BM25', 'map@k': 0.23972535556645386}


  0%|          | 0/5 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.5, 'iterations': 5, 'weight': 'TFIDF', 'map@k': 0.2543894065718486}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.5, 'iterations': 13, 'weight': 'BM25', 'map@k': 0.23217263364394294}


  0%|          | 0/13 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.5, 'iterations': 13, 'weight': 'TFIDF', 'map@k': 0.23310446297204476}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.5, 'iterations': 15, 'weight': 'BM25', 'map@k': 0.2290338401177045}


  0%|          | 0/15 [00:00<?, ?it/s]

{'factors': 350, 'regularization': 0.5, 'iterations': 15, 'weight': 'TFIDF', 'map@k': 0.23246689553702746}
CPU times: total: 1h 8min 17s
Wall time: 11min 6s


In [26]:
best_params(scores, True)

Unnamed: 0,factors,regularization,iterations,weight,map@k
112,200,0.50,1,BM25,0.330799
88,150,0.50,1,BM25,0.328151
128,350,0.10,1,BM25,0.328102
96,200,0.05,1,BM25,0.327906
89,150,0.50,1,TFIDF,0.326680
...,...,...,...,...,...
7,10,0.05,15,TFIDF,0.162678
23,10,0.50,15,TFIDF,0.162678
15,10,0.10,15,TFIDF,0.162482
21,10,0.50,13,TFIDF,0.162138


In [27]:
best_score = best_params(scores)
best_score.to_csv('best_score.csv')

In [28]:
best_score = pd.read_csv('best_score.csv')
best_score.to_dict

<bound method DataFrame.to_dict of    Unnamed: 0  factors  regularization  iterations weight     map@k
0         112      200             0.5           1   BM25  0.330799>

In [29]:
data_test = data_test[data_test['user_id'].isin(data_train['user_id']) != False]
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]

In [30]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[883616, 940947, 959219, 965766, 976335, 97970..."
1,2,"[820291, 826835, 866211, 885023, 899624, 94094..."


### Мodel training on hyperparameters selected using grid search

In [31]:
%%time
recommender.model = AlternatingLeastSquares(factors=list(best_score['factors'])[0], 
                                regularization=list(best_score['regularization'])[0],
                                iterations=list(best_score['iterations'])[0],
                                calculate_training_loss=True, 
                                num_threads=4,
                                random_state=42)
if list(best_score['weight'])[0] == 'BM25':
    recommender.model.fit(csr_matrix(bm25_user_item_matrix).tocsr(), show_progress=True)
else:
    recommender.model.fit(csr_matrix(tfidf_user_item_matrix).tocsr(), show_progress=True) 

  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: total: 969 ms
Wall time: 217 ms


### Getting recommendations using a trained model

In [32]:
%%time
result['recommendations'] = result['user_id'].map(lambda x: recommender.get_als_recommendations(x, N=5))

CPU times: total: 19.7 s
Wall time: 2.99 s


### Getting the metric value for recommendations on a test dataset

In [33]:
%%time
result.apply(lambda row: ap_k(row["recommendations"], row["actual"], k=5), axis=1).mean()

CPU times: total: 312 ms
Wall time: 298 ms


0.25650452488687775

### Saving received recommendations for a test dataset

In [34]:
result.to_csv('recommendations.csv', sep=',', encoding='utf-8', header='true')

In [35]:
result

Unnamed: 0,user_id,actual,recommendations
0,1,"[883616, 940947, 959219, 965766, 976335, 97970...","[865456, 5577022, 856942, 940947, 5582712]"
1,2,"[820291, 826835, 866211, 885023, 899624, 94094...","[5569230, 1106523, 8090521, 916122, 1075368]"
2,3,"[989069, 1130858]","[1106523, 854405, 910032, 5569230, 998206]"
3,6,"[956902, 960791, 1037863, 1137688, 847738, 948...","[965267, 878996, 866211, 863447, 871611]"
4,7,"[859987, 930918, 954673, 957013, 993838, 99854...","[1122358, 1106523, 893018, 1029743, 1126899]"
...,...,...,...
1763,2494,"[880427, 894360, 1043301, 1135006, 1135476, 82...","[878996, 1029743, 5569471, 1127831, 6548453]"
1764,2496,"[829291, 912704, 933067, 933835, 979707, 99293...","[1070702, 12810393, 899624, 916122, 844179]"
1765,2498,"[920109, 1004945]","[1070820, 1106523, 835098, 1126899, 5569230]"
1766,2499,"[820321, 829291, 864857, 878996, 880150, 88230...","[5568378, 1070820, 1106523, 844179, 866211]"


# $$map@k = 0.2565$$