# Experiments with models from RecTools

Generate mock data

In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta

# Set a random seed for reproducibility
np.random.seed(42)

# Generate a DataFrame with 100 rows
data = []
start_time = pd.Timestamp.now()
time_step = timedelta(minutes=1)

for i in range(100):
    array1 = np.random.randint(-10, 10, size=(2,))
    array2 = np.random.randint(-10, 10, size=(2,))
    label = np.random.choice([-1, 1], size=(1,))[0]
    timestamp = start_time + i * time_step
    data.append([i, i+100, label, timestamp, array1.tolist(), array2.tolist()])
    # data.append([i, i+100, label, timestamp])
    # data.append([array1.tolist(), array2.tolist(), label, timestamp])

# Create the DataFrame
columns = ['User', 'Item', 'weight', 'Timestamp', 'Array1', 'Array2']
df = pd.DataFrame(data, columns=columns)
df.to_csv('../data/temp.csv')
# Display the DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   User       100 non-null    int64         
 1   Item       100 non-null    int64         
 2   weight     100 non-null    int64         
 3   Timestamp  100 non-null    datetime64[ns]
 4   Array1     100 non-null    object        
 5   Array2     100 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 4.8+ KB
None


## Reading data into RecTools format

In [2]:
from rectools import Columns
from rectools.dataset import Dataset
import pandas as pd
from datetime import timedelta


quotes_labeled_reddit = pd.read_csv('../data/quotes_labeled_reddit.csv', index_col=0)
quotes_labeled_reddit = quotes_labeled_reddit.replace('\r\n','\n', regex=True)

diaries_labeled_reddit = pd.read_csv('../data/diaries_labeled_reddit.csv', index_col=0)

raw_interactions = pd.read_csv('../data/diaries_quotes_emb_twitter_interactions.csv')

In [130]:
rectools_data = []
rectools_diary_dense_features = []
rectools_quote_dense_features = []
rectools_diary_dense_fnames = None
rectools_quote_dense_fnames = None
diaries_index = set()
quotes_index = set()
start_time = pd.Timestamp.now()
time_step = timedelta(minutes=1)
for i, row in raw_interactions.iterrows():
    quote = quotes_labeled_reddit[quotes_labeled_reddit['Text'] == row['Quote']]
    quote_index = int(quote.index[0])
    quote_features = quote.drop('Text', axis=1).iloc[0].to_list()

    diary = diaries_labeled_reddit[diaries_labeled_reddit['Text'] == row['Text']]
    diary_index = int(diary.index[0])
    diary_features = diary.drop('Text', axis=1).iloc[0].to_list()

    timestamp = start_time + i * time_step
    if rectools_diary_dense_fnames is None:
        quote_fnames = quote.columns.to_list()[1:]
        diary_fnames = diary.columns.to_list()[1:]
        rectools_diary_dense_fnames = ['id'] + diary_fnames
        rectools_quote_dense_fnames = ['id'] + quote_fnames
    rectools_data.append([diary_index, quote_index, 2. if row['Interaction'] == 1 else 0.5, timestamp])
    if diary_index not in diaries_index:
        diaries_index.add(diary_index)
        rectools_diary_dense_features.append([diary_index] + diary_features)
    if quote_index not in quotes_index:
        quotes_index.add(quote_index)
        rectools_quote_dense_features.append([quote_index] + quote_features)

rectools_df = pd.DataFrame(rectools_data, columns=[*Columns.Interactions])
user_features_df = pd.DataFrame(rectools_diary_dense_features, columns=rectools_diary_dense_fnames)#.set_index('id')
item_features_df = pd.DataFrame(rectools_quote_dense_features, columns=rectools_quote_dense_fnames)#.set_index('id')
# user_features_df = user_features_df.astype('double')
# item_features_df = item_features_df.astype('double')
rectools_ds = Dataset.construct(rectools_df,
                            user_features_df=user_features_df,
                            cat_user_features=rectools_diary_dense_fnames,
                            item_features_df=item_features_df,
                            cat_item_features=rectools_quote_dense_fnames,
                            make_dense_user_features=True,
                            make_dense_item_features=True,
                            )
rectools_ds

Dataset(user_id_map=IdMap(external_ids=array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172

In [131]:
from rectools.model_selection import RandomSplitter


splitter = RandomSplitter(test_fold_frac=0.2, random_state=42, n_splits=1, filter_cold_users=False,
                    filter_cold_items=False, filter_already_seen=False)
for train_ids, test_ids, _ in splitter.split(rectools_ds.interactions):
    print(train_ids, test_ids)

[399 156 422 466 336  97 355 258  44  84 163 138  77  79 442 352 237 502
 166 499 264 185  26 256   2 254  58 395 452 465 510 348 386 497 193 259
  50 478 438 179 299 368 151 241 482 362 373 302 345 101 119 318 509 235
 518  64 305 168 426 389 171 219 244 408   8 213 506 133  38 416 476 271
 222  82  86 374 400 413 331 402 433 104 382 147 380 211 340 464 260 412
 321 231 455 397  24 269 414 484 289 337 379  68 198 306  56 322 204 197
  19 162 440 225 232  30  60 435  73 294   3 358 517 430 446 424 288 245
 343 118 341 310 514 126 471  47 253 200 127 243 146 262 265  27 411  62
 445  36 481 344 363 247 371 172 291 228 273 143 268 441 210 238  89 293
 148 196 409 338 134  12 285  67 396 229 201 516   7 319 515 292  25  92
 233 428 181  75 250 377 165 406 487 149 367  70 255 324 418  54 500 169
 327 215 474 480 158 136 489  95 508 353 186 263 257 403 357 472 356  88
 404 493 248 214  37 417 270 390 276 161 216 207 354 173 112  18 153  11
 448 205 335 384 459  66   9 425  59 450 457   4  9

In [132]:
rectools_df_train = rectools_df.iloc[train_ids]
train_user_ids = rectools_df_train[Columns.User].to_list()
train_item_ids = rectools_df_train[Columns.Item].to_list()
user_features_df_train = user_features_df.loc[user_features_df['id'].isin(train_user_ids)]
item_features_df_train = item_features_df.loc[item_features_df['id'].isin(train_item_ids)]
rectools_ds_train = Dataset.construct(rectools_df_train,
                            user_features_df=user_features_df_train,
                            cat_user_features=rectools_diary_dense_fnames,
                            item_features_df=item_features_df_train,
                            cat_item_features=rectools_quote_dense_fnames,
                            make_dense_user_features=True,
                            make_dense_item_features=True,
                            )
rectools_ds_train

Dataset(user_id_map=IdMap(external_ids=array([399, 156, 422, 466, 336,  97, 355, 258,  44,  84, 163, 138,  77,
        79, 442, 352, 237, 502, 166, 499, 264, 185,  26, 256,   2, 254,
        58, 395, 452, 465, 510, 348, 386, 497, 193, 259,  50, 478, 438,
       179, 299, 368, 151, 241, 482, 362, 373, 302, 345, 101, 119, 318,
       509, 235, 518,  64, 305, 168, 426, 389, 171, 219, 244, 408,   8,
       213, 506, 133,  38, 416, 476, 271, 222,  82,  86, 374, 400, 413,
       331, 402, 433, 104, 382, 147, 380, 211, 340, 464, 260, 412, 321,
       231, 455, 397,  24, 269, 414, 484, 289, 337, 379,  68, 198, 306,
        56, 322, 204, 197,  19, 162, 440, 225, 232,  30,  60, 435,  73,
       294,   3, 358, 517, 430, 446, 424, 288, 245, 343, 118, 341, 310,
       514, 126, 471,  47, 253, 200, 127, 243, 146, 262, 265,  27, 411,
        62, 445,  36, 481, 344, 363, 247, 371, 172, 291, 228, 273, 143,
       268, 441, 210, 238,  89, 293, 148, 196, 409, 338, 134,  12, 285,
        67, 396, 229, 201

In [133]:
rectools_df_test = rectools_df.iloc[test_ids]
test_user_ids = rectools_df_test[Columns.User].to_list()
test_item_ids = rectools_df_test[Columns.Item].to_list()
user_features_df_test = user_features_df.loc[user_features_df['id'].isin(test_user_ids)]
item_features_df_test = item_features_df.loc[item_features_df['id'].isin(test_item_ids)]
rectools_ds_test = Dataset.construct(rectools_df_test,
                            user_features_df=user_features_df_test,
                            cat_user_features=rectools_diary_dense_fnames,
                            item_features_df=item_features_df_test,
                            cat_item_features=rectools_quote_dense_fnames,
                            make_dense_user_features=True,
                            make_dense_item_features=True,
                            )
rectools_ds_test

Dataset(user_id_map=IdMap(external_ids=array([494, 159, 378, 462,  51, 183, 284, 507,  72, 326,  55, 239, 444,
        83, 485,  53, 145,  29, 242,  46, 495, 174, 144, 283,  45, 330,
       249, 115, 176, 203, 124,  98, 351, 295, 234,  91, 164, 401, 461,
       202, 154, 184,   0, 217, 280, 477, 312, 383, 261,  40, 150,  80,
       456, 504, 132, 236, 334,  20, 394, 180, 421, 346,  96, 192, 281,
       252, 246, 381, 513, 191, 369, 120,  33, 230, 226, 178,  61, 447,
       360, 320, 108, 194, 107, 304, 131, 391, 315, 460, 496, 463, 392,
       152, 475, 503, 100, 307, 298, 282, 453, 365, 135, 415, 393, 140])), item_id_map=IdMap(external_ids=array([558, 321, 185, 531,  45, 335,  16, 773, 682,  25, 240, 555, 340,
       827, 599, 767, 469, 816, 409, 303, 486, 316, 161, 499, 471, 238,
       556, 455, 844, 454, 656, 199, 137,  26, 572, 557, 107, 224, 460,
       760, 441, 150, 169, 352, 797,  77, 328, 691, 135, 413, 306, 151,
       740, 403, 817, 243, 178, 241, 766, 735, 277, 565, 214, 5

## Fit-predict

In [178]:
from rectools.models import PopularModel

model = PopularModel()
model.fit(rectools_ds_train)
recos = model.recommend(
    users=rectools_df_train[Columns.User].unique(),
    dataset=rectools_ds_train,
    k=10,
    filter_viewed=True,
)

In [179]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,399,306,10.0,1
1,399,239,7.0,2
2,399,185,6.0,3
3,399,827,6.0,4
4,399,156,6.0,5
...,...,...,...,...
4145,141,354,6.0,6
4146,141,268,6.0,7
4147,141,154,6.0,8
4148,141,15,5.0,9


In [180]:
from rectools.models import RandomModel


model_rnd = RandomModel()
model_rnd.fit(rectools_ds_train)
recos_rnd = model_rnd.recommend(
    users=rectools_df_train[Columns.User].unique(),
    dataset=rectools_ds_train,
    k=10,
    filter_viewed=True,
)

In [181]:
recos_rnd

Unnamed: 0,user_id,item_id,score,rank
0,399,118,10,1
1,399,117,9,2
2,399,449,8,3
3,399,826,7,4
4,399,251,6,5
...,...,...,...,...
4145,141,79,5,6
4146,141,639,4,7
4147,141,730,3,8
4148,141,460,2,9


In [182]:
from rectools.models import PureSVDModel


model_svd = PureSVDModel()
model_svd.fit(rectools_ds_train)
recos_svd = model.recommend(
    users=rectools_df_train[Columns.User].unique(),
    dataset=rectools_ds_train,
    k=10,
    filter_viewed=True,
)

In [183]:
recos_svd

Unnamed: 0,user_id,item_id,score,rank
0,399,306,10.0,1
1,399,239,7.0,2
2,399,185,6.0,3
3,399,827,6.0,4
4,399,156,6.0,5
...,...,...,...,...
4145,141,354,6.0,6
4146,141,268,6.0,7
4147,141,154,6.0,8
4148,141,15,5.0,9


In [184]:
from rectools.models import ImplicitALSWrapperModel
from implicit.als import AlternatingLeastSquares


model_als = ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=64,
            regularization=0.01,
            alpha=1,
            random_state=2023,
            use_gpu=False,
            iterations=15
            )
        )
model_als.fit(rectools_ds_train)
recos_als = model_als.recommend(
    users=rectools_df_train[Columns.User].unique(),
    dataset=rectools_ds_train,
    k=10,
    filter_viewed=True,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [185]:
recos_als

Unnamed: 0,user_id,item_id,score,rank
0,399,321,0.094171,1
1,399,306,0.086694,2
2,399,26,0.043213,3
3,399,107,0.042127,4
4,399,75,0.039770,5
...,...,...,...,...
4145,141,730,0.170825,6
4146,141,352,0.156506,7
4147,141,231,0.143746,8
4148,141,572,0.134641,9


In [141]:
from rectools.models import LightFMWrapperModel
from lightfm import LightFM


model_lfm = LightFMWrapperModel(
        model=LightFM(no_components = 30)
        )
model_lfm.fit(rectools_ds_train)
recos_lfm = model_lfm.recommend(
    users=rectools_df_train[Columns.User].unique(),
    dataset=rectools_ds_train,
    k=10,
    filter_viewed=True,
)

In [142]:
recos_lfm

Unnamed: 0,user_id,item_id,score,rank
0,399,354,1.705025,1
1,399,352,1.694038,2
2,399,732,1.688239,3
3,399,238,1.684359,4
4,399,120,1.674429,5
...,...,...,...,...
4145,141,794,1.640827,6
4146,141,57,1.640790,7
4147,141,440,1.634644,8
4148,141,650,1.634181,9


## Evaluation

In [188]:
from rectools.metrics import NDCG, Accuracy, IntraListDiversity, MAP, MCC, MRR, MeanInvUserFreq, Precision, Recall, Serendipity
from rectools.metrics.distances import PairwiseHammingDistanceCalculator
from sklearn.model_selection import train_test_split


k = 10

ndcg = NDCG(k=k, log_base=3)
acc = Accuracy(k=k)
mmap = MAP(k=k)
ild = IntraListDiversity(k=k, distance_calculator=PairwiseHammingDistanceCalculator(item_features_df_test))
mcc = MCC(k=k)
mrr = MRR(k=k)
miuf = MeanInvUserFreq(k=k)
prc = Precision(k=k)
rec = Recall(k=k)
srd = Serendipity(k=k)

interactions, prev_interactions = train_test_split(rectools_df_test[[Columns.User, Columns.Item]], train_size=0.2, random_state=42)  # Assumption
reco = recos_lfm[[Columns.User, Columns.Item, Columns.Rank]]
# interactions = rectools_df_test[[Columns.User, Columns.Item]]
# prev_interactions = rectools_df_test[[Columns.User, Columns.Item]]  # TODO: define
catalog = item_features_df_test['id'].unique()  # Collection of unique item ids that could be used for recommendations  

In [189]:
print("NDCG: ", ndcg.calc(reco=reco, interactions=interactions))
print("Accuracy: ", acc.calc(reco=reco, interactions=interactions, catalog=catalog))
print("IntraListDiversity: ", ild.calc(reco=reco))
print('MAP: ', mmap.calc(reco=reco, interactions=interactions))
print('MCC: ', mcc.calc(reco=reco, interactions=interactions, catalog=catalog))
print('MRR: ', mrr.calc(reco=reco, interactions=interactions))
print('MeanInvUserFreq: ', miuf.calc(reco=reco, prev_interactions=prev_interactions))
print('Precision: ', prc.calc(reco=reco, interactions=interactions))
print('Recall: ', rec.calc(reco=reco, interactions=interactions))
print('Serendipity: ', srd.calc(reco=reco, interactions=interactions, prev_interactions=prev_interactions, catalog=catalog))

NDCG:  0.0
Accuracy:  0.8720930232558141
IntraListDiversity:  29.0
MAP:  0.0
MCC:  -0.039344473768231684
MRR:  0.0
MeanInvUserFreq:  6.136453657647274
Precision:  0.0
Recall:  0.0
Serendipity:  0.0


