In [1]:
import pandas as pd
import csv

# Fix the print ordering: rebind print so that every call flushes its output immediately
import functools
from operator import rshift
print = functools.partial(print, flush=True)

# Import the Content Analyzer, Recommender System and Evaluation modules as libraries
from clayrs import content_analyzer as ca
from clayrs import recsys as rs
from clayrs import evaluation as eva

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Dataset directory
path = 'C:/Users/glamo/Desktop/Repository/RecSys-Algorithms-Evaluation/Dataset/Movielens 100k/'

# Directory where the Content Analyzer stores the serialized items
serialized_items_path = 'C:/Users/glamo/Desktop/Repository/RecSys-Algorithms-Evaluation/Test/movies_codified/'

# Directory where the computed rankings are saved
ranks_path = 'C:/Users/glamo/Desktop/Repository/RecSys-Algorithms-Evaluation/Test/Ranks/'

# Fail fast if the dataset files are missing or unreadable, without leaking file
# handles: each file is opened only as a check and closed right away. The actual
# reading is done later through ca.JSONFile / ca.CSVFile on the same paths.
# The names `items` and `ratings` stay bound (to closed file objects).

# File containing the movies
with open(path + 'items_info.json') as items:
    pass

# File containing the ratings
with open(path + 'ratings.csv') as ratings:
    pass

In [4]:
# Content Analyzer configuration: items are read from the JSON file in the dataset
# directory, identified by their 'movielens_id' field, and the output is stored
# in serialized_items_path
items_source = ca.JSONFile(path + 'items_info.json')
ca_config = ca.ItemAnalyzerConfig(source=items_source,
                                  id='movielens_id',
                                  output_directory=serialized_items_path)

In [5]:
# Register a representation for the 'plot' field: a document embedding built with
# Gensim Latent Semantic Analysis, computed after NLTK preprocessing
# (stopwords removal + lemmatization)
plot_embedding = ca.DocumentEmbeddingTechnique(ca.GensimLatentSemanticAnalysis())
plot_preprocessing = ca.NLTK(stopwords_removal=True, lemmatization=True)
plot_field_config = ca.FieldConfig(
    plot_embedding,
    preprocessing=plot_preprocessing,
    id='GensimLatentSemanticAnalysis',
)
ca_config.add_single_config('plot', plot_field_config)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-d

In [6]:
# Run the Content Analyzer with the configuration above to serialize the items
content_analyzer = ca.ContentAnalyzer(config=ca_config)
content_analyzer.fit()

[39mINFO[0m - ***********   Processing field: plot   *********** (content_analyzer_main.py:188)
Preprocessing plot for all contents:  100%|███████████████████████████████████████████████████| 1682/1682 [00:22<00:00]
[39mINFO[0m - Fitting model with extracted corpus... (embedding_learner.py:98)
Processing and producing contents with GensimLatentSemanticAnalysis:  100%|███████████████████| 1682/1682 [00:21<00:00]
Serializing contents:  100%|██████████████████████████████████████████████████████████████████| 1682/1682 [00:06<00:00]


In [7]:
def predict(algorithm, training_set, test_set, serialized_items_path, n_recs=10):
    """Fit a content-based recommender and rank the items for the test set.

    Builds an ``rs.ContentBasedRS`` from ``algorithm``, ``training_set`` and
    ``serialized_items_path``, then calls its ``fit_rank`` on ``test_set``.

    Args:
        algorithm: recommendation algorithm handed to ``rs.ContentBasedRS``.
        training_set: training ratings handed to ``rs.ContentBasedRS``.
        test_set: test ratings handed to ``fit_rank``.
        serialized_items_path: directory of the serialized items.
        n_recs: value forwarded to ``fit_rank`` as ``n_recs``
            (presumably the number of recommendations per user). Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        Whatever ``fit_rank`` returns (the computed ranking).
    """
    cbrs = rs.ContentBasedRS(algorithm, training_set, serialized_items_path)
    return cbrs.fit_rank(test_set, n_recs=n_recs)

In [9]:
# Load the ratings from the CSV file in the dataset directory
ratings_source = ca.CSVFile(path + 'ratings.csv')
ratings = ca.Ratings(ratings_source)

# Split the dataset into lists of training and test sets (80% training size)
hold_out = rs.HoldOutPartitioning(train_set_size=0.8)
train_list, test_list = hold_out.split_all(ratings)

# Hold Out yields a single training set and a single test set: take the first of each
training_set, test_set = train_list[0], test_list[0]

Importing ratings:  100%|█████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00]
Performing HoldOutPartitioningTechnique:  100%|█████████████████████████████████████████████████| 943/943 [00:00<00:00]


In [10]:
# Recommender: Centroid Vector algorithm on the 'plot' LSA representation,
# using cosine similarity
cosine_similarity = rs.CosineSimilarity()
centroid_vector = rs.CentroidVector({'plot': 'GensimLatentSemanticAnalysis'},
                                    similarity=cosine_similarity)

# Compute the ranking and save it in the ranks directory
ranking = predict(centroid_vector, training_set, test_set, serialized_items_path)
ranking.to_csv(ranks_path, 'GensimLatentSemanticAnalysis + Centroid Vector')

[39mINFO[0m - Don't worry if it looks stuck at first (recsys.py:349)
[39mINFO[0m - First iterations will stabilize the estimated remaining time (recsys.py:350)
Computing fit_rank for user 870:  100%|█████████████████████████████████████████████████████████| 943/943 [00:05<00:00]


In [11]:
# Recommender: Logistic Regression classifier on the 'plot' LSA representation

logistic_regression = rs.ClassifierRecommender(
    {'plot': 'GensimLatentSemanticAnalysis'},
    rs.SkLogisticRegression()
)

rankings = predict(logistic_regression, training_set, test_set, serialized_items_path)
# Save the ranking just computed by the logistic regression recommender.
# (Previously `ranking` -- the Centroid Vector result -- was written here,
# so the Logistic Regression file held the wrong recommendations.)
rankings.to_csv(ranks_path, 'GensimLatentSemanticAnalysis + Logistic Regression')

[39mINFO[0m - Don't worry if it looks stuck at first (recsys.py:349)
[39mINFO[0m - First iterations will stabilize the estimated remaining time (recsys.py:350)
Computing fit_rank for user 870:  100%|█████████████████████████████████████████████████████████| 943/943 [00:10<00:00]


In [22]:
def evaluate(test_set, rankings, ratings, cutoff, filename):
    """Evaluate the given rankings and save system-level and user-level results.

    Builds an ``eva.EvalModel`` over ``rankings`` and ``test_set`` with the
    metrics listed below (all parameterized by ``cutoff``), fits it, and writes
    the two results it returns to two distinct CSV files in the evaluation
    directory: ``'SYS - ' + filename`` and ``'USERS - ' + filename``.

    Args:
        rankings: passed to ``eva.EvalModel`` as first argument
            (the caller in this notebook passes a list of loaded ranks).
        test_set: passed to ``eva.EvalModel`` as second argument
            (the caller in this notebook passes the list of test sets).
        ratings: object exposing ``item_id_column``; its distinct values form
            the catalog handed to ``eva.CatalogCoverage``.
        cutoff: value used as ``k`` / ``top_n`` for every metric.
        filename: suffix of the two output file names.

    Returns:
        None. The results are only written to disk.
    """
    user_groups = {'Blockbuster': 0.2, 'Niche': 0.2, 'Diverse': 0.6}
    catalog = set(ratings.item_id_column)
    eval_path = 'C:/Users/glamo/Desktop/Repository/RecSys-Algorithms-Evaluation/Test/Eval/'

    em = eva.EvalModel(
        rankings,
        test_set,
        metric_list=[
            eva.PrecisionAtK(k=cutoff),
            eva.RecallAtK(k=cutoff),
            eva.FMeasureAtK(k=cutoff),
            eva.NDCGAtK(k=cutoff),
            eva.MRRAtK(k=cutoff),
            eva.GiniIndex(top_n=cutoff),
            eva.CatalogCoverage(catalog, top_n=cutoff),
            eva.DeltaGap(user_groups, top_n=cutoff)
        ]
    )

    sys_result, users_result = em.fit()
    sys_result.to_csv(eval_path + 'SYS - ' + filename)
    # Use a distinct prefix: writing the user-level results to the 'SYS - ' path
    # would overwrite the system-level results saved on the line above.
    users_result.to_csv(eval_path + 'USERS - ' + filename)

In [24]:
import os
import pandas as pd

# Evaluate every ranking file found in the ranks directory, one at a time:
# each file is loaded as a Rank, wrapped in a one-element list and evaluated
# against the test sets with a cutoff of 10
for filename in os.listdir(ranks_path):
    rank_source = ca.CSVFile(ranks_path + filename)
    rank_list = [ca.Rank(rank_source)]
    print(len(test_list))
    evaluate(test_list, rank_list, ratings, 10, filename)

Importing ratings:  100%|█████████████████████████████████████████████████████████████████████| 8134/8134 [00:00<00:00]

1



[39mINFO[0m - Performing evaluation on metrics chosen (eval_model.py:89)
Performing DeltaGap - Top 10:  100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00]
Importing ratings:  100%|█████████████████████████████████████████████████████████████████████| 8134/8134 [00:00<00:00]

1



[39mINFO[0m - Performing evaluation on metrics chosen (eval_model.py:89)
Performing DeltaGap - Top 10:  100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00]
Importing ratings:  100%|█████████████████████████████████████████████████████████████████████| 8134/8134 [00:00<00:00]

1



[39mINFO[0m - Performing evaluation on metrics chosen (eval_model.py:89)
Performing DeltaGap - Top 10:  100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00]
Importing ratings:  100%|█████████████████████████████████████████████████████████████████████| 8134/8134 [00:00<00:00]

1



[39mINFO[0m - Performing evaluation on metrics chosen (eval_model.py:89)
Performing DeltaGap - Top 10:  100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00]


In [32]:
# Statistical tests (T-Test and Wilcoxon) over the files found in the ranks directory
rank_list = []
eval_path = 'C:/Users/glamo/Desktop/Repository/RecSys-Algorithms-Evaluation/Test/Eval/'
for filename in os.listdir(ranks_path):
    rank_list.append(pd.read_csv(ranks_path + filename))

# Give each test its own copy of the list, so both tests see every system even if
# perform() consumes the list it receives. The KeyError on 'Systems evaluated'
# seen when the same list object was passed twice is consistent with the second
# test receiving an emptied list -- TODO confirm against the installed ClayRS version.
# NOTE(review): these frames are raw rankings; the statistical tests presumably
# expect per-user evaluation results instead -- verify which input is intended.
result_ttest = eva.Ttest().perform(list(rank_list))
result_ttest.to_csv(eval_path + "T-Test.csv")


result_wilcoxon = eva.Wilcoxon().perform(list(rank_list))

print(result_wilcoxon)
result_wilcoxon.to_csv(eval_path + "Wilcoxon.csv")

KeyError: "None of ['Systems evaluated'] are in the columns"