In [1]:
import pandas as pd
import csv

# correzione dell'ordine di stampa
import functools
from operator import rshift
print = functools.partial(print, flush=True)

# import dei moduli per Content Analyzer, Recommender System e Evaluation come librerie
from clayrs import content_analyzer as ca
from clayrs import recsys as rs
from clayrs import evaluation as eva

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# path del dataset
path = 'C:/Users/glamo/Desktop/Repository/RecSys-Algorithms-Evaluation/Dataset/Movielens 100k/'

# apertura del file contenente i film
items = open(path + 'items_info.json')

# apertura del file con i ratings
ratings = open(path + 'ratings.csv')

In [3]:
# configurazione del content analyzer
ca_config = ca.ItemAnalyzerConfig(
    source = ca.JSONFile(path + 'items_info.json'),
    id = 'movielens_id',
    output_directory = path + 'movies_codified/'
)

In [4]:
# inserimento delle rappresentazioni multiple
ca_config.add_multiple_config(
    'plot',
    [
        ca.FieldConfig(ca.SkLearnTfIdf(),
                       preprocessing=ca.NLTK(stopwords_removal=True, lemmatization=True),
                       id='tfidf'),
    
        ca.FieldConfig(ca.Word2DocEmbedding(ca.Gensim('glove-twitter-50'), combining_technique=ca.Centroid()),
                       ca.NLTK(stopwords_removal=True, lemmatization=True),
                       id='gensim')
    ]
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\glamo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-d

In [5]:
# serializzazione degli item
ca.ContentAnalyzer(config = ca_config).fit()

[39mINFO[0m - ***********   Processing field: plot   *********** (content_analyzer_main.py:188)
[39mINFO[0m - Computing tf-idf with SkLearnTfIdf (tf_idf.py:92)
[39mINFO[0m - Downloading/Loading Gensim glove-twitter-50 (gensim.py:31)
Processing and producing contents with Gensim glove-twitter-50:  100%|████████████████████████| 1682/1682 [00:20<00:00]
Serializing contents:  100%|██████████████████████████████████████████████████████████████████| 1682/1682 [00:08<00:00]


In [6]:
# Recommender: Centroid Vector Algorithm
ratings = ca.Ratings(ca.CSVFile(path + 'ratings.csv'))

centroid_vector = rs.CentroidVector({'plot': 'tfidf'},
                                    similarity = rs.CosineSimilarity())                                    

Importing ratings:  100%|█████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00]


In [7]:
# Split Test Set e Training Set
train_list, test_list = rs.KFoldPartitioning(n_splits=2).split_all(ratings)
                        #s.KFoldPartitioning(n_splits=2).split_all(ratings)

first_training_set = train_list[0]

cbrs = rs.ContentBasedRS(centroid_vector, 
                         first_training_set, 
                         (path + '/movies_codified'))

first_test_set = test_list[0]

rank = cbrs.fit_rank(first_test_set, 
                     user_id_list = ['8', '2', '1'],
                     n_recs = 3)

print(rank)

Performing KFoldPartitioningTechnique:  100%|███████████████████████████████████████████████████| 943/943 [00:00<00:00]
[39mINFO[0m - Don't worry if it looks stuck at first (recsys.py:349)
[39mINFO[0m - First iterations will stabilize the estimated remaining time (recsys.py:350)
Computing fit_rank for user 1:  100%|███████████████████████████████████████████████████████████████| 3/3 [00:00<00:00]


  user_id item_id     score
0       2     297  0.240214
1       2     316  0.070412
2       2     291  0.056568
3       8      50  0.126758
4       8     511  0.100450
5       8     385  0.093093
6       1      45  0.129885
7       1      65  0.115058
8       1     103  0.113256


In [9]:
result_list = []

for train_set, test_set in zip(train_list, test_list):
    cbrs = rs.ContentBasedRS(centroid_vector, train_set, (path + '/movies_codified'))
    rank_to_append = cbrs.fit_rank(test_set)
    result_list.append(rank_to_append)

[39mINFO[0m - Don't worry if it looks stuck at first (recsys.py:349)
[39mINFO[0m - First iterations will stabilize the estimated remaining time (recsys.py:350)
Computing fit_rank for user 774:  100%|█████████████████████████████████████████████████████████| 943/943 [00:17<00:00]
[39mINFO[0m - Don't worry if it looks stuck at first (recsys.py:349)
[39mINFO[0m - First iterations will stabilize the estimated remaining time (recsys.py:350)
Computing fit_rank for user 774:  100%|█████████████████████████████████████████████████████████| 943/943 [00:18<00:00]


In [10]:
print(result_list)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

