## Imports and downloading data

In [None]:
!pip install caserecommender

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting caserecommender
  Downloading CaseRecommender-1.1.1-py2.py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 7.5 MB/s 
Installing collected packages: caserecommender
Successfully installed caserecommender-1.1.1


In [None]:
import nltk
import json
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from google.colab import files, drive
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS

# Recommender algorithms
from caserec.recommenders.rating_prediction.svd import SVD
from caserec.recommenders.rating_prediction.userknn import UserKNN
from caserec.recommenders.rating_prediction.itemknn import ItemKNN
from caserec.recommenders.rating_prediction.random_rec import RandomRec
from caserec.recommenders.rating_prediction.most_popular import MostPopular
from caserec.recommenders.rating_prediction.item_attribute_knn import ItemAttributeKNN
from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

rm: cannot remove '/root/.kaggle': No such file or directory


In [None]:
!kaggle competitions download -c scc5966
!mkdir competition
!mkdir train_outputs
!unzip scc5966.zip -d competition/

Downloading scc5966.zip to /content
 29% 5.00M/17.5M [00:00<00:00, 37.0MB/s]
100% 17.5M/17.5M [00:00<00:00, 76.7MB/s]
Archive:  scc5966.zip
  inflating: competition/movie_reviews.csv  
  inflating: competition/movies_data.csv  
  inflating: competition/test_data.csv  
  inflating: competition/train_data .csv  
  inflating: competition/users_data.csv  


## Reading and preparing data

In [None]:
# Load the competition tables. The training file name really does contain a
# space before the extension (see the unzip listing).
df_users = pd.read_csv('competition/users_data.csv')
df_train = pd.read_csv('competition/train_data .csv').drop(columns=['timestamp'])
df_test = pd.read_csv('competition/test_data.csv').drop(columns=['timestamp'])

# Tab-separated, header-less dumps of the full train and test sets.
df_train.to_csv('full_train.dat', sep='\t', header=False, index=False)
df_test[['user_id', 'movie_id', 'id']].to_csv('full_test.dat', sep='\t', header=False, index=False)

# Index the test rows by (user, movie) so predictions can be joined onto them.
df_test = df_test.set_index(['user_id', 'movie_id'])

In [None]:
# Work on a reproducible 10% sample of the training data and hold out 30% of
# that sample for evaluation.
# NOTE: `eval` shadows the Python builtin; the name is kept because the later
# cells of this notebook pass it to the *_run_tests functions.
train, eval = train_test_split(
    df_train.sample(frac=.1, random_state=3),
    test_size=.3,
    random_state=7,
)

# Tab-separated, header-less copies of both splits.
train.to_csv('train.dat', sep='\t', header=False, index=False)
eval.to_csv('test.dat', sep='\t', header=False, index=False)

# reset_index() keeps the original row label as an 'index' column; the frame is
# then keyed by (user, movie) like the test set.
eval = eval.reset_index().set_index(['user_id', 'movie_id'])

# Prediction Algorithms

## Cold Starting

In [None]:
def cold_start(results, train_set, target_key, cold_key, method='average'):
    """Fill missing ratings in ``results`` with a per-``cold_key`` fallback.

    The fallback is computed from ``train_set`` grouped by ``cold_key``:
    the mean rating (``'average'``), the first modal rating (``'mode'``), or,
    for any other ``method``, a random value in [0.0, 5.0] with one decimal.

    Args:
        results: frame with a ``rating`` column (NaN where no prediction
            exists) and a ``cold_key`` column.
        train_set: training ratings with ``cold_key`` and ``rating`` columns.
        target_key: unused; kept for call compatibility.
        cold_key: column to group the training data by.
        method: fallback strategy, see above.

    Returns:
        A new frame with the same columns as ``results``; rows whose
        ``cold_key`` never occurs in ``train_set`` keep their NaN rating.
    """
    grouped = train_set.groupby(cold_key)

    strategies = {
        'average': lambda groups: groups.mean(),
        'mode': lambda groups: groups.agg(lambda values: pd.Series.mode(values)[0]),
    }

    def random_fill(groups):
        # Any unrecognised method falls back to random ratings.
        return groups.agg(lambda values: np.random.randint(0, 51) / 10)

    fallback = strategies.get(method, random_fill)(grouped)
    fallback = fallback[['rating']].rename(columns={'rating': 'colded'})

    merged = results.join(fallback, how='left', on=cold_key)

    # Only rows without a prediction take the fallback value.
    missing = merged.rating.isna()
    merged.rating = np.where(missing, merged.colded, merged.rating)

    return merged.drop(columns=['colded'])

## Baseline

In [None]:
def baseline_predict(train, test, global_bias, user_const, item_const):
    """Predict ratings as global bias + damped user bias + damped item bias.

    Each bias is ``sum(rating) / (const + count(rating))`` over the training
    ratings of that user / movie, so a larger constant shrinks the bias of
    rarely seen users or movies towards zero.

    Args:
        train: training ratings with ``user_id``, ``movie_id`` and ``rating``
            columns; the caller passes ratings already centred on
            ``global_bias``.
        test: frame indexed by (``user_id``, ``movie_id``) with exactly one
            data column, which is relabelled ``id`` in the result.
        global_bias: value added to every prediction.
        user_const: damping constant for the user bias.
        item_const: damping constant for the movie bias.

    Returns:
        ``test`` with ``id``, ``user_bias``, ``movie_bias`` and ``rating``
        columns; users or movies absent from ``train`` get a bias of 0.
    """
    print(f"\n\nStarting training on baseline with {user_const} user bias and {item_const} item bias")

    # Build the biases as plain named Series. Aggregating with
    # agg({'rating': [...]}) yields two-level columns, and joining those onto
    # the single-level test frame is deprecated / rejected by pandas.
    user_stats = train.groupby('user_id')['rating'].agg(['sum', 'count'])
    users_bias = (user_stats['sum'] / (user_const + user_stats['count'])).rename('user_bias')

    movie_stats = train.groupby('movie_id')['rating'].agg(['sum', 'count'])
    movies_bias = (movie_stats['sum'] / (item_const + movie_stats['count'])).rename('movie_bias')

    # Each join matches on the index level that shares its name with the bias index.
    results = test.join(users_bias, how='left') \
                  .join(movies_bias, how='left')

    # Unseen users / movies contribute no bias.
    results = results.fillna(0)
    results.columns = ['id', 'user_bias', 'movie_bias']
    results['rating'] = global_bias + results.user_bias + results.movie_bias

    return results

def baseline_run_tests(train, test, consts):
    """Grid-search the baseline damping constants and log the RMSE of each pair.

    Every (user constant, item constant) pair from ``consts`` is evaluated.
    One ``baseline,<user_const>,<item_const>,<rmse>`` line per pair is
    appended to ``test_results_Baseline.csv``, which is downloaded at the end.

    Args:
        train: training ratings with a ``rating`` column.
        test: evaluation frame indexed by (``user_id``, ``movie_id``) that
            holds the true ``rating``.
        consts: iterable of damping constants, used for both biases.
    """
    test_results_file = 'test_results_Baseline.csv'

    base_eval = test.copy()
    base_train = train.copy()

    # Centre the training ratings so the biases are deviations from the mean.
    global_bias = train.rating.mean()
    base_train.rating = base_train.rating - global_bias

    real = base_eval.reset_index().rating
    base_eval = base_eval.drop(columns=['rating'])

    for u_const in consts:
        for i_const in consts:
            preds = baseline_predict(base_train, base_eval, global_bias, u_const, i_const)
            pred = preds.reset_index().rating

            # Take the root explicitly: the `squared=False` flag of
            # mean_squared_error is deprecated in recent scikit-learn.
            rmse = float(np.sqrt(mean_squared_error(real, pred)))
            print(f'RMSE for baseline method constants {u_const} and {i_const}: {rmse}')

            with open(test_results_file, 'a') as f:
                data = f"baseline,{u_const},{i_const},{rmse}\n"
                f.write(data)

    files.download(test_results_file)

In [None]:
# Damping constants tried for both the user and the item bias (every pair is evaluated).
constants = [1, 2, 5, 10, 15, 20, 25, 50, 100, 200]
baseline_run_tests(train=train, test=eval, consts=constants)



Starting training on baseline with 1 user bias and 1 item bias
RMSE for baseline method constants 1 and 1: 1.002819020538464


Starting training on baseline with 1 user bias and 2 item bias
RMSE for baseline method constants 1 and 2: 0.9983422290990647


Starting training on baseline with 1 user bias and 5 item bias
RMSE for baseline method constants 1 and 5: 0.9959905129978041


Starting training on baseline with 1 user bias and 10 item bias
RMSE for baseline method constants 1 and 10: 0.999105107319214


Starting training on baseline with 1 user bias and 15 item bias
RMSE for baseline method constants 1 and 15: 1.0032184683328935


Starting training on baseline with 1 user bias and 20 item bias
RMSE for baseline method constants 1 and 20: 1.0070855186247278


Starting training on baseline with 1 user bias and 25 item bias
RMSE for baseline method constants 1 and 25: 1.0105524453041606


Starting training on baseline with 1 user bias and 50 item bias
RMSE for baseline method constan

  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,




Starting training on baseline with 1 user bias and 100 item bias
RMSE for baseline method constants 1 and 100: 1.0367258641887767


Starting training on baseline with 1 user bias and 200 item bias
RMSE for baseline method constants 1 and 200: 1.0489321550622592


Starting training on baseline with 2 user bias and 1 item bias
RMSE for baseline method constants 2 and 1: 0.9925800060938095


Starting training on baseline with 2 user bias and 2 item bias
RMSE for baseline method constants 2 and 2: 0.9882057002110461


Starting training on baseline with 2 user bias and 5 item bias
RMSE for baseline method constants 2 and 5: 0.9861279344067141


Starting training on baseline with 2 user bias and 10 item bias
RMSE for baseline method constants 2 and 10: 0.989580961457939


Starting training on baseline with 2 user bias and 15 item bias


  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 2 and 15: 0.9939430917603305


Starting training on baseline with 2 user bias and 20 item bias
RMSE for baseline method constants 2 and 20: 0.9980050054331797


Starting training on baseline with 2 user bias and 25 item bias
RMSE for baseline method constants 2 and 25: 1.001630902165935


Starting training on baseline with 2 user bias and 50 item bias
RMSE for baseline method constants 2 and 50: 1.0147476345483772


Starting training on baseline with 2 user bias and 100 item bias
RMSE for baseline method constants 2 and 100: 1.0287990719576743


Starting training on baseline with 2 user bias and 200 item bias
RMSE for baseline method constants 2 and 200: 1.0413952241939555


Starting training on baseline with 5 user bias and 1 item bias
RMSE for baseline method constants 5 and 1: 0.9812322872934646


Starting training on baseline with 5 user bias and 2 item bias
RMSE for baseline method constants 5 and 2: 0.9771369522561877


Starting training on base

  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 5 and 5: 0.9756793145491701


Starting training on baseline with 5 user bias and 10 item bias
RMSE for baseline method constants 5 and 10: 0.9798089370564625


Starting training on baseline with 5 user bias and 15 item bias
RMSE for baseline method constants 5 and 15: 0.9846359539608728


Starting training on baseline with 5 user bias and 20 item bias
RMSE for baseline method constants 5 and 20: 0.9890486240620275


Starting training on baseline with 5 user bias and 25 item bias
RMSE for baseline method constants 5 and 25: 0.9929535281776686


Starting training on baseline with 5 user bias and 50 item bias
RMSE for baseline method constants 5 and 50: 1.0069338277551327


Starting training on baseline with 5 user bias and 100 item bias
RMSE for baseline method constants 5 and 100: 1.0217598017835328


Starting training on baseline with 5 user bias and 200 item bias
RMSE for baseline method constants 5 and 200: 1.0349589080189197


Starting training on 

  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 10 and 1: 0.9771784565358801


Starting training on baseline with 10 user bias and 2 item bias
RMSE for baseline method constants 10 and 2: 0.9734251883659387


Starting training on baseline with 10 user bias and 5 item bias
RMSE for baseline method constants 10 and 5: 0.9726442730300112


Starting training on baseline with 10 user bias and 10 item bias
RMSE for baseline method constants 10 and 10: 0.9774389250108987


Starting training on baseline with 10 user bias and 15 item bias
RMSE for baseline method constants 10 and 15: 0.982692347137965


Starting training on baseline with 10 user bias and 20 item bias
RMSE for baseline method constants 10 and 20: 0.9874132379787802


Starting training on baseline with 10 user bias and 25 item bias
RMSE for baseline method constants 10 and 25: 0.9915558749381392


Starting training on baseline with 10 user bias and 50 item bias
RMSE for baseline method constants 10 and 50: 1.00623481623983


Starting training

  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 10 and 200: 1.0352743481011466


Starting training on baseline with 15 user bias and 1 item bias
RMSE for baseline method constants 15 and 1: 0.9773099750468829


Starting training on baseline with 15 user bias and 2 item bias
RMSE for baseline method constants 15 and 2: 0.9738034064082169


Starting training on baseline with 15 user bias and 5 item bias
RMSE for baseline method constants 15 and 5: 0.9734818873294816


Starting training on baseline with 15 user bias and 10 item bias
RMSE for baseline method constants 15 and 10: 0.9786994641756901


Starting training on baseline with 15 user bias and 15 item bias
RMSE for baseline method constants 15 and 15: 0.9842108713058442


Starting training on baseline with 15 user bias and 20 item bias
RMSE for baseline method constants 15 and 20: 0.9891120451428503


Starting training on baseline with 15 user bias and 25 item bias
RMSE for baseline method constants 15 and 25: 0.9933902025432068


Starting train

  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 15 and 50: 1.0084494596009654


Starting training on baseline with 15 user bias and 100 item bias
RMSE for baseline method constants 15 and 100: 1.0241567395195248


Starting training on baseline with 15 user bias and 200 item bias
RMSE for baseline method constants 15 and 200: 1.0379917097580609


Starting training on baseline with 20 user bias and 1 item bias
RMSE for baseline method constants 20 and 1: 0.9785534559271822


Starting training on baseline with 20 user bias and 2 item bias
RMSE for baseline method constants 20 and 2: 0.9752364146175062


Starting training on baseline with 20 user bias and 5 item bias
RMSE for baseline method constants 20 and 5: 0.975256515908483


Starting training on baseline with 20 user bias and 10 item bias
RMSE for baseline method constants 20 and 10: 0.9807765943245486


Starting training on baseline with 20 user bias and 15 item bias
RMSE for baseline method constants 20 and 15: 0.9864666517561523


Starting tra

  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 20 and 20: 0.991489794560731


Starting training on baseline with 20 user bias and 25 item bias
RMSE for baseline method constants 20 and 25: 0.9958579562982415


Starting training on baseline with 20 user bias and 50 item bias
RMSE for baseline method constants 20 and 50: 1.0111611321380474


Starting training on baseline with 20 user bias and 100 item bias
RMSE for baseline method constants 20 and 100: 1.0270483758611397


Starting training on baseline with 20 user bias and 200 item bias
RMSE for baseline method constants 20 and 200: 1.0410012030545894


Starting training on baseline with 25 user bias and 1 item bias
RMSE for baseline method constants 25 and 1: 0.9801230052794078


Starting training on baseline with 25 user bias and 2 item bias
RMSE for baseline method constants 25 and 2: 0.9769577312752866


Starting training on baseline with 25 user bias and 5 item bias
RMSE for baseline method constants 25 and 5: 0.977245567186109


Starting trai

  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 25 and 15: 0.98881943430633


Starting training on baseline with 25 user bias and 20 item bias
RMSE for baseline method constants 25 and 20: 0.9939317845686862


Starting training on baseline with 25 user bias and 25 item bias
RMSE for baseline method constants 25 and 25: 0.9983648146511862


Starting training on baseline with 25 user bias and 50 item bias
RMSE for baseline method constants 25 and 50: 1.0138387846177053


Starting training on baseline with 25 user bias and 100 item bias
RMSE for baseline method constants 25 and 100: 1.0298458027610682


Starting training on baseline with 25 user bias and 200 item bias
RMSE for baseline method constants 25 and 200: 1.0438728300174078


Starting training on baseline with 50 user bias and 1 item bias
RMSE for baseline method constants 50 and 1: 0.9875588345946742


Starting training on baseline with 50 user bias and 2 item bias
RMSE for baseline method constants 50 and 2: 0.984861427954004


Starting tra

  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 50 and 10: 0.992356948694215


Starting training on baseline with 50 user bias and 15 item bias
RMSE for baseline method constants 50 and 15: 0.9985443895773795


Starting training on baseline with 50 user bias and 20 item bias
RMSE for baseline method constants 50 and 20: 1.0038933324065276


Starting training on baseline with 50 user bias and 25 item bias
RMSE for baseline method constants 50 and 25: 1.0084937501013398


Starting training on baseline with 50 user bias and 50 item bias
RMSE for baseline method constants 50 and 50: 1.0243840642634292


Starting training on baseline with 50 user bias and 100 item bias
RMSE for baseline method constants 50 and 100: 1.0406520765767207


Starting training on baseline with 50 user bias and 200 item bias
RMSE for baseline method constants 50 and 200: 1.0548187986117012


Starting training on baseline with 100 user bias and 1 item bias
RMSE for baseline method constants 100 and 1: 0.9966650321027974


Starti

  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 100 and 2: 0.9943735387732567


Starting training on baseline with 100 user bias and 5 item bias
RMSE for baseline method constants 100 and 5: 0.9961252431699398


Starting training on baseline with 100 user bias and 10 item bias
RMSE for baseline method constants 100 and 10: 1.0030518888536886


Starting training on baseline with 100 user bias and 15 item bias
RMSE for baseline method constants 100 and 15: 1.009509469119859


Starting training on baseline with 100 user bias and 20 item bias
RMSE for baseline method constants 100 and 20: 1.015025348013379


Starting training on baseline with 100 user bias and 25 item bias
RMSE for baseline method constants 100 and 25: 1.0197386593336626


Starting training on baseline with 100 user bias and 50 item bias
RMSE for baseline method constants 100 and 50: 1.035882014199841


Starting training on baseline with 100 user bias and 100 item bias
RMSE for baseline method constants 100 and 100: 1.0522716669362706


  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,
  sort=sort,


RMSE for baseline method constants 100 and 200: 1.0664742937734388


Starting training on baseline with 200 user bias and 1 item bias
RMSE for baseline method constants 200 and 1: 1.0048034674488977


Starting training on baseline with 200 user bias and 2 item bias
RMSE for baseline method constants 200 and 2: 1.0028116462108145


Starting training on baseline with 200 user bias and 5 item bias
RMSE for baseline method constants 200 and 5: 1.0050401219454987


Starting training on baseline with 200 user bias and 10 item bias
RMSE for baseline method constants 200 and 10: 1.0123212696832267


Starting training on baseline with 200 user bias and 15 item bias
RMSE for baseline method constants 200 and 15: 1.0189550238084213


Starting training on baseline with 200 user bias and 20 item bias
RMSE for baseline method constants 200 and 20: 1.0245749539142943


Starting training on baseline with 200 user bias and 25 item bias
RMSE for baseline method constants 200 and 25: 1.029355550217501




  sort=sort,
  sort=sort,
  sort=sort,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Filtragem Colaborativa

In [None]:
def fc_predict(func, train, test, k_neighbors, similarity, train_file):
    """Run one collaborative-filtering recommender and join its predictions onto ``test``.

    Args:
        func: recommender class; it is instantiated with ``train_file`` and the
            keyword arguments below, then ``.compute()`` is called.
        train: unused; kept for call compatibility.
        test: frame indexed by (``user_id``, ``movie_id``).
        k_neighbors: neighbourhood size forwarded to the recommender
            (``None`` is labelled "default" in the output file name).
        similarity: similarity metric name forwarded to the recommender.
        train_file: path of the training file handed to the recommender.

    Returns:
        ``test`` with its index reset and a ``rating`` column that is NaN for
        pairs missing from the recommender's output file.
    """
    algorithm = func.__name__

    print(f"\n\nStarting training on {algorithm} with {k_neighbors} neighbors and {similarity} similarity")

    k_label = "default" if k_neighbors is None else k_neighbors
    predictions_path = f'train_outputs/{algorithm}_{similarity}_{k_label}k.dat'

    # No test file is passed to the recommender.
    # NOTE(review): confirm which (user, item) pairs the library scores in that case.
    recommender = func(
        train_file,
        output_file=predictions_path,
        k_neighbors=k_neighbors,
        similarity_metric=similarity,
        as_similar_first=True,
    )
    recommender.compute()

    predictions = pd.read_csv(
        predictions_path, sep='\t', names=['user_id', 'movie_id', 'rating']
    ).set_index(['user_id', 'movie_id'])

    return test.join(predictions, how='left').reset_index()

In [None]:
def fc_run_tests(train, test, funcs, similarities, k_neighbors, cold_starts):
    """Evaluate collaborative-filtering recommenders over a parameter grid.

    For every recommender class, similarity metric and neighbourhood size the
    predictions are computed once; each cold-start strategy is then applied to
    those raw predictions and the RMSE against the true ratings is printed and
    appended to ``test_results_FC.csv`` as
    ``<recommender>,<k>,<similarity>,<cold_start>,<rmse>``. The file is
    downloaded at the end.

    Args:
        train: training ratings, used for the cold-start fallbacks.
        test: evaluation frame indexed by (``user_id``, ``movie_id``) that
            holds the true ``rating``.
        funcs: recommender classes to evaluate.
        similarities: similarity metric names.
        k_neighbors: neighbourhood sizes.
        cold_starts: cold-start methods accepted by ``cold_start``.
    """
    test_results_file = 'test_results_FC.csv'

    real_ratings = test.reset_index().rating
    to_test = test.drop(columns=['rating'])

    for func in funcs:
        func_name = func.__name__
        for sim in similarities:
            for k in k_neighbors:
                raw_results = fc_predict(func, train, to_test, k, sim, 'train.dat')
                for cold in cold_starts:
                    # Always start from the raw predictions: reusing the frame
                    # filled by the previous strategy would leave nothing for
                    # this one to fill and repeat the previous score.
                    results = cold_start(raw_results, train, 'movie_id', 'user_id', cold)
                    results = cold_start(results, train, 'user_id', 'movie_id', cold)

                    qtd_nan = results.rating.isna().sum()
                    print(f"{qtd_nan} nan predictions from {results.shape[0]} total")

                    predicted = results.rating

                    # Pairs that are still unpredicted are left out of the score.
                    used = np.logical_and(~real_ratings.isna(), ~predicted.isna())
                    # Take the root explicitly: the `squared=False` flag of
                    # mean_squared_error is deprecated in recent scikit-learn.
                    rmse = float(np.sqrt(mean_squared_error(real_ratings[used], predicted[used])))

                    msg = f"{func_name} with {k} neighbors, {sim} similarity and {cold} for cold start: RMSE = {rmse}"
                    print(msg)

                    with open(test_results_file, 'a') as f:
                        data = f"{func_name},{k},{sim},{cold},{rmse}\n"
                        f.write(data)

    files.download(test_results_file)

In [None]:
# Grid for the collaborative-filtering experiments.
funcs = [ItemKNN, UserKNN]
similarities = ['cosine', 'hamming', 'jaccard']
k_neighbors = [1, 10, 50, 150, 300, 500]
cold_starts = ['average', 'mode']

fc_run_tests(
    train=train,
    test=eval,
    funcs=funcs,
    similarities=similarities,
    k_neighbors=k_neighbors,
    cold_starts=cold_starts,
)



Starting training on ItemKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 8.673682 sec
prediction_time:: 589.049220 sec
79 nan predictions from 16074 total
ItemKNN with 1 neighbors, cosine similarity and average for cold start: RMSE = 1.1028345304483298
79 nan predictions from 16074 total
ItemKNN with 1 neighbors, cosine similarity and mode for cold start: RMSE = 1.1075168172478238


Starting training on ItemKNN with 10 neighbors and cosine similarity
[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 8.514815 sec
prediction_time:: 209.163162 sec
79 nan predictions from 16074 total
ItemKNN with 10 neighbors, cosine similarity and average for cold start: RMSE = 1.2576163469907358
79 nan predictions from 16074 total
ItemKNN with 10 neigh

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Filtragem Baseada em Conteúdo

In [None]:
def generate_sim_file(movies_data, train, metric):
    """Write pairwise item similarities to ``similarity_data.dat``.

    Similarity is ``1 - pairwise_distances(movies_data, metric=metric)``, so
    it is negative wherever the distance exceeds 1. The file is tab-separated
    with no header and holds ``item_1``, ``item_2``, ``similarity`` for every
    ordered pair in which both items occur in ``train.movie_id``.

    Args:
        movies_data: one feature row per item. For a DataFrame the index
            labels are used as item ids; any other array-like is assumed to
            be ordered by item id starting at 1.
        train: training ratings with a ``movie_id`` column.
        metric: distance metric name passed to ``pairwise_distances``.
    """
    distances = pairwise_distances(movies_data, metric=metric)

    # Label rows and columns with the real item ids. Deriving the id from the
    # row position (position + 1) mislabels every item as soon as the ids are
    # not exactly 1..N, e.g. when some movies have no content data.
    if isinstance(movies_data, pd.DataFrame):
        item_ids = np.asarray(movies_data.index)
    else:
        item_ids = np.arange(1, distances.shape[0] + 1)

    sim_data = 1 - pd.DataFrame(distances, index=item_ids, columns=item_ids)

    # Wide matrix -> long (item_1, item_2, similarity) table.
    sim_data.index.name = "item_1"
    sim_data.reset_index(inplace=True)
    sim_data = sim_data.melt(id_vars=['item_1'], var_name="item_2", value_name="similarity")

    exists_in_train = sim_data.item_1.isin(train.movie_id) & sim_data.item_2.isin(train.movie_id)
    sim_data = sim_data[exists_in_train]

    sim_data.to_csv('similarity_data.dat', index=False, header=False, sep='\t')

In [None]:
def fbc_predict(func, train, test, k_neighbors, similarity, train_file):
    """Run one content-based recommender and join its predictions onto ``test``.

    The recommender reads item similarities from ``similarity_data.dat``;
    ``similarity`` is used only for the log message and the output file name.

    Args:
        func: recommender class; it is instantiated with ``train_file`` and the
            keyword arguments below, then ``.compute()`` is called.
        train: unused; kept for call compatibility.
        test: frame indexed by (``user_id``, ``movie_id``).
        k_neighbors: neighbourhood size forwarded to the recommender
            (``None`` is labelled "default" in the output file name).
        similarity: name of the metric the similarity file was built with.
        train_file: path of the training file handed to the recommender.

    Returns:
        ``test`` with its index reset and a ``rating`` column that is NaN for
        pairs missing from the recommender's output file.
    """
    algorithm = func.__name__

    print(f"\n\nStarting training on {algorithm} with {k_neighbors} neighbors and {similarity} similarity")

    k_label = "default" if k_neighbors is None else k_neighbors
    predictions_path = f'train_outputs/{algorithm}_{similarity}_{k_label}k.dat'

    # No test file is passed to the recommender.
    # NOTE(review): confirm which (user, item) pairs the library scores in that case.
    recommender = func(
        train_file,
        output_file=predictions_path,
        k_neighbors=k_neighbors,
        similarity_file='similarity_data.dat',
        as_similar_first=True,
    )
    recommender.compute()

    predictions = pd.read_csv(
        predictions_path, sep='\t', names=['user_id', 'movie_id', 'rating']
    ).set_index(['user_id', 'movie_id'])

    return test.join(predictions, how='left').reset_index()

In [None]:
def fbc_run_tests(train, test, funcs, k_neighbors, similarities, cold_starts, content_data):
    """Evaluate content-based recommenders over a parameter grid.

    The item features come from the module-level ``movies_data``. For every
    similarity metric the similarity file is regenerated; for every
    neighbourhood size the predictions are computed once; each cold-start
    strategy is then applied to those raw predictions. The RMSE of every
    combination is printed and appended to
    ``test_results_FBC_<content_data>.csv`` as
    ``<recommender>,<k>,<similarity>,<cold_start>,<rmse>``. The file is
    downloaded at the end.

    Args:
        train: training ratings, used for the similarity filter and the
            cold-start fallbacks.
        test: evaluation frame indexed by (``user_id``, ``movie_id``) that
            holds the true ``rating``.
        funcs: recommender classes to evaluate.
        k_neighbors: neighbourhood sizes.
        similarities: distance metric names for ``generate_sim_file``.
        cold_starts: cold-start methods accepted by ``cold_start``.
        content_data: label used in the results file name.
    """
    global movies_data
    test_results_file = f'test_results_FBC_{content_data}.csv'

    real_ratings = test.reset_index().rating
    to_test = test.drop(columns=['rating'])

    for func in funcs:
        func_name = func.__name__
        for sim in similarities:
            generate_sim_file(movies_data, train, sim)

            for k in k_neighbors:
                raw_results = fbc_predict(func, train, to_test, k, sim, 'train.dat')
                for cold in cold_starts:
                    # Always start from the raw predictions: reusing the frame
                    # filled by the previous strategy would leave nothing for
                    # this one to fill and repeat the previous score.
                    results = cold_start(raw_results, train, 'movie_id', 'user_id', cold)
                    results = cold_start(results, train, 'user_id', 'movie_id', cold)

                    qtd_nan = results.rating.isna().sum()
                    print(f"{qtd_nan} nan predictions from {results.shape[0]} total")

                    predicted = results.rating

                    # Pairs that are still unpredicted are left out of the score.
                    used = np.logical_and(~real_ratings.isna(), ~predicted.isna())
                    # Take the root explicitly: the `squared=False` flag of
                    # mean_squared_error is deprecated in recent scikit-learn.
                    rmse = float(np.sqrt(mean_squared_error(real_ratings[used], predicted[used])))

                    msg = f"{func_name} with {k} neighbors, {sim} similarity and {cold} for cold start: RMSE = {rmse}"
                    print(msg)

                    with open(test_results_file, 'a') as f:
                        data = f"{func_name},{k},{sim},{cold},{rmse}\n"
                        f.write(data)

    files.download(test_results_file)

### FBC-kNN utilizando o gênero dos filmes

In [None]:
# Creating attribute dataframe
def generate_genre_movies_data():
    """Build a 0/1 genre indicator table from ``competition/movies_data.csv``.

    The ``genres`` column is split on ``|`` and one integer column is added per
    distinct genre, in order of first appearance.

    Returns:
        Frame indexed by ``movie_id`` with the ``title`` and ``genres`` columns
        removed and one indicator column per genre.
    """
    catalog = pd.read_csv('competition/movies_data.csv').set_index('movie_id')
    catalog.genres = catalog.genres.str.split("|")

    for genre in catalog["genres"].explode().unique():
        # 1 when the movie's genre list contains this genre, else 0.
        has_genre = catalog["genres"].map(lambda tags, wanted=genre: wanted in tags)
        catalog[genre] = has_genre.astype(int)

    return catalog.drop(columns=['title', 'genres'])

In [None]:
# Item features read by fbc_run_tests through the module-level `movies_data`.
movies_data = generate_genre_movies_data()

# Grid for the genre-based experiments.
funcs = [ItemAttributeKNN]
similarities = ['cosine', 'euclidean', 'manhattan']
k_neighbors = [1, 10, 50, 150, 300, 500]
cold_starts = ['average', 'mode']

fbc_run_tests(
    train=train,
    test=eval,
    funcs=funcs,
    k_neighbors=k_neighbors,
    similarities=similarities,
    cold_starts=cold_starts,
    content_data='Genre_Metadata',
)



Starting training on ItemAttributeKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 8.763031 sec
prediction_time:: 41.599294 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 1 neighbors, cosine similarity and average for cold start: RMSE = 1.0036174719143627
79 nan predictions from 16074 total
ItemAttributeKNN with 1 neighbors, cosine similarity and mode for cold start: RMSE = 1.0087603921989294


Starting training on ItemAttributeKNN with 10 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 8.803148 sec
prediction_time:: 53.062351 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 10 neighbors, cosine similarity and average for cold start: RMSE = 1.027110704

  rui = self.bui[user][item_j] + (rui / sim_sum)


prediction_time:: 207.303226 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 300 neighbors, euclidean similarity and average for cold start: RMSE = 1.1802890592822755
79 nan predictions from 16074 total
ItemAttributeKNN with 300 neighbors, euclidean similarity and mode for cold start: RMSE = 1.1846652533095583


Starting training on ItemAttributeKNN with 500 neighbors and euclidean similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 10.751205 sec


  rui = self.bui[user][item_j] + (rui / sim_sum)


prediction_time:: 360.810363 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 500 neighbors, euclidean similarity and average for cold start: RMSE = 1.1895144374948798
79 nan predictions from 16074 total
ItemAttributeKNN with 500 neighbors, euclidean similarity and mode for cold start: RMSE = 1.1938568155127822


Starting training on ItemAttributeKNN with 1 neighbors and manhattan similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 9.254017 sec
prediction_time:: 42.142978 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 1 neighbors, manhattan similarity and average for cold start: RMSE = 1.00395726077312
79 nan predictions from 16074 total
ItemAttributeKNN with 1 neighbors, manhattan similarity and mode for cold start: RMSE = 1.009098449308405


Starting training on ItemAttributeKNN with 10 neighbors and manhattan similarity
[Case Recomm

  rui = self.bui[user][item_j] + (rui / sim_sum)


prediction_time:: 49.870428 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 10 neighbors, manhattan similarity and average for cold start: RMSE = 1.0323979451748848
79 nan predictions from 16074 total
ItemAttributeKNN with 10 neighbors, manhattan similarity and mode for cold start: RMSE = 1.0373981955540919


Starting training on ItemAttributeKNN with 50 neighbors and manhattan similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 9.051651 sec


  rui = self.bui[user][item_j] + (rui / sim_sum)


prediction_time:: 76.872595 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 50 neighbors, manhattan similarity and average for cold start: RMSE = 1.105566005699164
79 nan predictions from 16074 total
ItemAttributeKNN with 50 neighbors, manhattan similarity and mode for cold start: RMSE = 1.1102367728980902


Starting training on ItemAttributeKNN with 150 neighbors and manhattan similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 9.174960 sec


  rui = self.bui[user][item_j] + (rui / sim_sum)


prediction_time:: 145.633842 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 150 neighbors, manhattan similarity and average for cold start: RMSE = 1.1657556692823965
79 nan predictions from 16074 total
ItemAttributeKNN with 150 neighbors, manhattan similarity and mode for cold start: RMSE = 1.170186215694583


Starting training on ItemAttributeKNN with 300 neighbors and manhattan similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 8.875921 sec


  rui = self.bui[user][item_j] + (rui / sim_sum)


prediction_time:: 203.333588 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 300 neighbors, manhattan similarity and average for cold start: RMSE = 1.1866005456336994
79 nan predictions from 16074 total
ItemAttributeKNN with 300 neighbors, manhattan similarity and mode for cold start: RMSE = 1.190953548143568


Starting training on ItemAttributeKNN with 500 neighbors and manhattan similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 10.845693 sec


  rui = self.bui[user][item_j] + (rui / sim_sum)


prediction_time:: 354.622378 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 500 neighbors, manhattan similarity and average for cold start: RMSE = 1.1802875874682393
79 nan predictions from 16074 total
ItemAttributeKNN with 500 neighbors, manhattan similarity and mode for cold start: RMSE = 1.1846637869324603


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### FBC-kNN utilizando reviews dos filmes

In [None]:
def generate_review_movies_data():
    """Build a TF-IDF item-attribute matrix from the movie reviews.

    Reads ``competition/movie_reviews.csv``, strips punctuation, lowercases
    and tokenizes the ``text`` column, merges the tokens of all reviews of
    the same ``movie_id``, stems them (dropping English stopwords) and
    vectorizes the resulting documents with TF-IDF.

    Returns:
        pd.DataFrame: one row per ``movie_id`` (kept as the index) and one
        column per TF-IDF vocabulary term.
    """
    # Resources needed by word_tokenize and stopwords.words.
    nltk.download('punkt')
    nltk.download('stopwords')

    stop = set(stopwords.words('english'))
    stemmer = SnowballStemmer("english")

    vectorizer = TfidfVectorizer()

    # Reading
    movies_data = pd.read_csv('competition/movie_reviews.csv')
    movies_data.set_index('movie_id', inplace=True)

    # Normalizing: drop everything that is not a word character or whitespace.
    # The pattern is a raw string so that \w and \s are regex classes rather
    # than invalid Python string escapes.
    movies_data.text = movies_data.text.str.replace(r'[^\w\s]', '', regex=True).str.lower()

    # Tokenizing
    movies_data.text = movies_data.text.apply(word_tokenize)

    # Summing the per-review token lists concatenates them, giving a single
    # token list per movie.
    # NOTE(review): every non-key column is summed as well - confirm the CSV
    # only carries columns for which that is meaningful.
    movies_data = movies_data.groupby('movie_id').sum()

    # Stemming and removing stopwords; tokens are re-joined into one document
    # string per movie because TfidfVectorizer expects raw text.
    movies_data.text = movies_data.text.apply(lambda x: ' '.join([stemmer.stem(item) for item in x if item not in stop]))
    indexes = movies_data.index

    tfidf = vectorizer.fit_transform(movies_data.text)
    # Densify and restore the movie ids as the row index.
    movies_data = pd.DataFrame(tfidf.toarray())
    movies_data.index = indexes

    return movies_data

In [None]:
# Item-attribute matrix built from the review texts (TF-IDF).
# NOTE(review): movies_data is not passed to fbc_run_tests below, so it is
# presumably consumed as a notebook global - confirm.
movies_data = generate_review_movies_data()

# Grid evaluated on the validation split.
funcs = [ItemAttributeKNN]
cold_starts = ['average', 'mode']
k_neighbors = [1, 10, 50, 150, 300, 500]
similarities = ['cosine', 'euclidean', 'manhattan']
# NOTE(review): the global `eval` shadows the Python builtin of the same name.
fbc_run_tests(train, eval, funcs, k_neighbors, similarities, cold_starts, 'Review_Metadata')



Starting training on ItemAttributeKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 11.368190 sec
prediction_time:: 48.798820 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 1 neighbors, cosine similarity and average for cold start: RMSE = 1.0048112859605716
79 nan predictions from 16074 total
ItemAttributeKNN with 1 neighbors, cosine similarity and mode for cold start: RMSE = 1.0099481270497503


Starting training on ItemAttributeKNN with 10 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 11.224757 sec
prediction_time:: 56.443631 sec
79 nan predictions from 16074 total
ItemAttributeKNN with 10 neighbors, cosine similarity and average for cold start: RMSE = 1.0373434

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Fatoração de Matrizes

### SVD da Álgebra Linear

In [None]:
def svd_predict(func, train, test, factors, train_file, test_file):
    """Train a factorization recommender and attach its predictions to ``test``.

    Args:
        func: recommender class. It is instantiated as
            ``func(train_file, test_file, output_file=..., factors=...)`` and
            executed through ``.compute()``.
        train: not used by this function; kept so existing callers keep working.
        test: DataFrame to score. It is joined on its index against the
            predictions, which are indexed by ``(user_id, movie_id)``.
        factors: number of latent factors handed to ``func``.
        train_file: path of the training interactions file given to ``func``.
        test_file: path of the test interactions file given to ``func``.

    Returns:
        pd.DataFrame: ``test`` left-joined with a ``rating`` column and with
        its index reset; pairs without a prediction get NaN.
    """
    func_name = func.__name__

    print(f"\n\nStarting training on {func_name} with {factors} factors")

    # The recommender writes its predictions here as tab-separated triples.
    train_output = f'train_outputs/{func_name}_{factors}_factors.dat'

    func(train_file, test_file, output_file=train_output, factors=factors).compute()

    preds = pd.read_csv(train_output, sep='\t', names=['user_id', 'movie_id', 'rating'])
    preds = preds.set_index(['user_id', 'movie_id'])

    # Left join keeps every requested pair, even those the model did not score.
    return test.join(preds, how='left').reset_index()

In [None]:
def svd_run_tests(train, test, func, num_factors):
    """Evaluate ``func`` for several factor counts and log the RMSE of each.

    For every value in ``num_factors`` the model is trained through
    ``svd_predict`` (using ``train.dat`` / ``test.dat``), the RMSE against the
    true ratings of ``test`` is printed and appended as a
    ``algorithm,factors,rmse`` line to ``test_results_SVD.csv``. The results
    file is downloaded through Colab when the sweep ends.

    Args:
        train: forwarded to ``svd_predict``.
        test: DataFrame holding the true ``rating`` column.
        func: recommender class to evaluate.
        num_factors: iterable with the numbers of latent factors to try.
    """
    test_results_file = 'test_results_SVD.csv'

    real_ratings = test.reset_index().rating
    to_test = test.drop(columns=['rating'])

    func_name = func.__name__
    for factors in num_factors:

        results = svd_predict(func, train, to_test, factors, 'train.dat', 'test.dat')

        predicted = results.rating

        qtd_nan = predicted.isna().sum()
        print(f"{qtd_nan} nan predictions from {results.shape[0]} total")

        # Score only the pairs that have both a true rating and a prediction.
        used = np.logical_and(~real_ratings.isna(), ~predicted.isna())
        # sqrt(MSE) rather than squared=False: that keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(real_ratings[used], predicted[used]))

        msg = f"{func_name} with {factors} factors: RMSE = {rmse}"
        print(msg)

        # Append mode, so results of earlier runs are kept in the file.
        with open(test_results_file, 'a') as f:
            data = f"{func_name},{factors},{rmse}\n"
            f.write(data)

    files.download(test_results_file)

In [None]:
# Numbers of latent factors evaluated for SVD on the validation split.
num_factors = [1, 2, 5, 10, 20, 30, 40, 50, 75, 100, 125, 150, 200, 250, 500]
svd_run_tests(train, eval, SVD, num_factors)



Starting training on SVD with 1 factors
[Case Recommender: Rating Prediction > SVD]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
test data:: 3200 users and 2540 items (16074 interactions) | sparsity:: 99.80%

training_time:: 0.570385 sec
prediction_time:: 0.069045 sec


Eval:: MAE: 0.913083 RMSE: 1.107041 
0 nan predictions from 16074 total
SVD with 1 factors: RMSE = 1.107040927088219


Starting training on SVD with 2 factors
[Case Recommender: Rating Prediction > SVD]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
test data:: 3200 users and 2540 items (16074 interactions) | sparsity:: 99.80%

training_time:: 1.393595 sec
prediction_time:: 0.046584 sec


Eval:: MAE: 0.913358 RMSE: 1.107462 
0 nan predictions from 16074 total
SVD with 2 factors: RMSE = 1.107461866767029


Starting training on SVD with 5 factors
[Case Recommender: Rating Prediction > SVD]

train data:: 3679 users and 2930 items (37504 interactions) |

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Gradiente Descendente Estocástico

In [None]:
def sgd_predict(func, train, test, factors, epochs, delta, learn_rate, train_file, test_file):
    """Train an SGD matrix-factorization model and attach its predictions to ``test``.

    Args:
        func: recommender class. It is instantiated as
            ``func(train_file, test_file, output_file=..., factors=...,
            epochs=..., learn_rate=..., delta=...)`` and executed through
            ``.compute()``.
        train: not used by this function; kept so existing callers keep working.
        test: DataFrame to score. It is joined on its index against the
            predictions, which are indexed by ``(user_id, movie_id)``.
        factors: number of latent factors.
        epochs: number of training epochs.
        delta: ``delta`` hyper-parameter of the recommender.
        learn_rate: learning rate of the recommender.
        train_file: path of the training interactions file given to ``func``.
        test_file: path of the test interactions file given to ``func``.

    Returns:
        pd.DataFrame: ``test`` left-joined with a ``rating`` column and with
        its index reset; pairs without a prediction get NaN.
    """
    func_name = func.__name__

    print(f"\n\nStarting training on {func_name} with {factors} factors, {epochs} epochs, {delta} delta, {learn_rate} learn_rate")

    # The recommender writes its predictions here as tab-separated triples.
    train_output = f'train_outputs/{func_name}_{factors}_{epochs}epochs_{delta}delta_{learn_rate}learnrate.dat'

    # Forward every hyper-parameter: previously only `factors` reached the
    # model, so all grid points sharing a factor count trained the same
    # default configuration.
    func(
        train_file,
        test_file,
        output_file=train_output,
        factors=factors,
        epochs=epochs,
        learn_rate=learn_rate,
        delta=delta,
    ).compute()

    preds = pd.read_csv(train_output, sep='\t', names=['user_id', 'movie_id', 'rating'])
    preds = preds.set_index(['user_id', 'movie_id'])

    # Left join keeps every requested pair, even those the model did not score.
    return test.join(preds, how='left').reset_index()

In [None]:
def sgd_run_tests(train, test, func, num_factors, num_epochs, deltas, learn_rates):
    """Grid-search ``func`` over factors, epochs, delta and learning rate.

    Every combination is trained through ``sgd_predict`` (using ``train.dat``
    / ``test.dat``); its RMSE against the true ratings of ``test`` is printed
    and appended as a ``algorithm,factors,epochs,delta,learn_rate,rmse`` line
    to ``test_results_SGD.csv``. The results file is downloaded through Colab
    when the sweep ends.

    Args:
        train: forwarded to ``sgd_predict``.
        test: DataFrame holding the true ``rating`` column.
        func: recommender class to evaluate.
        num_factors: iterable with the numbers of latent factors to try.
        num_epochs: iterable with the epoch counts to try.
        deltas: iterable with the ``delta`` values to try.
        learn_rates: iterable with the learning rates to try.
    """
    test_results_file = 'test_results_SGD.csv'

    real_ratings = test.reset_index().rating
    to_test = test.drop(columns=['rating'])

    func_name = func.__name__
    for factors in num_factors:
        for epochs in num_epochs:
            for delta in deltas:
                for learn_rate in learn_rates:

                    results = sgd_predict(func, train, to_test, factors, epochs, delta, learn_rate, 'train.dat', 'test.dat')

                    predicted = results.rating

                    qtd_nan = predicted.isna().sum()
                    print(f"{qtd_nan} nan predictions from {results.shape[0]} total")

                    # Score only the pairs that have both a true rating and a prediction.
                    used = np.logical_and(~real_ratings.isna(), ~predicted.isna())
                    # sqrt(MSE) rather than squared=False: that keyword was
                    # deprecated in scikit-learn 1.4 and removed in 1.6.
                    rmse = np.sqrt(mean_squared_error(real_ratings[used], predicted[used]))

                    msg = f"{func_name} with {factors} factors: RMSE = {rmse}"
                    print(msg)

                    # Append mode, so results of earlier runs are kept in the file.
                    with open(test_results_file, 'a') as f:
                        data = f"{func_name},{factors},{epochs},{delta},{learn_rate},{rmse}\n"
                        f.write(data)

    files.download(test_results_file)

In [None]:
# Hyper-parameter grid for the SGD matrix factorization; sgd_run_tests trains
# one model per combination (11 x 7 x 6 x 5 = 2310 runs).
deltas = [.005, .01, .05, .1, .2, .5]
num_epochs = [5, 10, 15, 20, 30, 50, 100] 
learn_rates = [.001, .005, .01, .03, .05]
num_factors = [1, 2, 5, 10, 30, 50, 100, 150, 200, 300, 500]

sgd_run_tests(train, eval, MatrixFactorization, num_factors, num_epochs, deltas, learn_rates)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Eval:: MAE: 0.909516 RMSE: 1.179692 
0 nan predictions from 16074 total
MatrixFactorization with 300 factors: RMSE = 1.179692251214156


Starting training on MatrixFactorization with 300 factors, 15 epochs, 0.5 delta, 0.01 learn_rate
[Case Recommender: Rating Prediction > Matrix Factorization]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
test data:: 3200 users and 2540 items (16074 interactions) | sparsity:: 99.80%

training_time:: 22.162014 sec
prediction_time:: 0.071169 sec


Eval:: MAE: 0.909658 RMSE: 1.179594 
0 nan predictions from 16074 total
MatrixFactorization with 300 factors: RMSE = 1.1795940428070122


Starting training on MatrixFactorization with 300 factors, 15 epochs, 0.5 delta, 0.03 learn_rate
[Case Recommender: Rating Prediction > Matrix Factorization]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
test data:: 3200 users and 2540 items

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Hibridização

In [None]:
def hybrid_predict(fbc_reviews, fbc_genre, w_reviews, w_genre):
    """Blend two prediction frames into one weighted rating.

    Args:
        fbc_reviews: DataFrame with the review-based ``rating`` predictions.
        fbc_genre: DataFrame with the genre-based ``rating`` predictions.
        w_reviews: weight applied to ``fbc_reviews.rating``.
        w_genre: weight applied to ``fbc_genre.rating``.

    Returns:
        pd.DataFrame: a copy of ``fbc_reviews`` whose ``rating`` column holds
        the weighted sum; the inputs are left untouched.
    """
    weighted_reviews = fbc_reviews.rating * w_reviews
    weighted_genre = fbc_genre.rating * w_genre
    # assign() works on a copy, so the caller's frame is not modified.
    return fbc_reviews.assign(rating=weighted_reviews + weighted_genre)

In [None]:
def run_hybrid_tests(train, test, weights):
    """Evaluate weighted blends of the review-based and genre-based FBC models.

    Both ItemAttributeKNN models (1 neighbor, cosine similarity, ``average``
    cold start) are trained once; every ``(w_reviews, w_genre)`` pair in
    ``weights`` is then blended with ``hybrid_predict``, its RMSE against the
    true ratings of ``test`` is printed and appended as a
    ``Hybrid-FBC,w_reviews,w_genre,rmse`` line to
    ``test_results_Hybrid-FBC.csv``. The results file is downloaded through
    Colab when the sweep ends.

    Args:
        train: training interactions used for the similarity files, the
            predictions and the cold-start fill.
        test: DataFrame holding the true ``rating`` column.
        weights: iterable of ``(w_reviews, w_genre)`` pairs.
    """
    test_results_file = 'test_results_Hybrid-FBC.csv'

    # Use the `test` argument: the previous version read the notebook global
    # `eval` here and silently ignored the frame passed by the caller.
    real_ratings = test.reset_index().rating
    to_test = test.drop(columns=['rating'])

    # Generate FBC by genre results
    movies_data = generate_genre_movies_data()
    generate_sim_file(movies_data, train, 'cosine')

    results_genre = fbc_predict(ItemAttributeKNN, train, to_test, 1, 'cosine', 'train.dat')
    results_genre = cold_start(results_genre, train, 'movie_id', 'user_id', 'average')
    results_genre = cold_start(results_genre, train, 'user_id', 'movie_id', 'average')

    results_genre = results_genre[['index', 'rating']]

    # Generate FBC by user reviews
    movies_data = generate_review_movies_data()
    generate_sim_file(movies_data, train, 'cosine')

    results_review = fbc_predict(ItemAttributeKNN, train, to_test, 1, 'cosine', 'train.dat')
    results_review = cold_start(results_review, train, 'movie_id', 'user_id', 'average')
    results_review = cold_start(results_review, train, 'user_id', 'movie_id', 'average')

    results_review = results_review[['index', 'rating']]

    for wa, wb in weights:

        res = hybrid_predict(results_review, results_genre, wa, wb)

        predicted = res.rating

        qtd_nan = predicted.isna().sum()
        print(f"{qtd_nan} nan predictions from {res.shape[0]} total")

        # Score only the pairs that have both a true rating and a prediction.
        used = np.logical_and(~real_ratings.isna(), ~predicted.isna())
        # sqrt(MSE) rather than squared=False: that keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(real_ratings[used], predicted[used]))

        msg = f"Hybrid-FBC with {wa} weight for reviews metadata prediction and {wb} for genres metadata prediction: RMSE = {rmse}"
        print(msg)

        # Append mode, so results of earlier runs are kept in the file.
        with open(test_results_file, 'a') as f:
            data = f"Hybrid-FBC,{wa},{wb},{rmse}\n"
            f.write(data)

    files.download(test_results_file)

In [None]:
# (reviews weight, genre weight) pairs in steps of 0.05; each pair sums to 1.
weights = [(0.05, 0.95), (0.1, 0.9), (0.15, 0.85), (0.2, 0.8), (0.25, 0.75),
           (0.3, 0.7), (0.35, 0.65), (0.4, 0.6), (0.45, 0.55), (0.5, 0.5),
           (0.55, 0.45), (0.6, 0.4), (0.65, 0.35), (0.7, 0.3), (0.75, 0.25),
           (0.8, 0.2), (0.85, 0.15), (0.9, 0.1), (0.95, 0.05)]

run_hybrid_tests(train, eval, weights)



Starting training on ItemAttributeKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 19.978293 sec
prediction_time:: 47.370509 sec


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.




Starting training on ItemAttributeKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3679 users and 2930 items (37504 interactions) | sparsity:: 99.65%
training_time:: 9.080787 sec
prediction_time:: 50.270591 sec
4 nan predictions from 16074 total
Hybrid-FBC with 0.05 weight for metadata prediction and 0.95 for genres metadata prediction: RMSE = 0.9945710835490648
4 nan predictions from 16074 total
Hybrid-FBC with 0.1 weight for metadata prediction and 0.9 for genres metadata prediction: RMSE = 0.9942058996863784
4 nan predictions from 16074 total
Hybrid-FBC with 0.15 weight for metadata prediction and 0.85 for genres metadata prediction: RMSE = 0.9938906294908005
4 nan predictions from 16074 total
Hybrid-FBC with 0.2 weight for metadata prediction and 0.8 for genres metadata prediction: RMSE = 0.9936253204740793
4 nan predictions from 16074 total
Hybrid-FBC with 0.25 weight for metadata prediction and 0.75 fo

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Algorithms Evaluation

In [None]:
def read_result_file(exp, columns):
    """Load the logged validation results of one experiment from Drive.

    Args:
        exp: experiment tag; the file read is ``test_results_<exp>.csv``.
        columns: column names to apply, since the CSV is read with ``names=``.

    Returns:
        pd.DataFrame: the parsed results.
    """
    results_dir = '/content/drive/MyDrive/Sistemas de Recomendação/Trabalho Individual/Resultados de testes'
    source_csv = f"{results_dir}/test_results_{exp}.csv"
    frame = pd.read_csv(source_csv, names=columns)
    return frame

def write_evaluation_file(results, filename):
    """Save a submission frame to the evaluation folder on Drive.

    Args:
        results: object exposing ``to_csv`` (a DataFrame in this notebook).
        filename: base name of the output file; ``.dat`` is appended.

    Returns:
        str: full path of the file that was written.
    """
    results_dir = '/content/drive/MyDrive/Sistemas de Recomendação/Trabalho Individual/Resultados de avaliações'
    destination = f"{results_dir}/{filename}.dat"

    # The row index is not written to the file.
    results.to_csv(destination, index=False)
    return destination

## Baseline

In [None]:
def predict_and_submit(train, test, u_const, i_const):
    """Score ``test`` with the baseline predictor and submit the result to Kaggle.

    The training ratings are centered on their global mean before being
    handed to ``baseline_predict``; the ``id``/``rating`` columns of the
    result are written with ``write_evaluation_file`` and that file is
    submitted to the ``scc5966`` competition.

    NOTE(review): other cells of this notebook redefine ``predict_and_submit``
    with different signatures; re-run this cell before the baseline loop.
    """
    base_train = train.copy()
    
    # Work on a copy so the caller's ratings are not shifted.
    global_bias = train.rating.mean()
    base_train.rating = base_train.rating - global_bias
    
    results = baseline_predict(base_train, test, global_bias, u_const, i_const)
    results = results[['id', 'rating']]

    filename = f"baseline_algorithm_{u_const}_user_const_{i_const}_item_const"
    message = f"Testing baseline algorithm with {u_const} user const and {i_const} item const"


    results_file = write_evaluation_file(results, filename)

    print(f'Submiting {filename}...')
    # IPython shell escape: $results_file and $message are expanded from the
    # local variables of the same name.
    !kaggle competitions submit -c scc5966 -f "$results_file" -m "$message"

In [None]:
# Validation results logged for the baseline runs (column names are supplied
# explicitly when the CSV is read).
cols = ['algorithm', 'u_const', 'i_const', 'rmse']
res_base = read_result_file('Baseline', cols)

In [None]:
# Submit the five baseline configurations with the lowest validation RMSE,
# predicting df_test from df_train.
top_5 = res_base.sort_values('rmse').head(5) # TOP 5
for index, row in top_5.iterrows():
    u_const = row['u_const']
    i_const = row['i_const']

    predict_and_submit(df_train, df_test, u_const, i_const)



Starting training on baseline with 10 user bias and 5 item bias
Submiting baseline_algorithm_10_user_const_5_item_const...


  sort=sort,


  0% 0.00/89.4k [00:00<?, ?B/s]100% 89.4k/89.4k [00:00<00:00, 375kB/s]
Successfully submitted to SCC5966

Starting training on baseline with 10 user bias and 2 item bias
Submiting baseline_algorithm_10_user_const_2_item_const...
100% 89.4k/89.4k [00:00<00:00, 419kB/s]
Successfully submitted to SCC5966

Starting training on baseline with 15 user bias and 5 item bias
Submiting baseline_algorithm_15_user_const_5_item_const...
100% 89.5k/89.5k [00:00<00:00, 474kB/s]
Successfully submitted to SCC5966

Starting training on baseline with 15 user bias and 2 item bias
Submiting baseline_algorithm_15_user_const_2_item_const...
100% 89.5k/89.5k [00:00<00:00, 458kB/s]
Successfully submitted to SCC5966

Starting training on baseline with 20 user bias and 2 item bias
Submiting baseline_algorithm_20_user_const_2_item_const...
100% 89.5k/89.5k [00:00<00:00, 423kB/s]
Successfully submitted to SCC5966

In [None]:
top_5

Unnamed: 0,algorithm,u_const,i_const,rmse
32,baseline,10,5,0.972644
31,baseline,10,2,0.973425
42,baseline,15,5,0.973482
41,baseline,15,2,0.973803
51,baseline,20,2,0.975236


## FC

In [None]:
# Validation results logged for the collaborative-filtering (FC) runs.
cols = ['algorithm', 'k_neighbors', 'similarity', 'cold_start', 'rmse']
res_fc = read_result_file('FC', cols)

# Split by algorithm so each one gets its own top-5 ranking.
res_fc_item = res_fc[res_fc.algorithm == 'ItemKNN']
res_fc_user = res_fc[res_fc.algorithm == 'UserKNN']

In [None]:
def predict_and_submit(func, train, test, k_neighbors, similarity, cold):
    """Score ``test`` with a collaborative-filtering kNN model and submit to Kaggle.

    Predictions come from ``fc_predict`` (using ``full_train.dat``) and then go
    through ``cold_start`` twice with the ``cold`` strategy, once per key
    order. The ``id``/``rating`` columns are written with
    ``write_evaluation_file`` and submitted to the ``scc5966`` competition.

    NOTE(review): other cells of this notebook redefine ``predict_and_submit``
    with different signatures; re-run this cell before the FC loops.
    """
    results = fc_predict(func, train, test, k_neighbors, similarity, 'full_train.dat')

    # presumably fills predictions the model could not produce - confirm
    # against cold_start.
    results = cold_start(results, train, 'movie_id', 'user_id', cold)
    results = cold_start(results, train, 'user_id', 'movie_id', cold)

    results = results[['id', 'rating']]

    filename = f"{func.__name__}_{k_neighbors}_neighbors_{similarity}_sim_{cold}_coldstart"
    message = f"Testing {func.__name__} with {k_neighbors} neighbors, {similarity} similarity and {cold} for cold start"

    results_file = write_evaluation_file(results, filename)

    print(f'Submiting {filename}...')
    # IPython shell escape: $results_file and $message are expanded from the
    # local variables of the same name.
    !kaggle competitions submit -c scc5966 -f "$results_file" -m "$message"

### Item kNN

In [None]:
# Submit the five ItemKNN configurations with the lowest validation RMSE.
top_5 = res_fc_item.sort_values('rmse').head(5) # TOP 5
for index, row in top_5.iterrows():
    knn = row['k_neighbors']
    sim = row['similarity']
    cold = row['cold_start']

    predict_and_submit(ItemKNN, df_train, df_test, knn, sim, cold)



Starting training on ItemKNN with 1 neighbors and hamming similarity
[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 27.523827 sec
prediction_time:: 168.605538 sec
Submiting ItemKNN_1_neighbors_hamming_sim_average_coldstart...
100% 53.6k/53.6k [00:00<00:00, 283kB/s]
Successfully submitted to SCC5966

Starting training on ItemKNN with 1 neighbors and hamming similarity
[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 27.553903 sec
prediction_time:: 168.275801 sec
Submiting ItemKNN_1_neighbors_hamming_sim_mode_coldstart...
100% 52.1k/52.1k [00:00<00:00, 255kB/s]
Successfully submitted to SCC5966

Starting training on ItemKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 3952 users and 3562 items (535784 interac

In [None]:
top_5

Unnamed: 0,algorithm,k_neighbors,similarity,cold_start,rmse
12,ItemKNN,1,hamming,average,1.017807
13,ItemKNN,1,hamming,mode,1.022879
0,ItemKNN,1,cosine,average,1.102835
1,ItemKNN,1,cosine,mode,1.107517
14,ItemKNN,10,hamming,average,1.166886


### User kNN

In [None]:
# Submit the five UserKNN configurations with the lowest validation RMSE.
top_5 = res_fc_user.sort_values('rmse').head(5) # TOP 5
for index, row in top_5.iterrows():
    knn = row['k_neighbors']
    sim = row['similarity']
    cold = row['cold_start']

    predict_and_submit(UserKNN, df_train, df_test, knn, sim, cold)



Starting training on UserKNN with 500 neighbors and hamming similarity
[Case Recommender: Rating Prediction > UserKNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 24.709789 sec
prediction_time:: 437.274205 sec
Submiting UserKNN_500_neighbors_hamming_sim_average_coldstart...
100% 53.1k/53.1k [00:00<00:00, 267kB/s]
Successfully submitted to SCC5966

Starting training on UserKNN with 300 neighbors and hamming similarity
[Case Recommender: Rating Prediction > UserKNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 24.300260 sec
prediction_time:: 293.674478 sec
Submiting UserKNN_300_neighbors_hamming_sim_average_coldstart...
100% 52.7k/52.7k [00:00<00:00, 282kB/s]
Successfully submitted to SCC5966

Starting training on UserKNN with 150 neighbors and hamming similarity
[Case Recommender: Rating Prediction > UserKNN Algorithm]

train data:: 3952 users and 3562 items (

In [None]:
top_5

Unnamed: 0,algorithm,k_neighbors,similarity,cold_start,rmse
58,UserKNN,500,hamming,average,0.997829
56,UserKNN,300,hamming,average,0.997829
54,UserKNN,150,hamming,average,0.997829
52,UserKNN,50,hamming,average,0.998431
59,UserKNN,500,hamming,mode,1.003002


## FBC

In [None]:
def predict_and_submit(func, train, test, movies_data, k_neighbors, similarity, cold, metadata):
    """Score ``test`` with a content-based kNN model and submit to Kaggle.

    ``generate_sim_file`` is called with ``movies_data``, ``train`` and
    ``similarity`` before ``fbc_predict`` runs on ``full_train.dat``. The
    predictions then go through ``cold_start`` twice with the ``cold``
    strategy, and the ``id``/``rating`` columns are written with
    ``write_evaluation_file`` and submitted to the ``scc5966`` competition.
    ``metadata`` only labels the output file name and the submission message.

    NOTE(review): other cells of this notebook redefine ``predict_and_submit``
    with different signatures; re-run this cell before the FBC loops.
    """

    generate_sim_file(movies_data, train, similarity)
    results = fbc_predict(func, train, test, k_neighbors, similarity, 'full_train.dat')

    # presumably fills predictions the model could not produce - confirm
    # against cold_start.
    results = cold_start(results, train, 'movie_id', 'user_id', cold)
    results = cold_start(results, train, 'user_id', 'movie_id', cold)

    results = results[['id', 'rating']]

    filename = f"{func.__name__}_{k_neighbors}_neighbors_{similarity}_sim_{cold}_coldstart_{metadata}_metadata"
    message = f"Testing {func.__name__} with {k_neighbors} neighbors, {similarity} similarity and {cold} for cold start using {metadata} metadata"

    results_file = write_evaluation_file(results, filename)

    print(f'Submiting {filename}...')
    # IPython shell escape: $results_file and $message are expanded from the
    # local variables of the same name.
    !kaggle competitions submit -c scc5966 -f "$results_file" -m "$message"

### Genre Data

In [None]:
# Validation results of the genre-based FBC runs, plus the genre attribute
# matrix passed to predict_and_submit by the loop that follows.
iaknn = read_result_file('FBC_Genre_Metadata', ['algorithm', 'k_neighbors', 'similarity', 'cold_start', 'rmse'])
movies_data = generate_genre_movies_data()

In [None]:
# Submit the five genre-based configurations with the lowest validation RMSE.
top_5 = iaknn.sort_values('rmse').head(5) # TOP 5

for index, row in top_5.iterrows():
    knn = row['k_neighbors']
    sim = row['similarity']
    cold = row['cold_start']

    predict_and_submit(ItemAttributeKNN, df_train, df_test, movies_data, knn, sim, cold, 'movies_genre')



Starting training on ItemAttributeKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 27.722649 sec
prediction_time:: 172.321584 sec
Submiting ItemAttributeKNN_1_neighbors_cosine_sim_average_coldstart...
100% 53.5k/53.5k [00:00<00:00, 266kB/s]
Successfully submitted to SCC5966

Starting training on ItemAttributeKNN with 1 neighbors and euclidean similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 25.894890 sec
prediction_time:: 161.316154 sec
Submiting ItemAttributeKNN_1_neighbors_euclidean_sim_average_coldstart...
100% 53.5k/53.5k [00:00<00:00, 281kB/s]
Successfully submitted to SCC5966

Starting training on ItemAttributeKNN with 1 neighbors and manhattan similarity
[Case Recommender: Rating Prediction >

In [None]:
top_5

Unnamed: 0,algorithm,k_neighbors,similarity,cold_start,rmse
0,ItemAttributeKNN,1,cosine,average,1.003617
12,ItemAttributeKNN,1,euclidean,average,1.003957
24,ItemAttributeKNN,1,manhattan,average,1.003957
1,ItemAttributeKNN,1,cosine,mode,1.00876
13,ItemAttributeKNN,1,euclidean,mode,1.009098


### Reviews Data

In [None]:
# Validation results of the review-based FBC runs, plus the TF-IDF attribute
# matrix passed to predict_and_submit by the loop that follows.
cols = ['algorithm', 'k_neighbors', 'similarity', 'cold_start', 'rmse']
iaknn = read_result_file('FBC_Review_Metadata', cols)
movies_data = generate_review_movies_data()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Submit the five review-based configurations with the lowest validation RMSE.
top_5 = iaknn.sort_values('rmse').head(5) # TOP 5

for index, row in top_5.iterrows():
    knn = row['k_neighbors']
    sim = row['similarity']
    cold = row['cold_start']

    predict_and_submit(ItemAttributeKNN, df_train, df_test, movies_data, knn, sim, cold, 'review_metadata')



Starting training on ItemAttributeKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 23.539009 sec
prediction_time:: 167.829561 sec
Submiting ItemAttributeKNN_1_neighbors_cosine_sim_average_coldstart_review_metadata_metadata...
100% 53.5k/53.5k [00:00<00:00, 282kB/s]
Successfully submitted to SCC5966

Starting training on ItemAttributeKNN with 1 neighbors and manhattan similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 23.137151 sec
prediction_time:: 165.228969 sec
Submiting ItemAttributeKNN_1_neighbors_manhattan_sim_average_coldstart_review_metadata_metadata...
100% 53.3k/53.3k [00:00<00:00, 248kB/s]
Successfully submitted to SCC5966

Starting training on ItemAttributeKNN with 1 neighbors and euclidean

In [None]:
top_5

Unnamed: 0,algorithm,k_neighbors,similarity,cold_start,rmse
0,ItemAttributeKNN,1,cosine,average,1.004811
24,ItemAttributeKNN,1,manhattan,average,1.005569
12,ItemAttributeKNN,1,euclidean,average,1.005793
1,ItemAttributeKNN,1,cosine,mode,1.009948
25,ItemAttributeKNN,1,manhattan,mode,1.010702


## Fatoração de Matrizes

### SGD

In [None]:
def predict_and_submit(func, train, test, factors, epochs, delta, learn_rate):
    """Score ``test`` with the SGD matrix factorization and submit to Kaggle.

    Predictions come from ``sgd_predict`` using ``full_train.dat`` and
    ``full_test.dat``; the ``id``/``rating`` columns are written with
    ``write_evaluation_file`` and submitted to the ``scc5966`` competition.

    NOTE(review): other cells of this notebook redefine ``predict_and_submit``
    with different signatures; re-run this cell before the SGD loop.
    """
    results = sgd_predict(func, train, test, factors, epochs, delta, learn_rate, 'full_train.dat', 'full_test.dat')

    results = results[['id', 'rating']]

    filename = f"SGD_{factors}_factors_{epochs}_epochs_{delta}_delta_{learn_rate}_learn_rate"
    message = f"Testing Stochastic Gradient Descent algorithm with {factors} factors, {epochs} epochs, {delta} delta and {learn_rate} learn rate"

    results_file = write_evaluation_file(results, filename)

    print(f'Submiting {filename}...')
    # IPython shell escape: $results_file and $message are expanded from the
    # local variables of the same name.
    !kaggle competitions submit -c scc5966 -f "$results_file" -m "$message"

In [None]:
# Validation results logged for the SGD grid search.
cols = ['algorithm', 'factors', 'epochs', 'delta', 'learn_rate', 'rmse']
sgd = read_result_file('SGD', cols)

In [None]:
# Submit the five SGD configurations with the lowest validation RMSE.
top_5 = sgd.sort_values('rmse').head(5) # TOP 5

for index, row in top_5.iterrows():
    factors = row['factors']
    epochs = row['epochs']
    delta = row['delta']
    lr = row['learn_rate']

    predict_and_submit(MatrixFactorization, df_train, df_test, factors, epochs, delta, lr)



Starting training on MatrixFactorization with 10 factors, 50 epochs, 0.01 delta, 0.01 learn_rate
[Case Recommender: Rating Prediction > Matrix Factorization]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
test data:: 418 users and 1611 items (3970 interactions) | sparsity:: 99.41%

training_time:: 122.140172 sec
prediction_time:: 0.015579 sec


Eval:: MAE: 1980.855348 RMSE: 2288.488537 
Submiting SGD_10_factors_50_epochs_0.01_delta_0.01_learn_rate...
100% 52.6k/52.6k [00:00<00:00, 55.4kB/s]
Successfully submitted to SCC5966

Starting training on MatrixFactorization with 10 factors, 15 epochs, 0.5 delta, 0.03 learn_rate
[Case Recommender: Rating Prediction > Matrix Factorization]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
test data:: 418 users and 1611 items (3970 interactions) | sparsity:: 99.41%

training_time:: 126.739363 sec
prediction_time:: 0.015574 sec


Eval:: MAE: 1980.860048 RMSE: 2288.493245 
Submitin

In [None]:
top_5

Unnamed: 0,algorithm,factors,epochs,delta,learn_rate,rmse
787,MatrixFactorization,10,50,0.01,0.01,1.067373
718,MatrixFactorization,10,15,0.5,0.03,1.067717
708,MatrixFactorization,10,15,0.1,0.03,1.067731
744,MatrixFactorization,10,20,0.2,0.05,1.067954
733,MatrixFactorization,10,20,0.05,0.03,1.06804


### SVD

In [None]:
def predict_and_submit(func, train, test, factors):
    """Score ``test`` with the SVD recommender and submit to Kaggle.

    Predictions come from ``svd_predict`` using ``full_train.dat`` and
    ``full_test.dat``; the ``id``/``rating`` columns are written with
    ``write_evaluation_file`` and submitted to the ``scc5966`` competition.

    NOTE(review): other cells of this notebook redefine ``predict_and_submit``
    with different signatures; re-run this cell before the SVD loop.
    """
    results = svd_predict(func, train, test, factors, 'full_train.dat', 'full_test.dat')
    results = results[['id', 'rating']]

    filename = f"SVD_{factors}_factors"
    message = f"Testing SVD algorithm with {factors} factors"

    results_file = write_evaluation_file(results, filename)

    print(f'Submiting {filename}...')
    # IPython shell escape: $results_file and $message are expanded from the
    # local variables of the same name.
    !kaggle competitions submit -c scc5966 -f "$results_file" -m "$message"

In [None]:
# Validation results logged for the SVD runs.
cols = ['algorithm', 'factors', 'rmse']
svd = read_result_file('SVD', cols)

In [None]:
# Submit the five SVD configurations with the lowest validation RMSE.
top_5 = svd.sort_values('rmse').head(5) # TOP 5

for index, row in top_5.iterrows():
    factors = row['factors']
    predict_and_submit(SVD, df_train, df_test, factors)



Starting training on SVD with 1 factors
[Case Recommender: Rating Prediction > SVD]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
test data:: 418 users and 1611 items (3970 interactions) | sparsity:: 99.41%

training_time:: 0.630391 sec
prediction_time:: 0.011410 sec


Eval:: MAE: 1980.605908 RMSE: 2288.269344 
Submiting SVD_1_factors...
100% 52.1k/52.1k [00:01<00:00, 48.5kB/s]
Successfully submitted to SCC5966

Starting training on SVD with 2 factors
[Case Recommender: Rating Prediction > SVD]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
test data:: 418 users and 1611 items (3970 interactions) | sparsity:: 99.41%

training_time:: 0.948915 sec
prediction_time:: 0.012030 sec


Eval:: MAE: 1980.572213 RMSE: 2288.23745 
Submiting SVD_2_factors...
100% 52.0k/52.0k [00:00<00:00, 54.7kB/s]
Successfully submitted to SCC5966

Starting training on SVD with 5 factors
[Case Recommender: Rating Prediction > SVD]

train data

In [None]:
top_5

Unnamed: 0,algorithm,factors,rmse
0,SVD,1,1.107041
1,SVD,2,1.107462
2,SVD,5,1.107909
3,SVD,10,1.111466
4,SVD,20,1.116066


## Hibridização FBC

In [None]:
def predict_and_submit(results_reviews, results_genre, weight_review, weight_genre):

    results = hybrid_predict(results_reviews, results_genre, weight_review, weight_genre)
    results = results[['id', 'rating']]

    filename = f"Hybrid-FBC_{weight_review}_weight_review_metadata_{weight_genre}_weight_genre_metadata"
    message = f"Testing Hybrid-FBC with {weight_review} for FBC reviews metadata based ratings and {weight_genre} for FBC genre metadata based ratings"

    results_file = write_evaluation_file(results, filename)

    print(f'Submiting {filename}...')
    !kaggle competitions submit -c scc5966 -f "$results_file" -m "$message"

    return

In [None]:
# Read the stored Hybrid-FBC results through the notebook's read_result_file helper.
# `cols` is the column list handed to that helper (its exact handling of the
# list is defined elsewhere in the notebook).
cols = ['algorithm', 'weight_reviews', 'weight_genres', 'rmse']
hyb = read_result_file('Hybrid-FBC', cols)

In [None]:
def _fbc_cosine_ratings(movies_data):
    """Run the FBC pipeline for one item-metadata set and return id/rating rows.

    Steps, in order: write the cosine similarity file, predict with
    ItemAttributeKNN (1 neighbor), apply the 'average' cold-start fill for
    movies and then for users, and keep only the ``id`` and ``rating`` columns.
    """
    # NOTE(review): the similarity file is generated from `train`, while the
    # prediction and cold-start steps use `df_train` — confirm this is intended.
    generate_sim_file(movies_data, train, 'cosine')

    ratings = fbc_predict(ItemAttributeKNN, df_train, df_test, 1, 'cosine', 'full_train.dat')
    # Same cold-start order as before: movie-keyed pass first, then user-keyed.
    for first_col, second_col in (('movie_id', 'user_id'), ('user_id', 'movie_id')):
        ratings = cold_start(ratings, df_train, first_col, second_col, 'average')

    return ratings[['id', 'rating']]


# FBC ratings driven by genre metadata.
movies_data = generate_genre_movies_data()
results_genre = _fbc_cosine_ratings(movies_data)

# FBC ratings driven by user-review metadata. The genre run must finish first:
# each run generates its own similarity file before predicting.
movies_data = generate_review_movies_data()
results_review = _fbc_cosine_ratings(movies_data)



Starting training on ItemAttributeKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 38.165143 sec
prediction_time:: 185.773852 sec


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




Starting training on ItemAttributeKNN with 1 neighbors and cosine similarity
[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 3952 users and 3562 items (535784 interactions) | sparsity:: 96.19%
training_time:: 33.535511 sec
prediction_time:: 185.517024 sec


In [None]:
# Keep the five Hybrid-FBC rows with the lowest RMSE.
top_5 = hyb.sort_values(by='rmse').head(5)

# Submit one Kaggle entry per selected (review weight, genre weight) pair,
# best RMSE first.
for wr, wg in zip(top_5['weight_reviews'], top_5['weight_genres']):
    predict_and_submit(results_review, results_genre, wr, wg)

Submiting Hybrid-FBC_0.45_weight_review_metadata_0.55_weight_genre_metadata...
100% 66.7k/66.7k [00:03<00:00, 22.7kB/s]
Successfully submitted to SCC5966Submiting Hybrid-FBC_0.4_weight_review_metadata_0.6_weight_genre_metadata...
100% 57.7k/57.7k [00:02<00:00, 22.1kB/s]
Successfully submitted to SCC5966Submiting Hybrid-FBC_0.5_weight_review_metadata_0.5_weight_genre_metadata...
100% 55.0k/55.0k [00:03<00:00, 17.1kB/s]
Successfully submitted to SCC5966Submiting Hybrid-FBC_0.35_weight_review_metadata_0.65_weight_genre_metadata...
100% 59.8k/59.8k [00:01<00:00, 32.9kB/s]
Successfully submitted to SCC5966Submiting Hybrid-FBC_0.55_weight_review_metadata_0.45_weight_genre_metadata...
100% 66.5k/66.5k [00:02<00:00, 31.5kB/s]
Successfully submitted to SCC5966

In [None]:
# Display the five lowest-RMSE Hybrid-FBC rows held in top_5.
top_5

Unnamed: 0,algorithm,weight_reviews,weight_genres,rmse
8,Hybrid-FBC,0.45,0.55,0.993049
7,Hybrid-FBC,0.4,0.6,0.993064
9,Hybrid-FBC,0.5,0.5,0.993084
6,Hybrid-FBC,0.35,0.65,0.99313
10,Hybrid-FBC,0.55,0.45,0.993169
