# Aquisição dos conjuntos de dados

In [None]:
# Download do pacote do Kaggle
! pip install -q kaggle

In [None]:
# In your Kaggle account, generate an API key: this gives you a
# "kaggle.json" file. Upload it through the widget shown by this cell.
from google.colab import files

# Bind the result to a name instead of leaving the call as the last expression
# of the cell: the returned value would otherwise be echoed as the cell output,
# storing the contents of the uploaded key file inside the saved notebook.
uploaded = files.upload()

# Fail here, with a clear message, rather than in the later `cp kaggle.json` step
if "kaggle.json" not in uploaded:
    raise FileNotFoundError('Expected an uploaded file named "kaggle.json"')

In [None]:
# Processamento da licença
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the two Kaggle datasets used by this notebook as zip archives
# into the current working directory: the Steam reviews dataset and the
# Steam store games dataset.
! kaggle datasets download 'forgemaster/steam-reviews-dataset'
! kaggle datasets download 'nikdavis/steam-store-games'

Downloading steam-reviews-dataset.zip to /content
100% 1.93G/1.93G [01:05<00:00, 26.6MB/s]
100% 1.93G/1.93G [01:05<00:00, 31.7MB/s]
Downloading steam-store-games.zip to /content
 94% 33.0M/35.2M [00:01<00:00, 26.3MB/s]
100% 35.2M/35.2M [00:01<00:00, 22.0MB/s]


In [None]:
%%capture

# Unzip dos dados
! mkdir data
! unzip steam-reviews-dataset.zip -d data
! unzip steam-store-games.zip -d data

# Dependências

In [None]:
%%capture

# Microsoft Recommender
! pip install recommenders

# Case Recommender
! pip install caserecommender

In [None]:
# General data science
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Used when training the neural models
from tempfile import TemporaryDirectory
import tensorflow as tf
# Raise the TensorFlow logger threshold to ERROR to cut down log noise
tf.get_logger().setLevel('ERROR')

# File handling
import os

In [None]:
# Where the generated CSV files are stored.
# The Google Drive folder is preferred; when it is not available (for example
# Drive is not mounted in this session) the local ./data directory is used
# instead, so the notebook can still run from top to bottom.
DRIVE_DATA_DIR = "./drive/MyDrive/Atividades - Graduação/Sistemas de Recomendação"
LOCAL_DATA_DIR = "./data"
DATA_DIR = DRIVE_DATA_DIR if os.path.isdir(DRIVE_DATA_DIR) else LOCAL_DATA_DIR

# Paths of the final (pre-processed) files
REVIEWS_DATA_PATH = DATA_DIR + "/reviews.csv"
GAMES_DATA_PATH = DATA_DIR + "/games.csv"
GAME_NAMES_DATA_PATH = DATA_DIR + "/game_names.csv"

# Paths of the train/test files.
# TRAIN/TEST are written without a header row, TRAIN2/TEST2 with one.
TRAIN_DATA_PATH = DATA_DIR + "/train.csv"
TEST_DATA_PATH = DATA_DIR + "/test.csv"
TRAIN2_DATA_PATH = DATA_DIR + "/train2.csv"
TEST2_DATA_PATH = DATA_DIR + "/test2.csv"

# Seed for pseudo-random number generation
RANDOM_SEED = 16

# Pré-processamento dos dados

In [None]:
# Explicit-interaction (review) files found in ./data.
# Sorted because os.listdir returns entries in arbitrary order, and the order in
# which the files are concatenated decides the integer ids assigned to users later.
reviews_file_names = sorted(name for name in os.listdir("./data/") if "reviews-" in name)

# Loading and column-selection helper
def load_and_process_data(file_name, data_dir="./data/"):
    """Read one reviews CSV and return only its first three columns.

    Only the first three columns are parsed into the DataFrame, instead of
    loading every column and dropping the rest afterwards, which keeps the
    memory footprint of each file small.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str, optional
        Directory that holds the file. Defaults to ``"./data/"``.

    Returns
    -------
    pandas.DataFrame
        The first three columns of the file, in file order.
    """
    return pd.read_csv(os.path.join(data_dir, file_name), usecols=[0, 1, 2])

# Read and trim every review file, then stack them with a single concat call
# (concatenating inside a loop copies the accumulated rows again on every pass).
# ignore_index gives the combined frame unique row labels; otherwise the labels
# of each file would repeat in the result.
reviews = pd.concat(
    [load_and_process_data(file_name) for file_name in reviews_file_names],
    ignore_index=True,
)

reviews.head(10)

Unnamed: 0,steamid,appid,voted_up
0,76561198107294407,10,True
1,76561198011733201,10,True
2,76561198168961276,10,True
3,76561198957877160,10,True
4,76561199050314447,10,True
5,76561198888512591,10,True
6,76561199161043109,10,True
7,76561199057736010,10,True
8,76561198800558753,10,True
9,76561199010296112,10,True


In [None]:
# Load the Steam store metadata
games = pd.read_csv("./data/steam.csv")

# Keep the game names in a separate table (appid, name)
game_names = games[["appid", "name"]].copy()
games = games.drop(columns="name")

# release_date --> year (first four characters of the date string)
games["year"] = games.release_date.str[:4]
games = games.drop(columns="release_date")

# owners --> average_owners
# The previous code stored the text after the '-' (the upper bound, as a string)
# under the name "average_owners"; this stores the numeric midpoint instead.
# Assumes every value is a "<lower>-<upper>" range of integers, as the split on '-' implies.
owner_bounds = games.owners.astype(str).str.split("-", expand=True).iloc[:, :2].astype(int)
games["average_owners"] = owner_bounds.mean(axis=1).astype(int)
games = games.drop(columns="owners")

# platforms --> one-hot encoding
platforms = games.platforms.str.get_dummies(";")
games = games.join(platforms).drop(columns="platforms")

# categories --> one-hot encoding
# (str.get_dummies already yields indicator columns; no second encoding pass is needed)
categories = games.categories.str.get_dummies(";")
games = games.join(categories).drop(columns="categories")

# genres --> one-hot encoding;
# drop steamspy_tags
# drop developer and publisher (too many different categorical values)
genres = games.genres.str.get_dummies(";")
games = games.join(genres).drop(columns=["genres", "steamspy_tags", "developer", "publisher"])

# Save the game names
game_names.to_csv(GAME_NAMES_DATA_PATH, index=False)

games

Unnamed: 0,appid,english,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,price,year,...,Sexual Content,Simulation,Software Training,Sports,Strategy,Tutorial,Utilities,Video Production,Violent,Web Publishing
0,10,1,0,0,124534,3339,17612,317,7.19,2000,...,0,0,0,0,0,0,0,0,0,0
1,20,1,0,0,3318,633,277,62,3.99,1999,...,0,0,0,0,0,0,0,0,0,0
2,30,1,0,0,3416,398,187,34,3.99,2003,...,0,0,0,0,0,0,0,0,0,0
3,40,1,0,0,1273,267,258,184,3.99,2001,...,0,0,0,0,0,0,0,0,0,0
4,50,1,0,0,5250,288,624,415,3.99,1999,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27070,1065230,1,0,7,3,0,0,0,2.09,2019,...,0,0,0,0,0,0,0,0,0,0
27071,1065570,1,0,0,8,1,0,0,1.69,2019,...,0,0,0,0,0,0,0,0,0,0
27072,1065650,1,0,24,0,1,0,0,3.99,2019,...,0,0,0,0,0,0,0,0,0,0
27073,1066700,1,0,0,2,0,0,0,5.19,2019,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Activity thresholds (strict lower bounds). Reducing the number of examples
# is what allows the data to fit in RAM.
MIN_REVIEWS_PER_USER = 48
MIN_REVIEWS_PER_GAME = 32

# Keep only the reviews of games present in the games table above
reviews = reviews[reviews.appid.isin(games.appid.values)]

# Keep users, then games, with more reviews than the thresholds.
# Group sizes are computed in one vectorised pass instead of a Python lambda per group.
reviews_per_user = reviews.groupby("steamid")["steamid"].transform("size")
reviews = reviews[reviews_per_user > MIN_REVIEWS_PER_USER]
reviews_per_game = reviews.groupby("appid")["appid"].transform("size")

# .copy() makes the filtered frames independent objects, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
reviews = reviews[reviews_per_game > MIN_REVIEWS_PER_GAME].copy()
games = games[games.appid.isin(reviews.appid.unique())].copy()

# Map user ids to consecutive integers and votes to 1/0
user_mapper = {steamid: idx for idx, steamid in enumerate(reviews.steamid.unique())}
reviews["steamid"] = reviews.steamid.map(user_mapper)
reviews["voted_up"] = reviews.voted_up.map({True: 1, False: 0})

# Map game ids to consecutive integers, in both tables
mapper = {appid: idx for idx, appid in enumerate(games.appid.unique())}
reviews["appid"] = reviews.appid.map(mapper)
games["appid"] = games.appid.map(mapper)

# Rename the review columns.
# MovieLens-like column names are required by some of the methods.
reviews = reviews.rename(columns={"steamid": "userID", "appid": "itemID", "voted_up": "rating"})

# Save both tables
reviews.to_csv(REVIEWS_DATA_PATH, index=False)
games.to_csv(GAMES_DATA_PATH, index=False, header=False)

reviews.shape, games.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


((358270, 3), (2541, 72))

In [None]:
# Split the reviews into train and test sets
train, test = train_test_split(reviews, test_size=0.25, random_state=RANDOM_SEED)

print("train size:", train.shape[0])
print("previous test size:", test.shape[0])

# Keep only test rows whose user and item both occur in the training set
known_users = train["userID"].unique()
known_items = train["itemID"].unique()
test = test[test["userID"].isin(known_users) & test["itemID"].isin(known_items)]
print("current test size:", test.shape[0])

# Order both splits by user id
train, test = (split.sort_values(by="userID") for split in (train, test))

# Write each split twice: without a header row (for the traditional models,
# which do not accept one) and with a header row (for the neural models,
# which require one)
for split, headerless_path, header_path in (
    (train, TRAIN_DATA_PATH, TRAIN2_DATA_PATH),
    (test, TEST_DATA_PATH, TEST2_DATA_PATH),
):
    split.to_csv(headerless_path, index=False, header=False)
    split.to_csv(header_path, index=False, header=True)

train size: 268702
previous test size: 89568
current test size: 89568


# Treino e avaliação dos modelos -- métodos tradicionais

In [None]:
# Keys that identify each traditional model in the results table
ITEM_KNN, ITEM_ATTRIBUTE_KNN, BPRMF = "itemknn", "itemattrknn", "bprmf"

# One result slot per model, initialised to 0.0
results = dict.fromkeys((ITEM_KNN, ITEM_ATTRIBUTE_KNN, BPRMF), 0.0)

## ItemKNN (filtragem colaborativa baseada em vizinhança)

In [None]:
from caserec.recommenders.item_recommendation.itemknn import ItemKNN

K_NEIGHBORS = 10

# Gather the ItemKNN settings first, then build the model from them
item_knn_settings = dict(
    train_file=TRAIN_DATA_PATH,
    test_file=TEST_DATA_PATH,
    sep=',',
    k_neighbors=K_NEIGHBORS,
    similarity_metric='jaccard',
    as_similar_first=True,
)
itemKNN_model = ItemKNN(**item_knn_settings)

# Run the model; the recorded output shows training, prediction and evaluation
itemKNN_model.compute()

[Case Recommender: Item Recommendation > ItemKNN Algorithm]

train data:: 5502 users and 2541 items (268702 interactions) | sparsity:: 98.08%
test data:: 5502 users and 2541 items (89568 interactions) | sparsity:: 99.36%

training_time:: 44.164109 sec
prediction_time:: 159.380115 sec


Eval:: PREC@1: 0.267539 PREC@3: 0.222525 PREC@5: 0.194075 PREC@10: 0.160505 RECALL@1: 0.017941 RECALL@3: 0.04393 RECALL@5: 0.063383 RECALL@10: 0.103903 MAP@1: 0.267539 MAP@3: 0.356143 MAP@5: 0.369845 MAP@10: 0.355926 NDCG@1: 0.267539 NDCG@3: 0.437896 NDCG@5: 0.467872 NDCG@10: 0.479173 


## ItemAttributeKNN (filtragem baseada em conteúdo)

In [None]:
from caserec.recommenders.item_recommendation.item_attribute_knn import ItemAttributeKNN

K_NEIGHBORS = 10

# Gather the ItemAttributeKNN settings first, then build the model from them.
# NOTE(review): the recorded run reports "2541 items and 1 metadata" although the
# games table has 72 columns. The wide CSV may not be the layout this metadata
# reader expects. Confirm against the Case Recommender documentation.
item_attribute_knn_settings = dict(
    train_file=TRAIN_DATA_PATH,
    test_file=TEST_DATA_PATH,
    metadata_file=GAMES_DATA_PATH,
    sep=',',
    metadata_similarity_sep=',',
    k_neighbors=K_NEIGHBORS,
    similarity_metric='jaccard',
    as_similar_first=True,
)
itemAttributeKNN_model = ItemAttributeKNN(**item_attribute_knn_settings)

# Run the model; the recorded output shows training, prediction and evaluation
itemAttributeKNN_model.compute()

[Case Recommender: Item Recommendation > Item Attribute KNN Algorithm]

train data:: 5502 users and 2541 items (268702 interactions) | sparsity:: 98.08%
test data:: 5502 users and 2541 items (89568 interactions) | sparsity:: 99.36%

training_time:: 2.954326 sec
>> metadata:: 2541 items and 1 metadata (2541 interactions) | sparsity:: 0.00%
prediction_time:: 152.865690 sec


Eval:: PREC@1: 0.021628 PREC@3: 0.016842 PREC@5: 0.019411 PREC@10: 0.022919 RECALL@1: 0.00125 RECALL@3: 0.002806 RECALL@5: 0.005403 RECALL@10: 0.013027 MAP@1: 0.021628 MAP@3: 0.029595 MAP@5: 0.036052 MAP@10: 0.044703 NDCG@1: 0.021628 NDCG@3: 0.037104 NDCG@5: 0.050626 NDCG@10: 0.072827 


# Treino e avaliação dos modelos -- métodos neurais

## Neural Collaborative Filtering

In [None]:
%%capture

! pip install tf_slim

In [None]:
# Dependencies
from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k

# Hyperparameters
TOP_K = 10
BATCH_SIZE = 256

# Header-bearing splits, loaded as DataFrames for the prediction and metric cells
train = pd.read_csv(TRAIN2_DATA_PATH)
test = pd.read_csv(TEST2_DATA_PATH)

# NCF dataset built from the same two files
ncf_data = NCFDataset(
    train_file=TRAIN2_DATA_PATH,
    test_file=TEST2_DATA_PATH,
    seed=RANDOM_SEED,
)

# Model settings, gathered first and then passed to the constructor
ncf_settings = dict(
    n_users=ncf_data.n_users,
    n_items=ncf_data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16, 8, 4],
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    n_epochs=24,
    verbose=6,
    seed=RANDOM_SEED,
)
ncf_model = NCF(**ncf_settings)



In [None]:
# Fit the NCF model on the NCF dataset, timing the whole run
with Timer() as train_time:
    ncf_model.fit(ncf_data)

training_message = "Took {} seconds for training."
print(training_message.format(train_time))

In [None]:
# Test: score every (user, item) pair, then keep only pairs absent from training
with Timer() as test_time:

    # Compute the scores, one batch per user over all training items
    users, items, preds = [], [], []
    candidate_items = list(train.itemID.unique())
    for user_id in train.userID.unique():
        # predict() takes parallel lists, so repeat the user id once per item
        user_column = [user_id] * len(candidate_items)
        users.extend(user_column)
        items.extend(candidate_items)
        preds.extend(list(ncf_model.predict(user_column, candidate_items, is_list=True)))

    # Merge with the training data and drop the pairs already seen there:
    # after the outer merge, only unseen pairs have a null rating
    all_predictions = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds})
    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

    # The raw scores are kept on purpose. The ranking metrics applied afterwards
    # order items by the prediction column, so thresholding the scores to 0/1
    # would tie most items and make the top-k selection arbitrary.

print("Took {} seconds for prediction.".format(test_time))

In [None]:
# Performance metrics: MAP@k and NDCG@k of the predictions against the test set
metric_settings = dict(col_prediction='prediction', k=TOP_K)
eval_map = map_at_k(test, all_predictions, **metric_settings)
eval_ndcg = ndcg_at_k(test, all_predictions, **metric_settings)
print("MAP:\t%f" % eval_map, "NDCG:\t%f" % eval_ndcg)