# Importação das bibliotecas

In [99]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import stats
from google.colab import drive

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import sys
print(sys.getrecursionlimit())

import warnings
warnings.filterwarnings("ignore")

1000


In [3]:
sys.setrecursionlimit(10000)

# Carregamento do dataset

In [4]:
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [5]:
meta = pd.read_json('/content/gdrive/My Drive/Portfólio/Classification and Recommendation/meta_Video_Games.json',lines=True)
video = pd.read_json('/content/gdrive/My Drive/Portfólio/Classification and Recommendation/Video_Games_sample.json',lines=True)

In [6]:
meta.head(1)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Video Games, PC, Games]",,[],,Reversi Sensory Challenger,[],,Fidelity Electronics,[],"[>#2,623,937 in Toys &amp; Games (See Top 100 ...",[],Toys &amp; Games,,,,42000742,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


In [7]:
video.head(2)

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,verified,vote,image
0,1,"01 2, 2018",A16FEXIKAPT24U,B00000JRSB,{'Format:': ' Video Game'},Nicholas Sabin,Game disc was cracked thats pretty lame,Game disc 1 was cracked,1514851200,,,
1,5,"01 2, 2018",A3ISBEPYLY8IMO,B00000JRSB,{'Format:': ' Video Game'},Jimmy,Received this in perfect condition. Great rese...,Received this in perfect condition. Great rese...,1514851200,1.0,,


In [8]:
video.dtypes

overall             int64
reviewTime         object
reviewerID         object
asin               object
style              object
reviewerName       object
reviewText         object
summary            object
unixReviewTime      int64
verified          float64
vote               object
image              object
dtype: object

In [9]:
video.isnull().sum()

overall                0
reviewTime             0
reviewerID             0
asin                   0
style             135402
reviewerName          51
reviewText           345
summary              166
unixReviewTime         0
verified           69016
vote              396439
image             448911
dtype: int64

# Limpeza dos dados

In [None]:
# Para os valores faltantes da coluna review tex vamos preencher com 'Not comment'
#video['reviewText'].fillna('Not comment', inplace=True)

In [None]:
# Da mesma forma para a colunna summary vamos preencher com 'Not summary'
#video['summary'].fillna('Not summary', inplace=True)

In [None]:
# Também para o reviewerName
#video['reviewerName'].fillna('Not Name', inplace=True)

In [None]:
# Igualmente para Image
#video['image'].fillna('Not image', inplace=True)

In [10]:
backup = video.copy()

In [11]:
video = backup.copy()

In [12]:
video.isnull().sum()

overall                0
reviewTime             0
reviewerID             0
asin                   0
style             135402
reviewerName          51
reviewText           345
summary              166
unixReviewTime         0
verified           69016
vote              396439
image             448911
dtype: int64

Como temos muitos valores nulos na coluna vote, vamos excluí-la

In [13]:
video.drop(columns=['vote', 'image'],inplace=True)

# Classificação

Na análise, observa-se vários valores faltantes na coluna verified, para preenche-lás, iremos fazer um modelo de classificação.

In [14]:
df_class = video[['overall','verified','unixReviewTime']].dropna()

In [15]:
X = df_class.drop(columns=['verified'])
Y = df_class[['verified']]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## KNN

In [17]:
results = {'' : ['KNN', 'Random Forest', 'Logistic Regression', 'Xgboosting'],
           'Accuracy' : [],
           'Precision' : [],
           'Recall' : [],
           'F1-score' : []}

In [18]:
params = {
    'n_neighbors' : [5, 6, 7, 10]
}

clf = GridSearchCV(KNeighborsClassifier(), param_grid=params, scoring='f1', cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [5, 6, 7, 10]}, scoring='f1')

In [19]:
y_pred = clf.predict(X_test)

results['Accuracy'].append(accuracy_score(y_test, y_pred))
results['Precision'].append(precision_score(y_test, y_pred))
results['Recall'].append(recall_score(y_test, y_pred))
results['F1-score'].append(f1_score(y_test, y_pred))

In [20]:
clf_knn=clf.best_estimator_

## Random Forest

In [21]:
params = {
    'max_depth' : [2,5,7],
    'n_estimators' : [100,200,300],
    'class_weight' : [None, 'balanced']
}

clf = GridSearchCV(RandomForestClassifier(), param_grid=params, scoring='f1',cv=5)

clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'class_weight': [None, 'balanced'],
                         'max_depth': [2, 5, 7],
                         'n_estimators': [100, 200, 300]},
             scoring='f1')

In [22]:
y_pred = clf.predict(X_test)

results['Accuracy'].append(accuracy_score(y_test, y_pred))
results['Precision'].append(precision_score(y_test, y_pred))
results['Recall'].append(recall_score(y_test, y_pred))
results['F1-score'].append(f1_score(y_test, y_pred))

In [23]:
clf_rf = clf.best_estimator_

## Logistic Regression

In [24]:
params = {
    'penalty' : ['l1', 'l2'],
    'C' : [0.01, 0.1, 1],
    'class_weight' : [None, 'balanced']
}
clf = GridSearchCV(LogisticRegression(), param_grid=params, scoring='f1',cv=5)

clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.1, 1],
                         'class_weight': [None, 'balanced'],
                         'penalty': ['l1', 'l2']},
             scoring='f1')

In [25]:
y_pred = clf.predict(X_test)

results['Accuracy'].append(accuracy_score(y_test, y_pred))
results['Precision'].append(precision_score(y_test, y_pred))
results['Recall'].append(recall_score(y_test, y_pred))
results['F1-score'].append(f1_score(y_test, y_pred))

In [26]:
clf_lg = clf.best_estimator_

## Xgboosting

In [27]:
params = {
    'max_depth' : [6,7,8,9],
    'gamma' : [0, 0.01, 0.1]
}
clf = GridSearchCV(XGBClassifier(), param_grid=params, scoring='f1',cv=5)

clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=XGBClassifier(),
             param_grid={'gamma': [0, 0.01, 0.1], 'max_depth': [6, 7, 8, 9]},
             scoring='f1')

In [28]:
y_pred = clf.predict(X_test)

results['Accuracy'].append(accuracy_score(y_test, y_pred))
results['Precision'].append(precision_score(y_test, y_pred))
results['Recall'].append(recall_score(y_test, y_pred))
results['F1-score'].append(f1_score(y_test, y_pred))

In [29]:
clf_lg = clf.best_estimator_

In [30]:
total_results = pd.DataFrame(results)

total_results

Unnamed: 0,Unnamed: 1,Accuracy,Precision,Recall,F1-score
0,KNN,0.851385,0.869515,0.962343,0.913577
1,Random Forest,0.860876,0.867293,0.979419,0.919952
2,Logistic Regression,0.81624,0.81624,1.0,0.898824
3,Xgboosting,0.860876,0.867521,0.979069,0.919926


# Para prever os valores nulos da variável verified vamos utiliizar o modelo treinado Random Forest

In [31]:
df_nan = video[video['verified'].isnull()].reset_index().drop(columns=['index'])

In [32]:
df_nan.head(2)

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,verified
0,1,"01 2, 2018",A16FEXIKAPT24U,B00000JRSB,{'Format:': ' Video Game'},Nicholas Sabin,Game disc was cracked thats pretty lame,Game disc 1 was cracked,1514851200,
1,5,"08 28, 2017",A3562CUJCAN1JW,B00000JRSB,{'Format:': ' Video Game'},David,A classic!,Five Stars,1503878400,


In [34]:
X = df_nan.drop(columns=['verified', 'reviewTime', 'reviewerID',
                                'asin', 'style', 'reviewerName',
                               'reviewText', 'summary'])

In [35]:
df_nan['verified'] = clf_rf.predict(X)

Agora, vamos concatenar o dataaset original com o previsto

In [36]:
video.dropna(subset=['verified'],inplace=True)

In [37]:
video.isnull().sum()

overall                0
reviewTime             0
reviewerID             0
asin                   0
style             114843
reviewerName          40
reviewText           294
summary              139
unixReviewTime         0
verified               0
dtype: int64

In [38]:
df_final = pd.concat([video.reset_index().drop(columns=['index']), df_nan.reset_index().drop(columns=['index'])])

In [47]:
df_final.head(2)

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,verified
0,5,"01 2, 2018",A3ISBEPYLY8IMO,B00000JRSB,{'Format:': ' Video Game'},Jimmy,Received this in perfect condition. Great rese...,Received this in perfect condition. Great rese...,1514851200,1.0
1,5,"12 21, 2017",A1RIUB1PZWLOVZ,B00000JRSB,{'Format:': ' Video Game'},Amazon Customer,I had a hard time finding this game locally or...,I am very pleased with this purchase,1513814400,1.0


In [51]:
df_total = df_final.merge(meta[['category', 'description', 'title', 'also_buy',
                               'brand', 'also_view', 'main_cat', 'asin']], on='asin')

In [53]:
df_total.head(2)

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,verified,category,description,title,also_buy,brand,also_view,main_cat
0,5,"01 2, 2018",A3ISBEPYLY8IMO,B00000JRSB,{'Format:': ' Video Game'},Jimmy,Received this in perfect condition. Great rese...,Received this in perfect condition. Great rese...,1514851200,1.0,"[Video Games, Retro Gaming &amp; Microconsoles...",[Long recognized as role-playing games par exc...,Final Fantasy VII,"[B0000296O5, B00004Y57G, B00002R28C, B0045L3SN...",SquareSoft,"[B00004Y57G, B00004TC6E, B00ZS80PC2, B0000296O...",Video Games
1,5,"01 2, 2018",A3ISBEPYLY8IMO,B00000JRSB,{'Format:': ' Video Game'},Jimmy,Received this in perfect condition. Great rese...,Received this in perfect condition. Great rese...,1514851200,1.0,"[Video Games, Retro Gaming &amp; Microconsoles...",[Long recognized as role-playing games par exc...,Final Fantasy VII,"[B0000296O5, B00004Y57G, B00002R28C, B0045L3SN...",SquareSoft,"[B00004Y57G, B00004TC6E, B00ZS80PC2, B0000296O...",Video Games


# Sistema de recomendação

## Filtro colaborativo e fallback baseado em avaliações verificadas. (verified)

Primeiramente vamos calcular a quantidade de avaliação por produto baseado no asin e reviewerID em ordem decrescente para posteriormente usar como fallback(), que consiste em quanto o cliente pesquisar por um produto que não contenha na base de dados, vamos recomendar os 10 produtos mais vendidos.

In [60]:
data_count_products = df_total.groupby(by='asin', as_index=False) \
          .agg({'reviewerID' : pd.Series.nunique}) \
          .rename(columns = {'reviewerID' : 'Count_reviewerID'}) \
          .set_index('asin') \
          .sort_values('Count_reviewerID', ascending=False)

In [61]:
data_count_products.head(10)

Unnamed: 0_level_0,Count_reviewerID
asin,Unnamed: 1_level_1
B00HTK1NCS,6462
B004RMK57U,5069
B00KKAQYXM,4359
B00JJNQG98,3962
B003ZSP0WW,3960
B00178630A,3820
B00E4MQODC,3520
B000B9RI14,3345
B0009VXBAQ,3167
B00DBDPOZ4,3162


Agora pegando uma amostra com os 10 produtos mais vendidos e transformando em uma lista

In [85]:
data_sellers_poducts = data_count_products.head(10).index.tolist()

In [87]:
data_sellers_poducts

['B00HTK1NCS',
 'B004RMK57U',
 'B00KKAQYXM',
 'B00JJNQG98',
 'B003ZSP0WW',
 'B00178630A',
 'B00E4MQODC',
 'B000B9RI14',
 'B0009VXBAQ',
 'B00DBDPOZ4']

Agora, vamos calcular a contagem de valores verificados por produto

In [123]:
data_verified_count = df_total.groupby(by='asin', as_index=False) \
            .agg({'verified' : pd.Series.nunique}) \
            .rename(columns = {'verified' : 'verified_count'}) \
            .set_index('asin') \
            .sort_values('verified_count', ascending=False)

In [124]:
data_verified_count.head(10)

Unnamed: 0_level_0,verified_count
asin,Unnamed: 1_level_1
B00000JRSB,2
B00J48MUS4,2
B00FLLFJE0,2
B00FNKMVUO,2
B00FWK8UWI,2
B00GANWVJE,2
B00GMFKYK8,2
B00GODZYNA,2
B00GU4F4OM,2
B00GU8W5AE,2


Pegando uma amostra dos 2000 produtos mais bem avaliados e transformando em uma lista

In [125]:
top_products_verified_count = data_verified_count.head(2000).index.tolist()

In [126]:
top_products_verified_count[:10]

['B00000JRSB',
 'B00J48MUS4',
 'B00FLLFJE0',
 'B00FNKMVUO',
 'B00FWK8UWI',
 'B00GANWVJE',
 'B00GMFKYK8',
 'B00GODZYNA',
 'B00GU4F4OM',
 'B00GU8W5AE']

Fazendo para a base completa

In [127]:
df_total.groupby(by='asin', as_index=False) \
            .agg({'verified' : pd.Series.nunique}) \
            .rename(columns = {'verified' : 'verified_mean'}).head(20)

Unnamed: 0,asin,verified_mean
0,B00000JRSB,2
1,B00005NZ1G,2
2,B00005O0I2,2
3,B00005Q8M0,2
4,B00005TNI6,2
5,B000066TS5,2
6,B0000696CZ,2
7,B000087H7T,2
8,B00008J7NZ,2
9,B00009OY9U,2


### Matriz de relacionamento

In [128]:
matriz_de_relacionamento_top_products_verified_count = df_total[df_total['asin'].isin(top_products_verified_count)] \
  .pivot_table(index='asin', columns='reviewerID', values='verified').fillna(0)


In [129]:
matriz_de_relacionamento_top_products_verified_count.shape

(266, 361288)

In [130]:
matriz_de_relacionamento_top_products_verified_count.head()

reviewerID,A0003214FKMKJE0PCW3D,A0005622E33Y7RCVKKVN,A00065507CNSR8UHQFCK,A00101847G3FJTWYGNQA,A0011756FPL8K71Q5TAQ,A001932810S6RCIFHJ3V,A0025672ALSPPB1RH94H,A00356786CQNSUGGK2FA,A0039616ADOZ0KMWQRNX,A0049794AHFDZTYE50T0,...,AZZV9M0S13U26,AZZW3Z3NTGMY7,AZZWRUAQJ8BBE,AZZWV8SNTXQ7U,AZZWWSMIDFU6S,AZZX40NRMUDDQ,AZZX6JJHIO4UZ,AZZY2HR1LLRFV,AZZYR0PRM4RBT,AZZZOLOP9ZO8B
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00000JRSB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00005NZ1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00005O0I2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00005Q8M0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00005TNI6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [131]:
# Quantidade de avaliações por usuário
matriz_de_relacionamento_top_products_verified_count.sum()

reviewerID
A0003214FKMKJE0PCW3D    1.0
A0005622E33Y7RCVKKVN    3.0
A00065507CNSR8UHQFCK    1.0
A00101847G3FJTWYGNQA    1.0
A0011756FPL8K71Q5TAQ    1.0
                       ... 
AZZX40NRMUDDQ           1.0
AZZX6JJHIO4UZ           1.0
AZZY2HR1LLRFV           1.0
AZZYR0PRM4RBT           1.0
AZZZOLOP9ZO8B           0.0
Length: 361288, dtype: float64

### Matriz de similaridade de cossenos

In [132]:
cos_asin = cosine_similarity(matriz_de_relacionamento_top_products_verified_count)

print(cos_asin)
cos_asin.shape

[[1.         0.00410554 0.         ... 0.         0.         0.        ]
 [0.00410554 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


(266, 266)

Podemos ver na matriz de similaridade alguma similaridade entre alguns produtos. Então, agora vamos criar o sistema de recomendação.

In [133]:
def recomendar(asin):
  print(f'Produto pesquisado: {asin}')
  try:
    asin_idx = matriz_de_relacionamento_top_products_verified_count.index.tolist().index(asin)
    closest_10_asin = np.argsort(-cos_asin[asin_idx])[1:11]
    print(f'Os 10 produtos mais próximos do asin: {asin} são :')
    return matriz_de_relacionamento_top_products_verified_count.index[closest_10_asin].tolist()
  except ValueError as e:
    print(f'{asin} não encontrado na matriz de recomendação. Retornando os 10 produtos mais avaliados')
    return data_sellers_poducts

Testando o sistema de recomendação com 1 asin existente e outro não

In [134]:
asin = 'B00005Q8M0'
asin_10_mais_proximos = recomendar(asin)

asin_10_mais_proximos

Produto pesquisado: B00005Q8M0
Os 10 produtos mais próximos do asin: B00005Q8M0 são :


['B000FQ9R4E',
 'B0017KIBAI',
 'B00005TNI6',
 'B0009VXBAQ',
 'B00DD0B0BM',
 'B0011E4J2Y',
 'B000FQ2DTA',
 'B0000696CZ',
 'B0000C7GHG',
 'B00EZPCX1A']

In [135]:
asin = 'asin_inexist'
asin_10_mais_proximos = recomendar(asin)

asin_10_mais_proximos

Produto pesquisado: asin_inexist
asin_inexist não encontrado na matriz de recomendação. Retornando os 10 produtos mais avaliados


['B00HTK1NCS',
 'B004RMK57U',
 'B00KKAQYXM',
 'B00JJNQG98',
 'B003ZSP0WW',
 'B00178630A',
 'B00E4MQODC',
 'B000B9RI14',
 'B0009VXBAQ',
 'B00DBDPOZ4']

Conseguimos criar um sistema de recomendação porém temos uma matriz esparsa, logo pode ser achado meio de diminuir essa esparsidão como levando a descrição do produto em consideração, a avaliação e a compra do produto.