In [12]:
# Importaciones
import pandas as pd
pd.options.display.max_columns = None # Muestra todas las columnas de los dataframes
import json
import warnings
warnings.filterwarnings("ignore") # No muestra advertencias 
import re
import ast
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords') 
import pyarrow
import sklearn
import numpy as np


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\horac\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Lectura de datasets
df1 = pd.read_parquet('Tablas/ml_genres.parquet')
df2 = pd.read_parquet('Tablas/ml_reviews.parquet')
df4 = pd.read_parquet('Tablas/ml_title_item_id.parquet')


In [14]:
# Renombro y cambio tipo de columna para que funcionen los join's
df1.rename(columns={"id": "item_id"}, inplace=True)
df1['item_id'] = df1['item_id'].astype(str)

In [15]:
df1

Unnamed: 0,genres,item_id
0,Action,761140
1,Casual,761140
2,Indie,761140
3,Simulation,761140
4,Strategy,761140
...,...,...
68605,Indie,610660
68606,Racing,610660
68607,Simulation,610660
68608,Casual,658870


In [16]:
# Me quedo con las columnas que voy a necesitar únicamente
df3 = df2.drop(columns=['posted', 'last_edited', 'funny', 'helpful'])

In [17]:
# Reemplazo nulos por vacíos. Unifico tipo de datos para join
df3.fillna('')
df3['item_id'] = df3['item_id'].astype(str)

In [18]:
# Armo el dataset que voy a necesitar
df_join1 = df4.merge(df1, on="item_id", how="inner")
df_join = df3.merge(df_join1, on="item_id", how="inner")

In [19]:
# Reemplazo nulos por vacíos
df5 = df_join.fillna('')


In [20]:
# Elimino duplicados
df5.drop_duplicates()

Unnamed: 0,item_id,recommend,review,title,genres
0,1250,True,Simple yet with great replayability. In my opi...,Killing Floor,Action
1,1250,True,"I've played a lot of zombie games in my time, ...",Killing Floor,Action
2,1250,True,I can't wait for Killing Floor 2! I just love ...,Killing Floor,Action
3,1250,True,YES I RECCODMEEENDD THIS ♥♥♥♥♥ GAME,Killing Floor,Action
4,1250,True,wow killihg Floor I like,Killing Floor,Action
...,...,...,...,...,...
119983,16600,True,i like this alot it is fun and it dosent cost ...,Trials 2: Second Edition,Indie
119984,16600,True,i like this alot it is fun and it dosent cost ...,Trials 2: Second Edition,Racing
119985,16600,True,i like this alot it is fun and it dosent cost ...,Trials 2: Second Edition,Sports
119986,205080,False,BIT.TRIP FATE is a game that honestly made me ...,BIT.TRIP FATE,Action


In [21]:
# Cambio tipo bool a string, para luego procesar como texto
df5['recommend'] = df5['recommend'].astype(str)

In [22]:
df5

Unnamed: 0,item_id,recommend,review,title,genres
0,1250,True,Simple yet with great replayability. In my opi...,Killing Floor,Action
1,1250,True,"I've played a lot of zombie games in my time, ...",Killing Floor,Action
2,1250,True,I can't wait for Killing Floor 2! I just love ...,Killing Floor,Action
3,1250,True,YES I RECCODMEEENDD THIS ♥♥♥♥♥ GAME,Killing Floor,Action
4,1250,True,wow killihg Floor I like,Killing Floor,Action
...,...,...,...,...,...
119983,16600,True,i like this alot it is fun and it dosent cost ...,Trials 2: Second Edition,Indie
119984,16600,True,i like this alot it is fun and it dosent cost ...,Trials 2: Second Edition,Racing
119985,16600,True,i like this alot it is fun and it dosent cost ...,Trials 2: Second Edition,Sports
119986,205080,False,BIT.TRIP FATE is a game that honestly made me ...,BIT.TRIP FATE,Action


In [23]:
# Elimino aleatoriamente filas de df5. Me quedo con el 10%, para evitar problemas con el uso de memoria.
df5 = df5.sample(frac = 0.1)

In [24]:
df5

Unnamed: 0,item_id,recommend,review,title,genres
7044,440,True,tbh this game was better 2 years ago :/ thanks...,Team Fortress 2,Free to Play
14715,4000,False,There isn't much to do and it's boring... ther...,Garry's Mod,Indie
21796,304930,True,This game is Roblox + Dayz. Not ♥♥♥♥ing minecr...,Unturned,Casual
18693,304930,True,Draven,Unturned,Indie
78478,241540,True,Closest thing to a proper Walking Dead game th...,State of Decay,RPG
...,...,...,...,...,...
36117,252490,True,Get good computer to play or u will have a blo...,Rust,RPG
77456,312990,True,Ive only played 23 minutes of this and i alrea...,The Expendabros,Adventure
67721,237930,True,Buy this game. People have given it flak becau...,Transistor,Action
77852,250500,True,what a strange game this is,Super Amazing Wagon Adventure,Adventure


In [25]:
# Reseteo índice
df5.reset_index(drop=True, inplace=True)

In [26]:
# Creamos un nuevo dataframe con una única columna donde cada fila concatena los contenidos de todas las columnas que nos interesan.
todo = []
for i in range(0, df5.shape[0]):
    todo.append(df5['genres'][i]+' '+df5['item_id'][i]+' '+df5['title'][i]+' '+df5['review'][i])
df5['todo'] = todo

In [27]:
df5.head(1)

Unnamed: 0,item_id,recommend,review,title,genres,todo
0,440,True,tbh this game was better 2 years ago :/ thanks...,Team Fortress 2,Free to Play,Free to Play 440 Team Fortress 2 tbh this game...


In [28]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11999 entries, 0 to 11998
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   item_id    11999 non-null  object
 1   recommend  11999 non-null  object
 2   review     11999 non-null  object
 3   title      11999 non-null  object
 4   genres     11999 non-null  object
 5   todo       11999 non-null  object
dtypes: object(6)
memory usage: 562.6+ KB


In [29]:
# Creamos nuestro propio índice
df5.insert(1, "id", list(range(1, 12000)), True) 

In [30]:
df5

Unnamed: 0,item_id,id,recommend,review,title,genres,todo
0,440,1,True,tbh this game was better 2 years ago :/ thanks...,Team Fortress 2,Free to Play,Free to Play 440 Team Fortress 2 tbh this game...
1,4000,2,False,There isn't much to do and it's boring... ther...,Garry's Mod,Indie,Indie 4000 Garry's Mod There isn't much to do ...
2,304930,3,True,This game is Roblox + Dayz. Not ♥♥♥♥ing minecr...,Unturned,Casual,Casual 304930 Unturned This game is Roblox + D...
3,304930,4,True,Draven,Unturned,Indie,Indie 304930 Unturned Draven
4,241540,5,True,Closest thing to a proper Walking Dead game th...,State of Decay,RPG,RPG 241540 State of Decay Closest thing to a p...
...,...,...,...,...,...,...,...
11994,252490,11995,True,Get good computer to play or u will have a blo...,Rust,RPG,RPG 252490 Rust Get good computer to play or u...
11995,312990,11996,True,Ive only played 23 minutes of this and i alrea...,The Expendabros,Adventure,Adventure 312990 The Expendabros Ive only play...
11996,237930,11997,True,Buy this game. People have given it flak becau...,Transistor,Action,Action 237930 Transistor Buy this game. People...
11997,250500,11998,True,what a strange game this is,Super Amazing Wagon Adventure,Adventure,Adventure 250500 Super Amazing Wagon Adventure...


In [31]:
# Agregamos el índice como primer columna del dataset
df_new = df5[['id','todo']]

In [32]:
df_new

Unnamed: 0,id,todo
0,1,Free to Play 440 Team Fortress 2 tbh this game...
1,2,Indie 4000 Garry's Mod There isn't much to do ...
2,3,Casual 304930 Unturned This game is Roblox + D...
3,4,Indie 304930 Unturned Draven
4,5,RPG 241540 State of Decay Closest thing to a p...
...,...,...
11994,11995,RPG 252490 Rust Get good computer to play or u...
11995,11996,Adventure 312990 The Expendabros Ive only play...
11996,11997,Action 237930 Transistor Buy this game. People...
11997,11998,Adventure 250500 Super Amazing Wagon Adventure...


In [33]:
# Eliminamos conectores (a, an, are, etc.)
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [34]:
def text_preprocessing(column):
    # Convertimos a minúsculas
    column = column.str.lower()
    # Convertimos puntuaciones y símbolos extraños en vacíos
    column = column.str.replace('http\S+|www.\S+|@|%|:|,|', '', case=False)
    # Dividimos oraciones en palabras para aplicar las funciones previas
    word_tokens = column.str.split()
    keywords = word_tokens.apply(lambda x: [item for item in x if item not in stop])
    # Rearmamos las oraciones y les asignamos una nueva columna
    for i in range(len(keywords)):
        keywords[i] = " ".join(keywords[i])
        column = keywords

    return column

In [35]:
# Creamos una nueva columna con el texto ya procesado
df_new['cleaned_infos'] = text_preprocessing(df_new['todo'])

In [36]:
# Aplicamos similaridad del coseno
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# convierto una colección de documentos de texto en una matriz de conteo de tokens usando la clase CountVectorizer de la librería sklearn. Esta clase me permite extraer las características de los textos, como las palabras, los n-gramas, el vocabulario, etc. y crear una representación numérica de los mismos. El método fit_transform() combina los pasos de ajustar el modelo a los datos y transformar los datos en la matriz de conteo de tokens. El resultado es una matriz dispersa que contiene los valores de frecuencia de cada token en cada documento.
CV = CountVectorizer()
converted_matrix = CV.fit_transform(df_new['cleaned_infos'])
# Utilizamos la función cosine_similarity que nos devuelve una matriz cuadrada que contiene los valores de similitud de coseno entre cada par de vectores de la matriz de entrada. 
cosine_similarity = cosine_similarity(converted_matrix)

In [38]:
cosine_similarity

array([[1.        , 0.06454972, 0.0860663 , ..., 0.10127394, 0.08164966,
        0.08164966],
       [0.06454972, 1.        , 0.08333333, ..., 0.19611614, 0.07905694,
        0.        ],
       [0.0860663 , 0.08333333, 1.        , ..., 0.13074409, 0.10540926,
        0.        ],
       ...,
       [0.10127394, 0.19611614, 0.13074409, ..., 1.        , 0.12403473,
        0.        ],
       [0.08164966, 0.07905694, 0.10540926, ..., 0.12403473, 1.        ,
        0.2       ],
       [0.08164966, 0.        , 0.        , ..., 0.        , 0.2       ,
        1.        ]])

In [39]:
# Ingresamos un item_id para hallar recomendaciones
input_item = '427730'
item_id = df5[df5['item_id'] == input_item]['id'].values[0]
item_id

2582

In [40]:
# Obtengo una lista de pares (índice, valor) que representan la similitud de coseno entre el elemento con el índice item_id y cada uno de los demás elementos de la matriz cosine_similarity. La función enumerate() toma un iterable y devuelve un objeto que genera los pares (índice, valor) para cada elemento del iterable. La función list() convierte el objeto en una lista
score = list(enumerate(cosine_similarity[item_id]))

In [41]:
# Ordeno la lista score de forma descendente según el segundo elemento de cada par (índice, valor). La función sorted() toma un iterable y devuelve una lista ordenada. El argumento key permite especificar una función que se aplica a cada elemento del iterable antes de compararlo. El argumento reverse permite indicar si se quiere ordenar de forma ascendente (False) o descendente (True). En este caso, se usa una función lambda como valor de key, que toma como entrada x y devuelve x1, es decir, el segundo elemento de x
sorted_score = sorted(score, key=lambda x:x[1], reverse= True)

In [42]:
# Elimino el primer elemento de la lista pues obviamente el ítem más parecido al ítem ingresado es el mismo ítem.
sorted_score = sorted_score[1:]

In [43]:
# Scores agrupados por índice, en forma descendente
sorted_score[0:10]

[(1561, 0.2613541867446584),
 (8224, 0.2485772884799522),
 (549, 0.23953506879020425),
 (3996, 0.23162743094465488),
 (7320, 0.2151449915893437),
 (5287, 0.2090833493957267),
 (7480, 0.2090833493957267),
 (6889, 0.2056157234061163),
 (2175, 0.20328584951250753),
 (830, 0.20040941700985385)]

In [44]:
# Ítems más parecidos al ingresado, primeros 5, orden descendente.
i = 0
lista = []
for item in sorted_score:
    items = df5[df5['id'] == item[0]]['item_id'].values[0]
    #print(i+1,items)
    lista.append((i+1, items))
    i = i+1
    if i > 4:
        break
print(lista)

[(1, '221100'), (2, '360870'), (3, '99900'), (4, '212680'), (5, '63380')]
