In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Web Scraping

- El archivo `games` que tenemos limpio se compone de varias columnas relevantes sobre cada videojuego; sin embargo, tenemos un problema a la hora de realizar el Content-based filtering (item-item): no tenemos explícitamente lo más importante, es decir, el contenido o algún texto que nos especifique de qué trata cada producto. Lo que sí poseemos es el enlace que nos redirige al apartado del videojuego en la página oficial de Steam, recurso que utilizaremos para extraer la sinopsis del juego.

In [3]:
# Load the cleaned, merged dataset from the local parquet file into `games`.
games = pd.read_parquet('./clean_datasets/all_dfs.parquet')

In [4]:
# Remove the review/user-level columns that are not needed for the
# content-based recommender.
games = games.drop(
    columns=[
        'posted',
        'user_id',
        'user_url',
        'item_id',
        'recommend',
        'review',
        'sentiment',
        'developer',
        'price',
    ]
)

In [5]:
# Count fully duplicated rows before cleaning.
duplicates_before = games.duplicated().sum()
print(f'Pre - Filas duplicadas: {duplicates_before}')

# Drop the duplicated rows and renumber the index from 0.
games = games.drop_duplicates().reset_index(drop=True)

# Confirm that no duplicated rows remain.
duplicates_after = games.duplicated().sum()
print(f'Pos - Filas duplicadas: {duplicates_after}')

Pre - Filas duplicadas: 26839
Pos - Filas duplicadas: 0


In [23]:
# Show full cell contents (no truncation of long text columns) until the
# option is reset.
pd.set_option('display.max_colwidth', None)

# Quick look at the first five rows.
games.head(n=5)

Unnamed: 0,game_title,id,url,description
0,Killing Floor,1250,http://store.steampowered.com/app/1250/Killing_Floor/,"6-player co-op survival horror at its finest! Free updates, free special events and a ridiculous amount of fun!"
1,Zeno Clash,22200,http://store.steampowered.com/app/22200/Zeno_Clash/,Zeno Clash is an action/fighting game set in a punk fantasy world. The game is played from a first person perspective and the combat is generally up close and brutal. Experience a deep storyline set in a fantastic world.
2,Euro Truck Simulator 2,227300,http://store.steampowered.com/app/227300/Euro_Truck_Simulator_2/,"Travel across Europe as king of the road, a trucker who delivers important cargo across impressive distances! With dozens of cities to explore, your endurance, skill and speed will all be pushed to their limits."
3,"Papers, Please",239030,http://store.steampowered.com/app/239030/Papers_Please/,"Congratulations. The October labor lottery is complete. Your name was pulled. For immediate placement, report to the Ministry of Admission at Grestin Border Checkpoint. An apartment will be provided for you and your family in East Grestin. Expect a Class-8 dwelling."
4,Risk of Rain,248820,http://store.steampowered.com/app/248820/Risk_of_Rain/,"Risk of Rain is an action platformer with roguelike elements. With permanent death as a primary feature, players will have to play their best to get as far as possible. Fight on a mysterious planet with randomly spawning enemies and bosses, either alone or with 3 friends in online co-op."


In [7]:
# Scrape the short synopsis (the 'game_description_snippet' div) of every game
# from the Steam store page referenced in its `url` column.

MISSING_DESCRIPTION = "_"  # placeholder so there is exactly one entry per DataFrame row
REQUEST_TIMEOUT = 10  # seconds; without a timeout one stalled request hangs the whole loop

# Collected synopses, in row order. (Named `descriptions` rather than `all`
# so the builtin `all` is not shadowed for the rest of the session.)
descriptions = []

# A Session reuses the underlying connection across requests to the same host.
with requests.Session() as session:
  for game in games.itertuples():
    try:
      page = session.get(game.url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
      # A network failure for one game must not discard everything scraped so
      # far: report it, store the placeholder and move on.
      print(f'Request failed for {game.url}: {exc}')
      descriptions.append(MISSING_DESCRIPTION)
      continue

    soup = BeautifulSoup(page.content, 'html.parser')
    snippet = soup.find('div', class_='game_description_snippet')

    # Pages without the snippet div get the placeholder instead of a synopsis.
    descriptions.append(MISSING_DESCRIPTION if snippet is None else snippet.text.strip())

# Assign positionally (plain list) so the result does not depend on the
# DataFrame having a 0..N-1 index, as index-aligned Series assignment would.
games['description'] = descriptions

In [25]:
# Quick look at one randomly chosen row to check the scraped description.
games.sample(n=1)

Unnamed: 0,game_title,id,url,description
1376,Plantera,421040,http://store.steampowered.com/app/421040/Plantera/,"In Plantera you build your own garden and watch it grow with new plants, bushes, trees and animals. As you play and expand your garden you will attract Helpers, round blue creatures that will help you with your harvesting tasks."


In [22]:
# Restore pandas' default column-width display setting changed earlier.
pd.reset_option('display.max_colwidth')

In [9]:
## Importamos las librerias necesarias

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [10]:
import nltk
# Stop words: terms later handed to the vectorizer as words to ignore.
from nltk.corpus import stopwords
# Fetch the stop-word corpus (a no-op when it is already up to date locally).
nltk.download('stopwords')

# The English list is used because the descriptions are (mostly) in English.
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jmoc9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Create the TF-IDF vectorizer, telling it which words to ignore.
tf = TfidfVectorizer(stop_words=stop)

In [12]:
# Fit the vectorizer on the description column and transform it into the
# TF-IDF matrix (one row per game, one column per term).
tfidx_matrix = tf.fit_transform(games['description'])

In [13]:
# Brief look at the terms the vectorizer learned; each one is a column of the
# TF-IDF matrix.
tf.get_feature_names_out()

array(['00', '000', '000km', ..., '紧接着又是母亲的突然离去', '虽然从未放弃过寻找',
       '让秦幽羽的父亲从此失踪'], dtype=object)

In [14]:
# Pairwise similarity between all games. The vectorizer was created with its
# default L2 normalisation, so the dot product computed by linear_kernel is
# the cosine similarity.
cosine_similarities = linear_kernel(tfidx_matrix,tfidx_matrix)

In [15]:
n = 5  # number of similar games to keep for each game

# Maps each game's id (as a string) to a list of (title, similarity) tuples
# for its n most similar games, best match first.
results = {}

# Positional title lookup, independent of the DataFrame's index labels.
titles = games['game_title'].to_numpy()

for pos, game_id in enumerate(games['id']):
  scores = cosine_similarities[pos]

  # n + 1 best candidates, highest similarity first; one extra because the
  # game itself is normally among them.
  candidates = scores.argsort()[::-1][:n + 1]

  # Exclude the game itself by position rather than assuming it is ranked
  # first: a game whose description has no usable terms has similarity 0 with
  # itself, and games with identical descriptions tie at the top.
  neighbours = [i for i in candidates if i != pos][:n]

  # float() keeps plain Python floats in the result, so their text form stays
  # e.g. 0.195 whatever the installed NumPy version.
  results[f"{game_id}"] = [(f"{titles[i]}", round(float(scores[i]), 3)) for i in neighbours]
 

In [16]:
# Turn the {game id -> similar games} dict into a one-column table indexed by
# game id.
similarities_by_id = pd.Series(results)
game_similarities = pd.DataFrame(similarities_by_id)

In [17]:
# Example lookup: the stored recommendations for the game whose id is '10'
# (row label '10', single column labelled 0).
game_similarities.at['10', 0]

[('Team Fortress Classic', 0.195),
 ('Metal Drift', 0.186),
 ('Skara - The Blade Remains', 0.138),
 ('Savage Resurrection', 0.128),
 ('Cyber Team Manager', 0.122)]

In [18]:
# Persist the similarity table as CSV, with the game ids written as the index
# column.
game_similarities.to_csv('./clean_datasets/games_similarities.csv')