In [39]:
import pandas as pd
import json
import ast
import warnings
from io import StringIO
import hashlib

from typing import List, Dict
import base64, csv

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

warnings.filterwarnings('ignore')

In [40]:
def showPie(columna):
  count_values = pd.Series(columna).value_counts()
  if len(count_values) > 15:
    return
  datos = pd.DataFrame({"valor":count_values.index, "ocurrencia": count_values.values})

  plt.title(columna.name)
  plt.pie(datos["ocurrencia"], labels=datos['valor'], autopct='%1.1f%%')
  plt.show()

def concatenar(data_1, data_2,  axis=1):
  return pd.concat([data_1, data_2], axis=axis)

def contar_nulos(data):
  return data.isna().sum()

def mapear(columna: pd.Series, mapa={'NO': 0, 'SI':1}):
  return columna.map(mapa)

In [3]:
URL_STEAM_GAMES = 'datasets/output_steam_games.json'
URL_USERS_ITEMS = 'datasets/output_steam_games.json'
URL_USERS_REVIEWS = 'datasets/output_steam_games.json'

In [219]:
df_games = pd.read_json(URL_STEAM_GAMES, lines=True)

In [220]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 12.9+ MB


#### 1. Limpiamos filas con valores todos nulos

In [221]:
df_games = df_games[df_games.isna().sum(axis=1) != 13]

In [222]:
len(df_games)

32135

Verificamos duplicados

In [223]:
df_games['hash'] = df_games.apply(lambda row: hashlib.md5(row.astype(str).values.tobytes()).hexdigest(), axis=1)

In [224]:
df_games[df_games['hash'].duplicated()]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer,hash


In [225]:
df_games = df_games.drop('hash', axis=1)

#### 2. Reseteamos el index

In [226]:
df_games = df_games.reset_index(drop=True)
# df_games = df_games.drop('index', axis=1);

In [227]:
showPie(df_games.publisher)

In [228]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 3.2+ MB


In [229]:
df_games = df_games.drop(['url', 'reviews_url', 'title'], axis=1)

In [230]:
df_games.loc[30961,'id'] = 200260
df_games.drop(74,inplace=True)

In [231]:
df_games[df_games.id.isna()]

Unnamed: 0,publisher,genres,app_name,release_date,tags,specs,price,early_access,id,developer


In [232]:
df_games['id'] =df_games['id'].astype('int').values

In [233]:
df_games.early_access = df_games.early_access.astype('bool').values

In [234]:
def isnumber(x):
    try:
        x = float(x)
        return x
    except:
        return 0

In [235]:
df_games.price = df_games.price.apply(isnumber)

In [240]:
df_games[df_games.genres.isna()]['genres'] = '[Generic]'

In [243]:
set_genres = set()

In [249]:
for i in df_games.index:
   try:
      for g in df_games.loc[i].genres:
         try:
            set_genres.add(g)
         except:
            pass
      for g in df_games.loc[i].tags:
         try:
            set_genres.add(g)
         except:
            pass
   except:
      pass

In [251]:
len(set_genres)

339

In [248]:
df_games.genres

0            [Action, Casual, Indie, Simulation, Strategy]
1                     [Free to Play, Indie, RPG, Strategy]
2        [Casual, Free to Play, Indie, Simulation, Sports]
3                              [Action, Adventure, Casual]
4                                                     None
                               ...                        
32130                [Casual, Indie, Simulation, Strategy]
32131                            [Casual, Indie, Strategy]
32132                          [Indie, Racing, Simulation]
32133                                      [Casual, Indie]
32134                                                 None
Name: genres, Length: 32134, dtype: object

In [214]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32134 entries, 0 to 32134
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        32134 non-null  object 
 2   app_name      32133 non-null  object 
 3   release_date  30068 non-null  object 
 4   tags          31972 non-null  object 
 5   specs         31465 non-null  object 
 6   price         32134 non-null  float64
 7   early_access  32134 non-null  bool   
 8   id            32134 non-null  int32  
 9   developer     28836 non-null  object 
dtypes: bool(1), float64(1), int32(1), object(7)
memory usage: 3.4+ MB


In [258]:
df_games[df_games['genres'] == "['generic']"]

Unnamed: 0,publisher,genres,app_name,release_date,tags,specs,price,early_access,id,developer


In [261]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Sample idioms (replace with your own idioms)
idioms = list(set_genres)

# Step 1: Vectorize the idioms using TF-IDF
vectorizer = TfidfVectorizer()
idiom_vectors = vectorizer.fit_transform(idioms)

# Step 2: Calculate cosine similarity
cosine_sim_matrix = cosine_similarity(idiom_vectors)

# Step 3: Apply K-Means Clustering
num_clusters = 100  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(cosine_sim_matrix)

# Get cluster labels for idioms
cluster_labels = kmeans.labels_

# Print the idioms along with their cluster labels
for idiom, label in zip(idioms, cluster_labels):
    print(f"{idiom} : Cluster {label}")


Space : Cluster 18
Perma Death : Cluster 62
Supernatural : Cluster 62
Dark : Cluster 35
Demons : Cluster 68
Building : Cluster 41
Post-apocalyptic : Cluster 94
1980s : Cluster 62
Hand-drawn : Cluster 62
Tanks : Cluster 63
Batman : Cluster 62
Board Game : Cluster 1
Crime : Cluster 62
Word Game : Cluster 1
4X : Cluster 58
Arcade : Cluster 62
Movie : Cluster 62
Time Attack : Cluster 5
2D Fighter : Cluster 29
Design & Illustration : Cluster 28
Classic : Cluster 79
Comic Book : Cluster 62
Dungeon Crawler : Cluster 77
Gothic : Cluster 62
Platformer : Cluster 4
Web Publishing : Cluster 62
Modern : Cluster 62
Sokoban : Cluster 19
Mouse only : Cluster 62
Steam Machine : Cluster 62
Mod : Cluster 75
Psychedelic : Cluster 62
Parkour : Cluster 48
On-Rails Shooter : Cluster 8
Vampire : Cluster 62
Conspiracy : Cluster 62
Cartoon : Cluster 62
Shoot 'Em Up : Cluster 30
NSFW : Cluster 62
1990's : Cluster 31
Relaxing : Cluster 62
Golf : Cluster 26
Medieval : Cluster 62
Cinematic : Cluster 62
Satire : Clu

In [264]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

# Sample idioms (replace with your own idioms)
idioms = [
    "kick the bucket",
    "bite the bullet",
    "hit the sack",
    # Add more idioms...
]

# Load pre-trained word2vec model (replace 'path_to_word2vec_model' with your file path)
model = Word2Vec.load('path_to_word2vec_model')

# Calculate embeddings for each idiom
idiom_embeddings = [model.wv[idiom.split()] for idiom in idioms]

# Apply K-Means Clustering
num_clusters = 5  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(idiom_embeddings)

# Get cluster labels for idioms
cluster_labels = kmeans.labels_

# Print the idioms along with their cluster labels
for idiom, label in zip(idioms, cluster_labels):
    print(f"{idiom} : Cluster {label}")

FileNotFoundError: [Errno 2] No such file or directory: 'path_to_word2vec_model'

In [267]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# Sample idioms (replace with your own idioms)
idioms = list(set_genres)

# Step 1: Create the Bag-of-Words Matrix
vectorizer = CountVectorizer()
bag_of_words_matrix = vectorizer.fit_transform(idioms)

# Step 2: Apply K-Means Clustering
num_clusters = 100  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(bag_of_words_matrix)

# Get cluster labels for idioms
cluster_labels = kmeans.labels_

# Print the idioms along with their cluster labels
for idiom, label in zip(idioms, cluster_labels):
    print(f"{idiom} : Cluster {label}")


Space : Cluster 0
Perma Death : Cluster 0
Supernatural : Cluster 0
Dark : Cluster 2
Demons : Cluster 0
Building : Cluster 62
Post-apocalyptic : Cluster 0
1980s : Cluster 0
Hand-drawn : Cluster 17
Tanks : Cluster 0
Batman : Cluster 0
Board Game : Cluster 1
Crime : Cluster 0
Word Game : Cluster 1
4X : Cluster 0
Arcade : Cluster 0
Movie : Cluster 0
Time Attack : Cluster 5
2D Fighter : Cluster 37
Design & Illustration : Cluster 25
Classic : Cluster 50
Comic Book : Cluster 98
Dungeon Crawler : Cluster 39
Gothic : Cluster 0
Platformer : Cluster 6
Web Publishing : Cluster 0
Modern : Cluster 74
Sokoban : Cluster 0
Mouse only : Cluster 71
Steam Machine : Cluster 0
Mod : Cluster 0
Psychedelic : Cluster 73
Parkour : Cluster 0
On-Rails Shooter : Cluster 35
Vampire : Cluster 0
Conspiracy : Cluster 0
Cartoon : Cluster 0
Shoot 'Em Up : Cluster 19
NSFW : Cluster 0
1990's : Cluster 0
Relaxing : Cluster 0
Golf : Cluster 99
Medieval : Cluster 0
Cinematic : Cluster 0
Satire : Cluster 0
Abstract : Cluster 