# **Machine Learning - Proyecto en clase: Games**

# Librerías

## Librerías generales

In [1]:
import pandas as pd
import numpy as numpy
import time
import joblib
import os

## Librerías de Machine Learning

In [60]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Cargar datos limpios

In [2]:
# Resolve the dataset location relative to the project root instead of a
# user-specific absolute path, so the notebook runs on any machine/OS.
# Assumes the kernel's working directory is the notebook folder, one level
# below the project root (the same assumption the model-saving cells make).
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
ruta = os.path.join(PROJECT_ROOT, "data", "processed", "games_clean.csv")

In [5]:
# Load the cleaned games dataset from the CSV file located at `ruta`.
games_clean = pd.read_csv(ruta)

In [6]:
games_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8296 entries, 0 to 8295
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   videogame_names            8296 non-null   object 
 1   platform                   8296 non-null   object 
 2   year_of_release            8296 non-null   int64  
 3   genre                      8296 non-null   object 
 4   na_sales                   8296 non-null   float64
 5   eu_sales                   8296 non-null   float64
 6   jp_sales                   8296 non-null   float64
 7   other_sales                8296 non-null   float64
 8   critic_score               8296 non-null   float64
 9   user_score                 8296 non-null   float64
 10  rating_esrb                8296 non-null   object 
 11  total_sales                8296 non-null   float64
 12  gen_platform               8296 non-null   object 
 13  classification_user_score  8296 non-null   objec

# Random Forest

## Preparar los datos para el modelo

In [7]:
games_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8296 entries, 0 to 8295
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   videogame_names            8296 non-null   object 
 1   platform                   8296 non-null   object 
 2   year_of_release            8296 non-null   int64  
 3   genre                      8296 non-null   object 
 4   na_sales                   8296 non-null   float64
 5   eu_sales                   8296 non-null   float64
 6   jp_sales                   8296 non-null   float64
 7   other_sales                8296 non-null   float64
 8   critic_score               8296 non-null   float64
 9   user_score                 8296 non-null   float64
 10  rating_esrb                8296 non-null   object 
 11  total_sales                8296 non-null   float64
 12  gen_platform               8296 non-null   object 
 13  classification_user_score  8296 non-null   objec

In [8]:
# Feature columns grouped by type: the categorical ones are one-hot encoded
# later, the numeric ones are passed to the models unchanged.
col_numericas = ["year_of_release", "user_score", "critic_score"]
col_categoricas = [
    "platform",
    "genre",
    "rating_esrb",
    "gen_platform",
    "classification_user_score",
]

In [9]:
# Name of the column the models will learn to predict.
target = "total_sales"

In [10]:
# Target vector first, then the two feature groups (numeric and categorical)
# selected from the cleaned frame.
y = games_clean[target]
X_numericas = games_clean[col_numericas]
X_categoricas = games_clean[col_categoricas]

In [11]:
X_categoricas.head()

Unnamed: 0,platform,genre,rating_esrb,gen_platform,classification_user_score
0,Wii,Sports,E,7ª Gen,Good
1,Wii,Racing,E,7ª Gen,Good
2,Wii,Sports,E,7ª Gen,Good
3,DS,Platform,E,7ª Gen,Excellent
4,Wii,Misc,E,7ª Gen,Regular


In [12]:
X_numericas.head()

Unnamed: 0,year_of_release,user_score,critic_score
0,2006,8.0,76.0
1,2008,8.3,82.0
2,2009,8.0,80.0
3,2006,8.5,89.0
4,2006,6.6,58.0


In [13]:
y

0       82.54
1       35.52
2       32.77
3       29.80
4       28.91
        ...  
8291     0.01
8292     0.01
8293     0.01
8294     0.01
8295     0.01
Name: total_sales, Length: 8296, dtype: float64

## Aplicación de One-Hot Encoding

In [14]:
# One-hot encoder that returns a dense array and ignores categories it did not
# see during fitting instead of raising an error.
encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown="ignore",
)
encoder

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [15]:
# Learn the categories of each categorical column and one-hot encode them in one step.
# NOTE(review): the encoder is fitted on the full dataset, before the train/test
# split performed later in the notebook, so categories that only appear in test
# rows are also learned here — consider fitting on the training rows only.
X_categoricas_encoded = encoder.fit_transform(X_categoricas)

In [16]:
X_categoricas_encoded

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.]], shape=(8296, 55))

In [17]:
# Names of the generated one-hot columns: one per category, prefixed with the
# name of the source column (e.g. `platform_Wii`).
nuevas_columnas = encoder.get_feature_names_out(col_categoricas)
nuevas_columnas

array(['platform_2600', 'platform_3DS', 'platform_DC', 'platform_DS',
       'platform_GB', 'platform_GBA', 'platform_GC', 'platform_GEN',
       'platform_N64', 'platform_NES', 'platform_PC', 'platform_PS',
       'platform_PS2', 'platform_PS3', 'platform_PS4', 'platform_PSP',
       'platform_PSV', 'platform_SAT', 'platform_SNES', 'platform_Wii',
       'platform_WiiU', 'platform_X360', 'platform_XB', 'platform_XOne',
       'genre_Action', 'genre_Adventure', 'genre_Fighting', 'genre_Misc',
       'genre_Platform', 'genre_Puzzle', 'genre_Racing',
       'genre_Role-Playing', 'genre_Shooter', 'genre_Simulation',
       'genre_Sports', 'genre_Strategy', 'rating_esrb_AO',
       'rating_esrb_E', 'rating_esrb_E10+', 'rating_esrb_K-A',
       'rating_esrb_M', 'rating_esrb_RP', 'rating_esrb_T',
       'gen_platform_3ª Gen', 'gen_platform_4ª Gen',
       'gen_platform_5ª Gen', 'gen_platform_6ª Gen',
       'gen_platform_7ª Gen', 'gen_platform_8ª Gen',
       'gen_platform_Otras/Retro', 'gen

In [18]:
# Wrap the encoded NumPy matrix in a DataFrame labelled with the one-hot column names.
games_encoded = pd.DataFrame(X_categoricas_encoded, columns=nuevas_columnas)

In [19]:
print(f"Número de filas x columnas: {games_encoded.shape}")
display(games_encoded.head())

Número de filas x columnas: (8296, 55)


Unnamed: 0,platform_2600,platform_3DS,platform_DC,platform_DS,platform_GB,platform_GBA,platform_GC,platform_GEN,platform_N64,platform_NES,...,gen_platform_5ª Gen,gen_platform_6ª Gen,gen_platform_7ª Gen,gen_platform_8ª Gen,gen_platform_Otras/Retro,gen_platform_PC,classification_user_score_Bad,classification_user_score_Excellent,classification_user_score_Good,classification_user_score_Regular
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
X_numericas.head()

Unnamed: 0,year_of_release,user_score,critic_score
0,2006,8.0,76.0
1,2008,8.3,82.0
2,2009,8.0,80.0
3,2006,8.5,89.0
4,2006,6.6,58.0


In [21]:
# Build the final feature matrix: numeric columns followed by the one-hot columns.
# `games_encoded` was created from a bare array and therefore has a fresh
# 0..n-1 index; resetting the numeric frame's index guarantees the two frames
# are aligned row by row when concatenated side by side.
X = pd.concat([X_numericas.reset_index(drop=True), games_encoded], axis=1)
X.head()

Unnamed: 0,year_of_release,user_score,critic_score,platform_2600,platform_3DS,platform_DC,platform_DS,platform_GB,platform_GBA,platform_GC,...,gen_platform_5ª Gen,gen_platform_6ª Gen,gen_platform_7ª Gen,gen_platform_8ª Gen,gen_platform_Otras/Retro,gen_platform_PC,classification_user_score_Bad,classification_user_score_Excellent,classification_user_score_Good,classification_user_score_Regular
0,2006,8.0,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2008,8.3,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2009,8.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2006,8.5,89.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2006,6.6,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
y.head()

0    82.54
1    35.52
2    32.77
3    29.80
4    28.91
Name: total_sales, dtype: float64

## Dividir los datos

In [23]:
len(games_clean)

8296

In [24]:
# define the variables used to split the data
RANDOM_STATE = 50  # seed reused by train_test_split and by every model below
TEST_SIZE = 0.25  # fraction of the rows held out for testing

In [25]:
# Hold out TEST_SIZE of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

In [26]:
print(f"Tamaño X_train: {X_train.shape}")
print(f"Tamaño X_test: {X_test.shape}")
print(f"Tamaño y_train: {y_train.shape}")
print(f"Tamaño y_test: {y_test.shape}")

Tamaño X_train: (6222, 58)
Tamaño X_test: (2074, 58)
Tamaño y_train: (6222,)
Tamaño y_test: (2074,)


In [27]:
X_train.head()

Unnamed: 0,year_of_release,user_score,critic_score,platform_2600,platform_3DS,platform_DC,platform_DS,platform_GB,platform_GBA,platform_GC,...,gen_platform_5ª Gen,gen_platform_6ª Gen,gen_platform_7ª Gen,gen_platform_8ª Gen,gen_platform_Otras/Retro,gen_platform_PC,classification_user_score_Bad,classification_user_score_Excellent,classification_user_score_Good,classification_user_score_Regular
2686,2009,8.6,91.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5305,2005,7.1,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3139,2004,8.9,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
550,2007,7.5,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5583,2005,8.9,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Entrenar el modelo

In [28]:
# Random Forest configuration: 100 trees, parallel training (n_jobs=-1), the
# shared seed, and the out-of-bag score enabled so `oob_score_` can be read
# after fitting.
modelo = RandomForestRegressor(
    oob_score=True,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    n_estimators=100,
)

In [29]:
# fit trains the model on the training split
modelo.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
print(f"OOB Score (R2 estimado): {modelo.oob_score_}")

OOB Score (R2 estimado): 0.24970495384071423


## Evaluar el modelo

### RMSE

In [31]:
# predict generates the model's predictions for the held-out test features
predicciones = modelo.predict(X_test)

In [32]:
# Root mean squared error of the Random Forest predictions on the test split.
rmse = root_mean_squared_error(y_test, predicciones)
rmse

2.342057180748737

In [33]:
# Side-by-side table of actual vs. predicted sales for the test rows; the index
# is reset so rows are numbered from 0 instead of keeping the shuffled labels.
# NOTE(review): the name `df_comparacion` is reassigned later to the
# model-metrics table, so this frame is lost once that cell runs.
df_comparacion = pd.DataFrame({"Datos_Reales": y_test, "Predicción": predicciones}).reset_index(drop=True)

In [34]:
df_comparacion.head(20)

Unnamed: 0,Datos_Reales,Predicción
0,0.13,0.105
1,0.53,0.3069
2,0.11,0.2225
3,0.86,1.107
4,2.11,2.479
5,1.3,1.3471
6,0.34,0.1788
7,0.14,0.6545
8,0.11,0.327
9,0.04,0.0211


1. Seleccionamos las variables numéricas y categóricas.
2. Las categóricas las convertimos en una matriz de 1 y 0 (OneHotEncoder).
3. Dividimos los datos: Datos de Entrenamiento (75%) y Datos de Prueba (25%).
4. Entrenamos el modelo (cualquier modelo de ML) con los datos de entrenamiento: X (variables independientes o predictoras) e y (variable dependiente u objetivo).
5. Predecimos sobre los datos de prueba usando solo X (variables independientes).
6. Calculamos métricas de qué tan bueno es el modelo prediciendo los datos.
6.1 Comparamos los datos reales vs. la predicción.

### MAE

In [35]:
# Mean absolute error of the Random Forest predictions on the test split.
mae_random_forest = mean_absolute_error(y_test, predicciones)
print(f"MAE (Random Forest): {mae_random_forest:.6f}")

MAE (Random Forest): 0.649060


### R-Cuadrado - Coeficiente de determinación

In [36]:
# Coefficient of determination (R²) of the Random Forest predictions on the test split.
r2_random_forest = r2_score(y_test, predicciones)

print(f"R2 (Random Forest): {r2_random_forest:.6f}")

R2 (Random Forest): 0.106264


### MAPE - Error Porcentual Absoluto Medio

In [37]:
# Mean absolute percentage error of the Random Forest predictions on the test split.
# The metric is returned as a fraction, so it is multiplied by 100 to print a percentage.
# NOTE(review): the value is very large, presumably because many targets are
# close to zero (e.g. 0.01), which inflates relative errors — confirm.
mape_random_forest = mean_absolute_percentage_error(y_test, predicciones)

print(f"MAPE (Random Forest): {mape_random_forest*100:.6f}")

MAPE (Random Forest): 360.474573


#### Comparación métricas

In [38]:
# Collect the four Random Forest test metrics into a labelled Series so they
# can be placed side by side with the other models later.
# MAPE is stored here as a fraction (the MAPE cell prints it multiplied by 100).
index_metricas = ["RMSE", "MAE", "R-cuadrado", "MAPE"]
data_rf = [rmse, mae_random_forest, r2_random_forest, mape_random_forest]

metricas_rf = pd.Series(data_rf, index=index_metricas, name="Random Forest")

print("Métricas Random Forest", metricas_rf, sep="\n")

Métricas Random Forest
RMSE          2.342057
MAE           0.649060
R-cuadrado    0.106264
MAPE          3.604746
Name: Random Forest, dtype: float64


# LightGBM

In [71]:
modelo_lgbm = LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1)

In [95]:
modelo_lgbm

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [72]:
modelo_lgbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 308
[LightGBM] [Info] Number of data points in the train set: 6222, number of used features: 45
[LightGBM] [Info] Start training from score 0.695715


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [73]:
predicciones_lgbm = modelo_lgbm.predict(X_test)

In [74]:
rmse_lgbm = root_mean_squared_error(y_test, predicciones_lgbm)
rmse_lgbm

2.304460911366699

In [75]:
# Mean absolute error of the default LightGBM predictions on the test split.
# The label names LightGBM: the value is computed from `predicciones_lgbm`,
# not from the Random Forest predictions.
mae_lgbm = mean_absolute_error(y_test, predicciones_lgbm)
print(f"MAE (LGBM): {mae_lgbm:.6f}")

MAE (Random Forest): 0.618871


In [76]:
# Coefficient of determination (R²) of the default LightGBM predictions on the
# test split. The label names LightGBM: the value is computed from
# `predicciones_lgbm`, not from the Random Forest predictions.
r2_lgbm = r2_score(y_test, predicciones_lgbm)

print(f"R2 (LGBM): {r2_lgbm:.6f}")

R2 (Random Forest): 0.134727


In [77]:
# Mean absolute percentage error of the default LightGBM predictions on the
# test split; the metric is a fraction, so it is multiplied by 100 for display.
# The label names LightGBM: the value is computed from `predicciones_lgbm`,
# not from the Random Forest predictions.
mape_lgbm = mean_absolute_percentage_error(y_test, predicciones_lgbm)

print(f"MAPE (LGBM): {mape_lgbm*100:.6f}")

MAPE (Random Forest): 320.086676


In [78]:
# Collect the four default-LightGBM test metrics into a labelled Series.
# MAPE is stored here as a fraction (the MAPE cell prints it multiplied by 100).
index_metricas = ["RMSE", "MAE", "R-cuadrado", "MAPE"]
data_lgbm = [rmse_lgbm, mae_lgbm, r2_lgbm, mape_lgbm]

metricas_lgbm = pd.Series(data_lgbm, index=index_metricas, name="LGBM")

print("Métricas LGBM", metricas_lgbm, sep="\n")

Métricas LGBM
RMSE          2.304461
MAE           0.618871
R-cuadrado    0.134727
MAPE          3.200867
Name: LGBM, dtype: float64


# XGBoost

In [49]:
modelo_xgb = XGBRegressor(random_state = RANDOM_STATE, n_jobs=-1)

In [50]:
modelo_xgb.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [51]:
predicciones_xgb = modelo_xgb.predict(X_test)

In [52]:
rmse_xgb = root_mean_squared_error(y_test, predicciones_xgb)
rmse_xgb

2.341065457242777

In [54]:
# Mean absolute error of the XGBoost predictions on the test split.
mae_xgb = mean_absolute_error(y_test, predicciones_xgb)
print(f"MAE (xgb): {mae_xgb:.6f}")

MAE (xgb): 0.630548


In [55]:
# Coefficient of determination (R²) of the XGBoost predictions on the test split.
r2_xgb = r2_score(y_test, predicciones_xgb)

print(f"R2 (xgb): {r2_xgb:.6f}")

R2 (xgb): 0.107021


In [56]:
# Mean absolute percentage error of the XGBoost predictions on the test split;
# the metric is a fraction, so it is multiplied by 100 for display.
mape_xgb = mean_absolute_percentage_error(y_test, predicciones_xgb)

print(f"MAPE (xgb): {mape_xgb*100:.6f}")

MAPE (xgb): 331.401799


In [57]:
# Collect the four XGBoost test metrics into a labelled Series.
# MAPE is stored here as a fraction (the MAPE cell prints it multiplied by 100).
index_metricas = ["RMSE", "MAE", "R-cuadrado", "MAPE"]
data_xgb = [rmse_xgb, mae_xgb, r2_xgb, mape_xgb]

metricas_xgb = pd.Series(data_xgb, index=index_metricas, name="XGB")

print("Métricas XGB", metricas_xgb, sep="\n")

Métricas XGB
RMSE          2.341065
MAE           0.630548
R-cuadrado    0.107021
MAPE          3.314018
Name: XGB, dtype: float64


# Comparación de modelos

In [59]:
# Put the metric Series of the three models side by side: one column per
# model, one row per metric.
# NOTE(review): this reuses the name `df_comparacion`, which earlier held the
# actual-vs-predicted table of the Random Forest.
series_por_modelo = [metricas_rf, metricas_lgbm, metricas_xgb]
df_comparacion = pd.concat(series_por_modelo, axis="columns")

display(df_comparacion)

Unnamed: 0,Random Forest,LGBM,XGB
RMSE,2.342057,2.304461,2.341065
MAE,0.64906,0.618871,0.630548
R-cuadrado,0.106264,0.134727,0.107021
MAPE,3.604746,3.200867,3.314018


# GridSearchCV

In [None]:
# Hyper-parameter grid explored by the search: two values for each of four
# parameters, i.e. 2 * 2 * 2 * 2 = 16 candidate combinations.
# max_depth=-1 is the value the default LightGBM model uses.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.05],
    max_depth=[10, -1],
    num_leaves=[31, 50],
)

In [62]:
# Fresh LightGBM regressor used as the base estimator of the grid search
# (same seed as the other models in the notebook).
lgbm = LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1)

In [64]:
# Exhaustive search over `param_grid` with 6-fold cross-validation, scored by
# negated RMSE (scikit-learn maximises scores, so the lowest RMSE wins).
# NOTE(review): the estimator and the search both request n_jobs=-1, which may
# oversubscribe the CPU — consider n_jobs=1 on the estimator; confirm by timing.
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    cv=6,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
)

# Time the search with a monotonic, high-resolution clock: time.time() follows
# the wall clock and can jump if the system time is adjusted mid-run.
print("Iniciando GridSearchCV...")

start_time = time.perf_counter()
grid_search.fit(X_train, y_train)
end_time = time.perf_counter()

print(f"GridSearchCV completo en {end_time - start_time:.2f} segundos")

Iniciando GridSeachCV...
Fitting 6 folds for each of 16 candidates, totalling 96 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 308
[LightGBM] [Info] Number of data points in the train set: 6222, number of used features: 45
[LightGBM] [Info] Start training from score 0.695715
GridSearchCV completo en 57.11 segundos


In [65]:
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)

Mejores parámetros encontrados:
{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'num_leaves': 31}


In [66]:
best_lgbm_model = grid_search.best_estimator_
best_lgbm_model

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,10
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [67]:
predicciones_best_lgbm = best_lgbm_model.predict(X_test)

In [83]:
rmse_best_lgbm = root_mean_squared_error(y_test, predicciones_best_lgbm)
rmse_best_lgbm

2.3112837494378744

In [84]:
# Mean absolute error of the grid-search-selected LightGBM model on the test split.
mae_best_lgbm = mean_absolute_error(y_test, predicciones_best_lgbm)
print(f"MAE (lgbm): {mae_best_lgbm:.6f}")

MAE (lgbm): 0.621734


In [85]:
# Coefficient of determination (R²) of the grid-search-selected LightGBM model on the test split.
r2_best_lgbm = r2_score(y_test, predicciones_best_lgbm)

print(f"R2 (lgbm): {r2_best_lgbm:.6f}")

R2 (lgbm): 0.129596


In [86]:
# Mean absolute percentage error of the grid-search-selected LightGBM model on
# the test split; the metric is a fraction, so it is multiplied by 100 for
# display. The label says "lgbm" to match the MAE and R2 cells of this section
# (the value does not come from the XGBoost model).
mape_best_lgbm = mean_absolute_percentage_error(y_test, predicciones_best_lgbm)

print(f"MAPE (lgbm): {mape_best_lgbm*100:.6f}")

MAPE (xgb): 318.156540


In [88]:
# Collect the test metrics of the tuned LightGBM model into a labelled Series.
# MAPE is stored here as a fraction (the MAPE cell prints it multiplied by 100).
data_lgbm_optimizado = [
    rmse_best_lgbm,
    mae_best_lgbm,
    r2_best_lgbm,
    mape_best_lgbm,
]

# The labels are defined under the name the Series actually reads, so this
# cell does not depend on `index_metricas` surviving from an earlier cell.
index_metricas = ["RMSE", "MAE", "R-cuadrado", "MAPE"]

metricas_lgbm_optimizado = pd.Series(
    data=data_lgbm_optimizado,
    index=index_metricas,
    name="LGBM (Optimizado)",
)

In [89]:
# Default vs. tuned LightGBM metrics side by side: one column per model.
series_lgbm = [metricas_lgbm, metricas_lgbm_optimizado]
df_comparacion_lgbm = pd.concat(series_lgbm, axis="columns")

In [90]:
df_comparacion_lgbm

Unnamed: 0,LGBM,LGBM (Optimizado)
RMSE,2.304461,2.311284
MAE,0.618871,0.621734
R-cuadrado,0.134727,0.129596
MAPE,3.200867,3.181565


# Guardar archivos del mejor modelo

In [92]:
# Directory the kernel is currently running in; used below as the anchor for
# locating the project root.
NOTEBOOK_DIR = os.getcwd()
NOTEBOOK_DIR

'c:\\Users\\fnaje\\OneDrive\\Documents\\UniAndes\\2do Seminario\\seminario-proyecto-demo-games\\script_prueba'

In [93]:
# The project root is the parent of the notebook directory; the model
# artifacts live in its `models` sub-folder.
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, os.pardir))
MODEL_DIR = os.path.join(PROJECT_ROOT, "models")

# Destination files for the fitted encoder and the LightGBM regressor.
MODEL_PATH = os.path.join(MODEL_DIR, "lgbm_regressor_default.joblib")
ENCODER_PATH = os.path.join(MODEL_DIR, "onehot_encoder.joblib")

In [94]:
# Make sure the models folder exists, then persist the fitted encoder and the
# default-parameter LightGBM model (`modelo_lgbm`, not the grid-search winner).
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(encoder, ENCODER_PATH)
joblib.dump(modelo_lgbm, MODEL_PATH)

['c:\\Users\\fnaje\\OneDrive\\Documents\\UniAndes\\2do Seminario\\seminario-proyecto-demo-games\\models\\lgbm_regressor_default.joblib']

# Verificación de información del encoder

In [3]:
import joblib
import os

In [5]:
# Locate the saved encoder relative to the project root rather than through a
# user-specific absolute path, so the check works on any machine/OS.
# NOTE(review): assumes the working directory is a folder directly under the
# project root (as in the training section) — confirm if this runs elsewhere.
encoder_path = os.path.join(
    os.path.abspath(os.path.join(os.getcwd(), "..")),
    "models",
    "onehot_encoder.joblib",
)

In [6]:
# Load the persisted encoder back from disk.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
encoder = joblib.load(encoder_path)

In [7]:
print(type(encoder))

<class 'sklearn.preprocessing._encoders.OneHotEncoder'>


In [8]:
encoder.categories_

[array(['2600', '3DS', 'DC', 'DS', 'GB', 'GBA', 'GC', 'GEN', 'N64', 'NES',
        'PC', 'PS', 'PS2', 'PS3', 'PS4', 'PSP', 'PSV', 'SAT', 'SNES',
        'Wii', 'WiiU', 'X360', 'XB', 'XOne'], dtype=object),
 array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
        'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
        'Strategy'], dtype=object),
 array(['AO', 'E', 'E10+', 'K-A', 'M', 'RP', 'T'], dtype=object),
 array(['3ª Gen', '4ª Gen', '5ª Gen', '6ª Gen', '7ª Gen', '8ª Gen',
        'Otras/Retro', 'PC'], dtype=object),
 array(['Bad', 'Excellent', 'Good', 'Regular'], dtype=object)]

In [10]:
print(list(encoder.categories_[0]))

['2600', '3DS', 'DC', 'DS', 'GB', 'GBA', 'GC', 'GEN', 'N64', 'NES', 'PC', 'PS', 'PS2', 'PS3', 'PS4', 'PSP', 'PSV', 'SAT', 'SNES', 'Wii', 'WiiU', 'X360', 'XB', 'XOne']


In [11]:
print(list(encoder.categories_[1]))

['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle', 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports', 'Strategy']


In [12]:
print(list(encoder.categories_[2]))

['AO', 'E', 'E10+', 'K-A', 'M', 'RP', 'T']
