In [126]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [127]:
# Copiar el archivo a tu entorno de trabajo
!cp /content/drive/MyDrive/Steam_2024_bestRevenue_1500.csv /content/

In [128]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Leer el archivo CSV
df = pd.read_csv('/content/Steam_2024_bestRevenue_1500.csv')

# Mostrar las primeras filas del DataFrame
df.head()

Unnamed: 0,name,releaseDate,copiesSold,price,revenue,avgPlaytime,reviewScore,publisherClass,publishers,developers,steamId
0,WWE 2K24,07-03-2024,165301,99.99,8055097.0,42.36514,71,AAA,2K,Visual Concepts,2315690
1,EARTH DEFENSE FORCE 6,25-07-2024,159806,59.99,7882151.0,29.651061,57,Indie,D3PUBLISHER,SANDLOT,2291060
2,Sins of a Solar Empire II,15-08-2024,214192,49.99,7815247.0,12.452593,88,Indie,Stardock Entertainment,"Ironclad Games Corporation,Stardock Entertainment",1575940
3,Legend of Mortal,14-06-2024,440998,19.99,7756399.0,24.797817,76,Indie,"Paras Games,Obb Studio Inc.",Obb Studio Inc.,1859910
4,Shin Megami Tensei V: Vengeance,13-06-2024,141306,59.99,7629252.0,34.258496,96,AA,SEGA,ATLUS,1875830


In [129]:
# Ver el número de observaciones (filas) y columnas
df.shape

(1500, 11)

In [130]:
# Obtener estadísticas descriptivas para las columnas numéricas
df.describe()

Unnamed: 0,copiesSold,price,revenue,avgPlaytime,reviewScore,steamId
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,141482.6,17.519513,2632382.0,12.562704,76.201333,2183788.0
std,1132757.0,12.646612,27810240.0,21.542173,24.319438,606772.5
min,593.0,0.0,20674.0,0.0,0.0,24880.0
25%,4918.75,9.99,45504.25,3.564848,72.0,1792795.0
50%,11928.5,14.99,109053.0,6.762776,83.0,2321985.0
75%,37869.75,19.99,455156.8,13.104473,92.0,2693228.0
max,30739150.0,99.99,837793400.0,296.332852,100.0,3107330.0


In [131]:
# Ver los tipos de datos de cada columna
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1500 non-null   object 
 1   releaseDate     1500 non-null   object 
 2   copiesSold      1500 non-null   int64  
 3   price           1500 non-null   float64
 4   revenue         1500 non-null   float64
 5   avgPlaytime     1500 non-null   float64
 6   reviewScore     1500 non-null   int64  
 7   publisherClass  1500 non-null   object 
 8   publishers      1499 non-null   object 
 9   developers      1498 non-null   object 
 10  steamId         1500 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 129.0+ KB


In [132]:
# Contar los valores únicos en columnas categóricas
print(df['publisherClass'].value_counts())
print(df['publishers'].value_counts())

publisherClass
Indie       1301
AA           146
AAA           52
Hobbyist       1
Name: count, dtype: int64
publishers
Kagura Games                     17
Electronic Arts                  16
072 Project                      14
Ubisoft                          13
Mango Party,Mango Party News     11
                                 ..
BLAMCAM Interactive               1
Soda Game Studio                  1
Frogstorm                         1
Significant Steak                 1
Red Nexus Games Inc.,IndieArk     1
Name: count, Length: 1131, dtype: int64


In [133]:
# Seleccionar solo las columnas numéricas
numerical_df = df.select_dtypes(include=['float64', 'int64'])

# Calcular la matriz de correlación
correlation_matrix = numerical_df.corr()

# Mostrar la matriz de correlación
correlation_matrix

Unnamed: 0,copiesSold,price,revenue,avgPlaytime,reviewScore,steamId
copiesSold,1.0,0.056224,0.6277,0.099065,0.008584,-0.056482
price,0.056224,1.0,0.162521,0.029053,-0.035025,-0.266997
revenue,0.6277,0.162521,1.0,0.082426,0.007383,-0.076866
avgPlaytime,0.099065,0.029053,0.082426,1.0,-0.03198,-0.118001
reviewScore,0.008584,-0.035025,0.007383,-0.03198,1.0,0.004844
steamId,-0.056482,-0.266997,-0.076866,-0.118001,0.004844,1.0


In [134]:
# Mostrar la matriz de correlación
print(correlation_matrix)

             copiesSold     price   revenue  avgPlaytime  reviewScore  \
copiesSold     1.000000  0.056224  0.627700     0.099065     0.008584   
price          0.056224  1.000000  0.162521     0.029053    -0.035025   
revenue        0.627700  0.162521  1.000000     0.082426     0.007383   
avgPlaytime    0.099065  0.029053  0.082426     1.000000    -0.031980   
reviewScore    0.008584 -0.035025  0.007383    -0.031980     1.000000   
steamId       -0.056482 -0.266997 -0.076866    -0.118001     0.004844   

              steamId  
copiesSold  -0.056482  
price       -0.266997  
revenue     -0.076866  
avgPlaytime -0.118001  
reviewScore  0.004844  
steamId      1.000000  


In [135]:
# Eliminar columnas categóricas no deseadas excepto 'publisherClass'
df = df.drop(['name', 'releaseDate', 'publishers', 'developers', 'steamId'], axis=1)

In [136]:
# Aplicar transformación logarítmica a características con alta variabilidad
df['log_copiesSold'] = np.log1p(df['copiesSold'])

In [137]:
# Realizar One-Hot Encoding a la columna 'publisherClass'
df_encoded = pd.get_dummies(df, columns=['publisherClass'], drop_first=True)

In [138]:
# Definir las características (X) y la variable objetivo continua (y)
X = df_encoded.drop('revenue', axis=1)  # Todas las columnas menos 'revenue'
y = df_encoded['revenue']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X,y)


In [155]:
#Se importa Mlflow para registrar los experimentos
!pip install mlflow
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error



In [153]:
mlflow.end_run()

In [180]:
# Configura la URI de MLflow para apuntar a tu servidor remoto
mlflow.set_tracking_uri("http://44.212.55.255:5000")


In [181]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Configurar el experimento en MLflow
experiment_name = "sklearn-diab"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

# Escalar las características numéricas
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Configurar la búsqueda en cuadrícula y encontrar los mejores parámetros
param_grid_rf_regressor = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3]
}
rf_regressor = RandomForestRegressor(random_state=42)
grid_search_rf_regressor = GridSearchCV(estimator=rf_regressor, param_grid=param_grid_rf_regressor, cv=5, scoring='r2', n_jobs=-1)
grid_search_rf_regressor.fit(X_train_scaled, y_train)
best_rf_regressor = grid_search_rf_regressor.best_estimator_

# Registrar el experimento en MLflow
with mlflow.start_run(experiment_id=experiment_id):
    # Registrar los parámetros óptimos encontrados
    mlflow.log_param("n_estimators", best_rf_regressor.n_estimators)
    mlflow.log_param("max_depth", best_rf_regressor.max_depth)
    mlflow.log_param("min_samples_split", best_rf_regressor.min_samples_split)
    mlflow.log_param("min_samples_leaf", best_rf_regressor.min_samples_leaf)

    # Entrenar y hacer predicciones con el mejor modelo
    best_rf_regressor.fit(X_train_scaled, y_train)
    y_pred = best_rf_regressor.predict(X_test_scaled)

    # Calcular métricas y registrarlas en MLflow
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    print("MAE registrado:", mae)
    print("RMSE registrado:", rmse)
    print("R^2 registrado:", r2)

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Registrar el modelo en MLflow con un ejemplo de entrada
    input_example = pd.DataFrame([X_test.iloc[0]])
    mlflow.sklearn.log_model(best_rf_regressor, "random-forest-model", input_example=input_example)

print("Experimento registrado con éxito en MLflow.")


MAE registrado: 1140514.6313757
RMSE registrado: 12015473.727012673
R^2 registrado: 0.8160564956266907


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/10 00:44:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run sincere-deer-58 at: http://44.212.55.255:5000/#/experiments/904012168856538984/runs/63ea9952a12447529ebca9f5c7fc0211.
2024/11/10 00:44:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://44.212.55.255:5000/#/experiments/904012168856538984.


Experimento registrado con éxito en MLflow.


In [178]:
# Configurar el experimento en MLflow
experiment_name = "sklearn-diab"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

# Ejecutar el experimento
with mlflow.start_run(experiment_id=experiment_id):
    # Definir los parámetros del modelo
    n_estimators = 500
    max_depth = 20
    min_samples_split = 3
    min_samples_leaf = 2

    # Registrar los parámetros en MLflow
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("min_samples_split", min_samples_split)
    mlflow.log_param("min_samples_leaf", min_samples_leaf)

    # Crear y entrenar el modelo
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)

    # Realizar predicciones
    y_pred = model.predict(X_test)

    # Calcular métricas y registrarlas en MLflow
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    print("MAE registrado:", mae)
    print("RMSE registrado:", rmse)
    print("R^2 registrado:", r2)

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Registrar el modelo en MLflow
    mlflow.sklearn.log_model(model, "random-forest-model")

print("Experimento registrado con éxito en MLflow.")


MAE registrado: 2348099.8668317934
RMSE registrado: 22133098.46959896
R^2 registrado: 0.49351560163360086


2024/11/10 00:36:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run auspicious-fawn-702 at: http://44.212.55.255:5000/#/experiments/904012168856538984/runs/5036dd7e34ea4146bb42ad4138cf57c0.
2024/11/10 00:36:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://44.212.55.255:5000/#/experiments/904012168856538984.


Experimento registrado con éxito en MLflow.
