Modelo de prediccion para recibir el precio y RMSE

In [92]:
# importar librerias
import ast
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error
import numpy as np
import pickle

In [93]:
# Carga del dataframe
data = pd.read_csv("clean_games.csv", sep=';', encoding='utf-8')

In [94]:
# Info del dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            32133 non-null  float64
 1   publisher     32116 non-null  object 
 2   genres        28852 non-null  object 
 3   app_name      32135 non-null  object 
 4   tags          31972 non-null  object 
 5   specs         31465 non-null  object 
 6   early_access  32135 non-null  bool   
 7   developer     32135 non-null  object 
 8   sentiment     32135 non-null  object 
 9   metascore     32065 non-null  object 
 10  release_year  32135 non-null  float64
 11  free to play  32135 non-null  object 
 12  real_price    32135 non-null  float64
dtypes: bool(1), float64(3), object(9)
memory usage: 3.0+ MB


In [95]:
# Seleccionar las columnas deseadas
columnas_deseadas = ["release_year", "genres", "metascore", "real_price", "early_access"]

# Crear un nuevo DataFrame con las columnas deseadas
data_predic = data[columnas_deseadas].copy()


In [96]:
# Vereificamos
data_predic.tail()

Unnamed: 0,release_year,genres,metascore,real_price,early_access
32130,2018.0,"Casual, Indie, Simulation, Strategy",NO,1.99,False
32131,2018.0,"Casual, Indie, Strategy",NO,4.99,False
32132,2018.0,"Indie, Racing, Simulation",NO,1.99,False
32133,2017.0,"Casual, Indie",NO,4.99,False
32134,0.0,,NO,4.99,True


In [97]:
# Reemplazar los valores "NO" por NaN en la columna "metascore"
data_predic["metascore"] = data_predic["metascore"].replace("NO", pd.NA)

# Convertir la columna "metascore" a tipo numérico
data_predic["metascore"] = pd.to_numeric(data_predic["metascore"], errors='coerce')

# Eliminar filas con valores NaN en la columna "metascore" del DataFrame "df_reduced"
data_predic.dropna(subset=["metascore"], inplace=True)

In [98]:
data_predic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2607 entries, 28 to 32117
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   release_year  2607 non-null   float64
 1   genres        2545 non-null   object 
 2   metascore     2607 non-null   float64
 3   real_price    2607 non-null   float64
 4   early_access  2607 non-null   bool   
dtypes: bool(1), float64(3), object(1)
memory usage: 104.4+ KB


In [99]:
# Crear columnas separadas para cada género
generos_dummies = data_predic["genres"].str.get_dummies(sep=",")

# Concatenar las columnas generadas con el DataFrame original
data_predic = pd.concat([data_predic, generos_dummies], axis=1)

# Eliminar las columnas originales de "release_date" y "genres"
data_predic.drop(["genres"], axis=1, inplace=True)


In [100]:
data_predic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2607 entries, 28 to 32117
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   release_year            2607 non-null   float64
 1   metascore               2607 non-null   float64
 2   real_price              2607 non-null   float64
 3   early_access            2607 non-null   bool   
 4    Action                 2607 non-null   int64  
 5    Adventure              2607 non-null   int64  
 6    Casual                 2607 non-null   int64  
 7    Early Access           2607 non-null   int64  
 8    Free to Play           2607 non-null   int64  
 9    Indie                  2607 non-null   int64  
 10   Massively Multiplayer  2607 non-null   int64  
 11   RPG                    2607 non-null   int64  
 12   Racing                 2607 non-null   int64  
 13   Simulation             2607 non-null   int64  
 14   Sports                 2607 non-null   int

In [102]:
# Supongamos que tienes tus datos en un DataFrame llamado "df"
# Eliminar la columna "price" para utilizarla como variable objetivo para la predicción
X = data_predic.drop(columns=['real_price'])
y = data_predic['real_price']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el modelo de regresión lineal múltiple
model = LinearRegression()


# Entrenar el modelo con los datos de entrenamiento
model.fit(X_train, y_train)

# Realizar predicciones en el conjunto de prueba
y_pred = model.predict(X_test)



In [103]:
# Calcular el MSE
mse = mean_squared_error(y_test, y_pred)

# Calcular el RMSE
rmse = np.sqrt(mse)

# Mostrar el RMSE
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 10.00525350733729


In [104]:
# Guardar el modelo en un archivo pickle
with open('predic_jesu.pkl', 'wb') as file:
    pickle.dump(model, file)

In [105]:
# Calcular las diferencias (residuos)
diferencias = y_test - y_pred

# Crear un DataFrame para visualizar las diferencias junto con las predicciones y los valores reales
resultados = pd.DataFrame({'Precio Real': y_test, 'Predicción': y_pred, 'Diferencia': diferencias})

# Mostrar los ejemplos de 20 a 30
print(resultados.iloc[10:20])

       Precio Real  Predicción  Diferencia
28261        19.99   18.070071    1.919929
119           4.99   17.502365  -12.512365
1885         14.99   14.168782    0.821218
845          19.99   16.951769    3.038231
29210         9.99   17.536007   -7.546007
21308        14.99   17.778254   -2.788254
3475         12.99   13.152840   -0.162840
27988         9.99   29.953774  -19.963774
31306         9.99    9.703079    0.286921
171           9.99   12.973408   -2.983408
