In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder


In [2]:
# 1. Cargar el CSV
df = pd.read_csv('tesla_coches_combinados.csv')


In [3]:


# Verificar tipo inicial de la columna
print("Tipo inicial de la columna 'Precio (€)':", df['Precio (€)'].dtypes)

# Identificar valores no numéricos
no_numericos = df[~df['Precio (€)'].astype(str).str.replace(',', '').str.replace('.', '').str.isdigit()]
if not no_numericos.empty:
    print("Valores no numéricos detectados en 'Precio (€)':")
    print(no_numericos)

# Limpiar la columna 'Precio (€)'
df['Precio (€)'] = df['Precio (€)'].str.replace(',', '').str.strip()

# Convertir a tipo numérico
df['Precio (€)'] = pd.to_numeric(df['Precio (€)'], errors='coerce')

# Eliminar filas con valores NaN después de la conversión
df = df.dropna(subset=['Precio (€)'])

# Verificar el tipo final de la columna
print("Tipo final de la columna 'Precio (€)':", df['Precio (€)'].dtypes)

# Ver los primeros valores limpios
print(df['Precio (€)'].head())


Tipo inicial de la columna 'Precio (€)': object
Tipo final de la columna 'Precio (€)': int64
0    29100
1    28600
2    28400
3    33500
4    34200
Name: Precio (€), dtype: int64


In [4]:

# Limpieza de la columna 'Precio (€)'
df['Precio (€)'] = df['Precio (€)'].astype(str).str.replace(',', '').str.strip()
df['Precio (€)'] = pd.to_numeric(df['Precio (€)'], errors='coerce')
df = df.dropna(subset=['Precio (€)'])

# Limpieza de la columna 'Kilometraje (kms)'
df['Kilometraje (kms)'] = df['Kilometraje (kms)'].astype(str).str.replace(',', '').str.strip()
df['Kilometraje (kms)'] = pd.to_numeric(df['Kilometraje (kms)'], errors='coerce')
df = df.dropna(subset=['Kilometraje (kms)'])

# Convertir variables categóricas en numéricas
le_modelo = LabelEncoder()
le_color = LabelEncoder()
le_pais = LabelEncoder()

df['Modelo'] = le_modelo.fit_transform(df['Modelo'])
df['Color'] = le_color.fit_transform(df['Color'])
df['País'] = le_pais.fit_transform(df['País'])

# Separar características (X) y variable objetivo (y)
X = df.drop(columns=['Precio (€)'])
y = df['Precio (€)']

# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definir el modelo base
model = RandomForestRegressor(random_state=42)

# Definir los hiperparámetros a explorar
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Configurar GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Usamos MAE como métrica
    cv=3,  # Validación cruzada con 3 particiones
    verbose=2,
    n_jobs=-1  # Usar todos los núcleos disponibles
)

# Entrenar el modelo con GridSearchCV
grid_search.fit(X_train, y_train)

# Mejor modelo encontrado
best_model = grid_search.best_estimator_
print("Mejores hiperparámetros encontrados:", grid_search.best_params_)

# Evaluar el mejor modelo
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE (Error Absoluto Medio): {mae}")
print(f"R2 (Coeficiente de Determinación): {r2}")

# Guardar predicciones
df_test = X_test.copy()
df_test['Precio Real (€)'] = y_test
df_test['Precio Predicho (€)'] = y_pred
df_test.to_csv('tesla_predicciones_grid_rf.csv', index=False)
print("Predicciones guardadas en 'tesla_predicciones_grid_rf.csv'.")


Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n

324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
233 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/pedrohd/miniconda3/envs/iabd_scraping_env/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/pedrohd/miniconda3/envs/iabd_scraping_env/lib/python3.11/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/pedrohd/miniconda3/envs/iabd_scraping_env/lib/python3.11/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/pedrohd/miniconda3/envs/iabd_scrap

Mejores hiperparámetros encontrados: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
MAE (Error Absoluto Medio): 2408.7217815251415
R2 (Coeficiente de Determinación): 0.9798104437656016
Predicciones guardadas en 'tesla_predicciones_grid_rf.csv'.
