In [2]:
# Data Preparation and Defining X and y
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
import joblib

# ---------------------------------------------------------------------------
# Data preparation: load the listings, derive 'Age', fill numeric gaps and
# split the frame into features (X) and target (y).
# ---------------------------------------------------------------------------
DATA_PATH = 'clean_cochesdotcom_detail_pages_2024-05-13.csv'

# Reference date for the 'Age' feature. It mirrors the date embedded in the
# CSV file name, which is assumed to be the day the listings were scraped
# (TODO confirm). Anchoring to a fixed date instead of datetime.now() keeps
# the feature -- and therefore the fitted scaler and model -- identical no
# matter when the notebook is re-run.
SNAPSHOT_DATE = datetime(2024, 5, 13)

target = 'cash'
categorical_cols = ['make', 'model', 'Combustible', 'Cambio', 'Vendedor', 'Transmisión', 'Tracción', 'Carrocería']

# Rows sharing all of these values form one group for mean imputation.
group_keys = ['make', 'model', 'Cambio', 'Potencia (cv)']

# Load the CSV file into a DataFrame
df = pd.read_csv(DATA_PATH)

# 'Año' is stored as 'MM/YYYY'; parse it, turn it into an age in days
# relative to SNAPSHOT_DATE, then drop the raw column.
df['Año'] = pd.to_datetime(df['Año'], format='%m/%Y')
df['Age'] = (SNAPSHOT_DATE - df['Año']).dt.days
df = df.drop(columns=['Año'])

# Impute missing numeric feature values with the mean of their group.
#   * The target is deliberately excluded: filling a missing price with a
#     group average would fabricate training labels. Rows without a price
#     are removed by the dropna() below instead.
#   * The grouping keys are excluded as well: a key cannot be imputed from
#     groups that are defined by that very key.
#   * Only columns that actually contain gaps are touched, and the group
#     means are computed in one vectorised pass rather than per column.
numeric_candidates = df.select_dtypes(include=['int64', 'float64']).columns
cols_to_impute = [
    col for col in numeric_candidates
    if col != target and col not in group_keys and df[col].isna().any()
]
if cols_to_impute:
    group_means = df.groupby(group_keys)[cols_to_impute].transform('mean')
    df[cols_to_impute] = df[cols_to_impute].fillna(group_means)

# Whatever is still missing (no usable group mean, missing categorical
# value, missing grouping key, missing target) cannot be used for training.
df = df.dropna()

# Every remaining int64/float64 column except the target is a numeric feature.
numerical_cols = [
    col for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col != target
]

# Preparing features and target
X = df.drop(columns=[target])
y = df[target]

# Pipeline setup
# Fixed seed for the random forest so that re-running the notebook on the
# same data produces the same fitted model (bootstrap samples and feature
# sub-sampling are otherwise drawn differently on every run).
RANDOM_STATE = 42

# Numeric features: standardise, then keep as many principal components as
# are needed to explain 95% of the variance.
numerical_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95))
])

# Categorical features: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column list to its transformer. Columns of X that appear in
# neither list are not passed on to the model (ColumnTransformer's default).
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Preprocessing and regressor bundled together so the saved artifact accepts
# raw feature frames directly.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=200,
        min_samples_split=5,
        max_features='sqrt',
        max_depth=20,
        random_state=RANDOM_STATE,
    ))
])

# Train on the complete dataset (no hold-out split is made here) and persist
# the result in one step: Pipeline.fit returns the fitted pipeline itself, so
# it can be handed straight to joblib.
joblib.dump(full_pipeline.fit(X, y), 'model_rf.joblib')

# Confirmation for the notebook reader that the artifact was written.
print("Pipeline saved successfully.")


Pipeline saved successfully.


In [2]:
# Record which scikit-learn release is active in this kernel. The saved
# pipeline is a pickle of scikit-learn objects, so it is presumably meant to
# be loaded under the same version -- keep this output next to the artifact.
import sklearn

sklearn_version = sklearn.__version__
print("Scikit-learn version:", sklearn_version)


Scikit-learn version: 1.5.0
