In [2]:
# Data Preparation and Defining X and y
import pandas as pd
from datetime import datetime
import category_encoders as ce
import numpy as np
import matplotlib.pyplot as plt

# Load the CSV file into a DataFrame
df = pd.read_csv('clean_cochesdotcom_detail_pages_2024-05-13.csv')

# Convert 'Año' to datetime, specifying the format to correctly parse month and year
df['Año'] = pd.to_datetime(df['Año'], format='%m/%Y')

# Calculate 'Age' in days from 'Año' to the current date
df['Age'] = (datetime.now() - df['Año']).dt.days

# Drop the 'Año' column after calculating 'Age'
df = df.drop(['Año'], axis=1)

# Imputing missing values based on 'make', 'model', 'Cambio', 'Potencia'
for column in df.columns:
    if df[column].dtype == 'float64' or df[column].dtype == 'int64':
        # Impute numeric columns with the mean of the group
        df[column] = df.groupby(['make', 'model', 'Cambio', 'Potencia (cv)'])[column].transform(lambda x: x.fillna(x.mean()))
    elif df[column].dtype == 'bool':
        # Impute boolean columns with the mode of the group, or "Unknown" if mode is not available
        df[column] = df.groupby(['make', 'model', 'Cambio', 'Potencia (cv)'])[column].transform(lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else "Unknown"))

# Drop rows that still have missing values after the imputation
df = df.dropna()

# One-hot encoding, filling NaNs, etc. (Ensure 'cash' is not affected)
categorical_cols = ['Combustible', 'Cambio', 'Vendedor', 'Transmisión', 'Tracción', 'Carrocería']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

target = 'cash'
high_cardinality_features = ['make', 'model', 'Color']
encoder = ce.TargetEncoder(cols=high_cardinality_features)
df_encoded = encoder.fit_transform(df[high_cardinality_features], df[target])
df.drop(high_cardinality_features, axis=1, inplace=True)
df = pd.concat([df, df_encoded], axis=1)

# Preparing features and target
X = df.drop('cash', axis=1)
y = df['cash']


In [3]:
#Define Full Preprocessor with PCA and Standardization
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Pipeline setup
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
print("Numerical columns:", numerical_cols)

# Define transformers and pipeline
numerical_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols)
], remainder='passthrough')


Numerical columns: ['Kms (km)', 'Potencia (cv)', 'Maletero (L)', 'Anchura (cm)', 'Altura (cm)', 'Longitud (cm)', 'Puertas (puertas)', 'Plazas (plazas)', 'Depósito (L)', 'Peso máximo (kg)', 'Velocidad máxima (km/h)', 'Consumo mixto (L)', 'Consumo urbano (L)', 'Consumo extraurbano (L)', '0-100 km/h (s)', 'Autonomía', 'Emisiones de CO2 (gr/m3)', 'Cilindrada (cm3)', 'Marchas', 'Par máximo (nm)', 'Age', 'make', 'model', 'Color']


In [None]:
#Define Preprocessor Without PCA --> Skipping this one since the top performer is with PCA and StandardScaler (same for rest of prep.)
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Pipeline setup
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
print("Numerical columns:", numerical_cols)

#Define Preprocessor Without PCA
preprocessor_no_pca = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols)
], remainder='passthrough')


In [None]:
# Define preprocessor without Standardization
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

# Pipeline setup
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
print("Numerical columns:", numerical_cols)

# Define preprocessor without Standardization
preprocessor_no_standardization = ColumnTransformer(transformers=[
    ('num', PCA(n_components=0.95), numerical_cols)
], remainder='passthrough')


In [None]:
# Define preprocessor without both PCA and Standardization
preprocessor_no_both = 'passthrough'  # This uses no preprocessing on numerical columns

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import KFold

# Define a consistent KFold splitter
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_with_cv(preprocessor, X, y, params, cv=kf):
    model = RandomForestRegressor(**params)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    # Using cross_val_score for consistent evaluation
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2')
    mse_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_squared_error')
    return np.mean(scores), np.mean(-mse_scores)  # Convert MSE to positive

In [5]:
# RandomForest parameters
rf_params = {'n_estimators': 200, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 20}


In [6]:
r2_full, mse_full = evaluate_with_cv(preprocessor, X, y, rf_params)
print(f"Full Preprocessing - R2: {r2_full}, MSE: {-mse_full}")  # neg_mean_squared_error is negative


Full Preprocessing - R2: 0.7113412536358774, MSE: -113236066.80684654


In [None]:
#How would it look for rest of preprocessors
r2_no_pca, mse_no_pca = evaluate_with_cv(preprocessor_no_pca, X, y, rf_params)
print(f"No PCA - R2: {r2_no_pca}, MSE: {mse_no_pca}")
r2_no_standardization, mse_no_standardization = evaluate_with_cv(preprocessor_no_standardization, X, y, rf_params)
print(f"No Standardization - R2: {r2_no_standardization}, MSE: {mse_no_standardization}")
r2_no_both, mse_no_both = evaluate_with_cv(preprocessor_no_both, X, y, rf_params)
print(f"No PCA & No Standardization - R2: {r2_no_both}, MSE: {mse_no_both}")


In [None]:
#Tests for understanding PCA impact --> ran in ML_cochesdotcom_first.
import pandas as pd 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the CSV file into a DataFrame
df = pd.read_csv('clean_cochesdotcom_detail_pages_2024-05-13.csv')

if 'cash' in df.columns:

    # Convert 'Año' to datetime, specifying the format to correctly parse month and year
    df['Año'] = pd.to_datetime(df['Año'], format='%m/%Y')
    
    # Calculate 'Age' in days from 'Año' to the current date
    df['Age'] = (datetime.now() - df['Año']).dt.days
    
    # Drop the 'Año' column after calculating 'Age'
    df = df.drop(['Año'], axis=1)

    # One-hot encoding, filling NaNs, etc. (Ensure 'cash' is not affected)
    categorical_cols = ['Combustible', 'Cambio', 'Vendedor', 'Transmisión', 'Tracción', 'Carrocería']
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    target = 'cash'
    high_cardinality_features = ['make', 'model', 'Color']
    encoder = ce.TargetEncoder(cols=high_cardinality_features)
    df_encoded = encoder.fit_transform(df[high_cardinality_features], df[target])
    df.drop(high_cardinality_features, axis=1, inplace=True)
    df = pd.concat([df, df_encoded], axis=1)

    # Preparing features and target
    X = df.drop('cash', axis=1)
    y = df['cash']
    
    # Assuming 'X' is your feature matrix before applying the full pipeline
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    pca = PCA()
    X_pca = pca.fit_transform(X_scaled)
    
    # Plot the cumulative variance explained by all the components
    plt.figure(figsize=(10, 7))
    plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('Explained Variance by PCA Components')
    plt.grid(True)
    plt.show()

    # Get the loadings of the first few components
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    
    # Convert loadings into a DataFrame for better readability
    loading_matrix = pd.DataFrame(loadings, columns=[f'PC{i+1}' for i in range(len(pca.components_))], index=X.columns)
    print(loading_matrix)

    plt.figure(figsize=(10, 7))
    plt.plot(range(1, len(pca.explained_variance_) + 1), pca.explained_variance_, 'o-')
    plt.xlabel('Component Number')
    plt.ylabel('Eigenvalue (Explained Variance)')
    plt.title('Scree Plot')
    plt.grid(True)
    plt.show()



In [None]:
#Applying the different models and comparing performance (Random Forest and XGBoost)
import time
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate

# Models setup
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(objective='reg:squarederror', random_state=42)
}

# Evaluation function
def evaluate_models(models, X, y):
    results = []
    scoring = {'MSE': make_scorer(mean_squared_error, greater_is_better=False),
               'R2': 'r2'}
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for name, model in models.items():
        start_time = time.time()
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        scores = cross_validate(pipeline, X, y, cv=kf, scoring=scoring, return_train_score=False)
        duration = time.time() - start_time
        results.append({
            'Model': name,
            'MSE': -np.mean(scores['test_MSE']),  # Negate to make positive
            'R2': np.mean(scores['test_R2']),
            'Training Time (s)': duration
        })
    
    return pd.DataFrame(results)

# Run evaluation
results_df = evaluate_models(models, X, y)
print(results_df)

# Recommendation based on MSE and R2
best_model = results_df.sort_values(by=['MSE', 'R2'], ascending=[True, False]).iloc[0]
print(f"Recommended model based on MSE and R²: {best_model['Model']}")


In [None]:
#Applying the different models and comparing performance (NN)
import time
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate

# Models setup
models = {
    'MLPRegressor': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1500, alpha=0.001, learning_rate_init=0.005, solver='adam', random_state=42)
}

# Evaluation function
def evaluate_models(models, X, y):
    results = []
    scoring = {'MSE': make_scorer(mean_squared_error, greater_is_better=False),
               'R2': 'r2'}
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for name, model in models.items():
        start_time = time.time()
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        scores = cross_validate(pipeline, X, y, cv=kf, scoring=scoring, return_train_score=False)
        duration = time.time() - start_time
        results.append({
            'Model': name,
            'MSE': -np.mean(scores['test_MSE']),  # Negate to make positive
            'R2': np.mean(scores['test_R2']),
            'Training Time (s)': duration
        })
    
    return pd.DataFrame(results)

# Run evaluation
results_df = evaluate_models(models, X, y)
print(results_df)

# Recommendation based on MSE and R2
best_model = results_df.sort_values(by=['MSE', 'R2'], ascending=[True, False]).iloc[0]
print(f"Recommended model based on MSE and R²: {best_model['Model']}")


In [None]:
#Hyperparametritation optimization with Optuna library --> Best is trial 11 with value: 0.4925261281110004.{'hidden_layer_sizes': '100', 'max_iter': 1500, 'alpha': 0.00027907050619925316, 'learning_rate_init': 0.0038619320536359453}

from sklearn.neural_network import MLPRegressor
import optuna
from sklearn.model_selection import train_test_split

# Assuming 'X' and 'y' are your features and target data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    hidden_layer_config = trial.suggest_categorical('hidden_layer_sizes', ['50', '100', '50-50', '100-50'])
    hidden_layer_sizes = tuple(map(int, hidden_layer_config.split('-'))) if '-' in hidden_layer_config else (int(hidden_layer_config),)

    params = {
        'hidden_layer_sizes': hidden_layer_sizes,
        'max_iter': trial.suggest_categorical('max_iter', [1000, 1500, 2000]),
        'alpha': trial.suggest_float('alpha', 1e-4, 1e-2, log=True),
        'learning_rate_init': trial.suggest_float('learning_rate_init', 0.001, 0.1, log=True)
    }
    mlp = MLPRegressor(**params, random_state=42)
    mlp.fit(X_train, y_train)
    return mlp.score(X_val, y_val)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print(study.best_params)



In [None]:
# MLPRegressor with different activation functions (default is relu)
from sklearn.neural_network import MLPRegressor
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Assuming 'X' and 'y' are your features and target data already defined
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    # Suggesting hidden layer configurations as strings
    hidden_layer_config = trial.suggest_categorical('hidden_layer_sizes', ['50', '100', '50-50', '100-50'])
    # Converting string representation to tuple of integers
    hidden_layer_sizes = tuple(map(int, hidden_layer_config.split('-'))) if '-' in hidden_layer_config else (int(hidden_layer_config),)

    params = {
        'hidden_layer_sizes': hidden_layer_sizes,
        'max_iter': trial.suggest_categorical('max_iter', [1000, 1500, 2000]),
        'alpha': trial.suggest_float('alpha', 1e-4, 1e-2, log=True),
        'learning_rate_init': trial.suggest_float('learning_rate_init', 0.001, 0.1, log=True),
        #excluding default one ('relu')
        'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh'])
    }
    mlp = MLPRegressor(**params, random_state=42)
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_val)
    score = r2_score(y_val, y_pred)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print('Best trial:', study.best_trial.value)
print('Best parameters:', study.best_params)


In [None]:
#Hyperparametrization for RandomForest and XGBoost --> This piece of code is which show me which was the best configuration:           
#Model  Best Score (R2)  \
#0  RandomForest         0.699499   
#1       XGBoost         0.664087   

#                                     Best Parameters  Training Time (s)  
#0  {'n_estimators': 200, 'min_samples_split': 5, ...         238.308649  
#1  {'subsample': 0.8, 'n_estimators': 300, 'max_d...           7.115866  
#Recommended model based on R²: RandomForest with parameters {'n_estimators': 200, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 20}

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
import time

# Model setup
model_params = {
    'RandomForest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        }
    },
    'XGBoost': {
        'model': XGBRegressor(objective='reg:squarederror', random_state=42),
        'params': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7],
            'subsample': [0.7, 0.8, 0.9]
        }
    }
}

def evaluate_models(model_params, X, y):
    results = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for name, model_info in model_params.items():
        start_time = time.time()
        model_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', RandomizedSearchCV(model_info['model'], model_info['params'], n_iter=10, cv=kf, scoring='r2', n_jobs=-1, random_state=42))
        ])
        
        # Fit and evaluate the pipeline
        model_pipeline.fit(X, y)
        best_model = model_pipeline.named_steps['model'].best_estimator_
        best_score = model_pipeline.named_steps['model'].best_score_
        duration = time.time() - start_time
        
        results.append({
            'Model': name,
            'Best Score (R2)': best_score,
            'Best Parameters': model_pipeline.named_steps['model'].best_params_,
            'Training Time (s)': duration
        })
    
    return pd.DataFrame(results)
    
# Assuming 'X' and 'y' are your features and target data already loaded
results_df = evaluate_models(model_params, X, y)
print(results_df)

# To decide which model to use:
best_model = results_df.loc[results_df['Best Score (R2)'].idxmax()]
print(f"Recommended model based on R²: {best_model['Model']} with parameters {best_model['Best Parameters']}")


In [7]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [10]:
import joblib

# Assuming you have already created and trained 'pipeline' with your RandomForestRegressor
# Save the model to a file
if 'pipeline' in locals() or 'pipeline' in globals():
    joblib.dump(pipeline, 'model_rf.joblib')
    print("Pipeline saved successfully.")
else:
    print("Pipeline not defined.")


Pipeline not defined.


In [19]:
#creating JSON files for website dropdowns based on make and model
import pandas as pd
import json
import numpy as np

# Load the dataset
df = pd.read_csv('clean_cochesdotcom_detail_pages_2024-05-13.csv')

# Helper function to convert data types for JSON serialization
def convert_for_json(item):
    if isinstance(item, np.ndarray):
        return item.tolist()  # Convert numpy arrays to Python list
    elif isinstance(item, (np.int64, np.int32)):
        return int(item)  # Convert numpy integers to Python int
    elif isinstance(item, (np.float64, np.float32)):
        return float(item)  # Convert numpy floats to Python float
    return item  # Return item as is for other types

# Create dictionaries with converted types
makes = [convert_for_json(x) for x in df['make'].unique()]
models_by_make = {convert_for_json(k): [convert_for_json(i) for i in v] for k, v in df.groupby('make')['model'].unique().to_dict().items()}
combustible_by_make_model = {convert_for_json(k[0] + '_' + k[1]): [convert_for_json(i) for i in v] for k, v in df.groupby(['make', 'model'])['Combustible'].unique().to_dict().items()}

cambio_by_make_model = {convert_for_json(k[0] + '_' + k[1]): [convert_for_json(i) for i in v] for k, v in df.groupby(['make', 'model'])['Cambio'].unique().to_dict().items()}
plazas_by_make_model = {convert_for_json(k[0] + '_' + k[1]): [convert_for_json(i) for i in v] for k, v in df.groupby(['make', 'model'])['Plazas (plazas)'].unique().to_dict().items()}
potencia_by_make_model = {convert_for_json(k[0] + '_' + k[1]): [convert_for_json(i) for i in v] for k, v in df.groupby(['make', 'model'])['Potencia (cv)'].unique().to_dict().items()}

# Write to JSON files
with open('makes.json', 'w') as f:
    json.dump(makes, f)

with open('models_by_make.json', 'w') as f:
    json.dump(models_by_make, f)

with open('cambio_by_make_model.json', 'w') as f:
    json.dump(cambio_by_make_model, f)

with open('combustible_by_make_model.json', 'w') as f:
    json.dump(combustible_by_make_model, f)

with open('plazas_by_make_model.json', 'w') as f:
    json.dump(plazas_by_make_model, f)
    
with open('potencia_by_make_model.json', 'w') as f:
    json.dump(potencia_by_make_model, f)


In [2]:
#creating JSON files for website dropdowns with more connections 
import pandas as pd
import json
import numpy as np

# Load the dataset
df = pd.read_csv('clean_cochesdotcom_detail_pages_2024-05-13.csv')

# Helper function to convert data types for JSON serialization
def convert_for_json(item):
    if isinstance(item, np.ndarray):
        return item.tolist()  # Convert numpy arrays to Python list
    elif isinstance(item, (np.int64, np.int32)):
        return int(item)  # Convert numpy integers to Python int
    elif isinstance(item, (np.float64, np.float32)):
        return float(item)  # Convert numpy floats to Python float
    return item  # Return item as is for other types

# Extract unique colors
fuel = df['Combustible'].unique().tolist()
fuel = [convert_for_json(color) for color in fuel]

# Extract unique cambio
transmission = df['Cambio'].unique().tolist()
transmission = [convert_for_json(transmission) for color in transmission]

# Write fuel to JSON file
with open('fuel.json', 'w') as f:
    json.dump(fuel, f)

In [21]:
#creating JSON files for website dropdowns with more connections 
import pandas as pd
import json
import numpy as np

# Load the dataset
df = pd.read_csv('clean_cochesdotcom_detail_pages_2024-05-13.csv')

# Helper function to convert data types for JSON serialization
def convert_for_json(item):
    if isinstance(item, np.ndarray):
        return item.tolist()  # Convert numpy arrays to Python list
    elif isinstance(item, (np.int64, np.int32)):
        return int(item)  # Convert numpy integers to Python int
    elif isinstance(item, (np.float64, np.float32)):
        return float(item)  # Convert numpy floats to Python float
    return item  # Return item as is for other types

# Extract unique colors
colors = df['Color'].unique().tolist()
colors = [convert_for_json(color) for color in colors]

# Create dictionaries with converted types
makes = [convert_for_json(x) for x in df['make'].unique()]
models_by_make = {convert_for_json(k): [convert_for_json(i) for i in v] for k, v in df.groupby('make')['model'].unique().to_dict().items()}
combustible_by_make_model = {convert_for_json(k[0] + '_' + k[1]): [convert_for_json(i) for i in v] for k, v in df.groupby(['make', 'model'])['Combustible'].unique().to_dict().items()}

# Group by make, model and Combustible to get unique Cambio and Plazas (plazas)
cambio_by_make_model_combustible = {convert_for_json(k[0] + '_' + k[1] + '_' + k[2]): [convert_for_json(i) for i in v] for k, v in df.groupby(['make', 'model', 'Combustible'])['Cambio'].unique().to_dict().items()}
plazas_by_make_model_combustible = {convert_for_json(k[0] + '_' + k[1] + '_' + k[2]): [convert_for_json(i) for i in v] for k, v in df.groupby(['make', 'model', 'Combustible'])['Plazas (plazas)'].unique().to_dict().items()}

# Group by make, model, Combustible, Plazas (plazas), and Cambio to get unique Potencia (cv)
potencia_by_make_model_combustible_plazas_cambio = {convert_for_json('_'.join(map(str, k))): [convert_for_json(i) for i in v] for k, v in df.groupby(['make', 'model', 'Combustible', 'Plazas (plazas)', 'Cambio'])['Potencia (cv)'].unique().to_dict().items()}

# Group by required attributes and count occurrences
grouping_cols = ['make', 'model', 'Combustible', 'Plazas (plazas)', 'Potencia (cv)', 'Cambio']
attributes = df.columns.difference(['make', 'model', 'Combustible', 'Plazas (plazas)', 'Potencia (cv)', 'Cambio', 'cash', 'Kms', 'Año', 'Color'])
attribute_dict = {}

for attr in attributes:
    group = df.groupby(grouping_cols)[attr].apply(lambda x: x.mode()[0] if not x.empty else np.nan)
    count = df.groupby(grouping_cols)[attr].size()
    combined = group.reset_index().merge(count.reset_index(), on=grouping_cols)
    combined.columns = grouping_cols + [f'{attr}_by_make_model_combustible_plazas_cambio_potencia', 'count']
    attribute_dict[f'{attr}_by_make_model_combustible_plazas_cambio_potencia'] = combined.set_index(grouping_cols).to_dict(orient='index')


# Write to JSON files
with open('makes.json', 'w') as f:
    json.dump(makes, f)

with open('models_by_make.json', 'w') as f:
    json.dump(models_by_make, f)

with open('cambio_by_make_model_combustible.json', 'w') as f:
    json.dump(cambio_by_make_model_combustible, f)

with open('combustible_by_make_model.json', 'w') as f:
    json.dump(combustible_by_make_model, f)

with open('plazas_by_make_model_combustible.json', 'w') as f:
    json.dump(plazas_by_make_model_combustible, f)

with open('potencia_by_make_model_combustible_plazas_cambio.json', 'w') as f:
    json.dump(potencia_by_make_model_combustible_plazas_cambio, f)

# Write colors to JSON file
with open('colors.json', 'w') as f:
    json.dump(colors, f)

# Write each attributes dictionary to JSON file
for attr, data in attribute_dict.items():
    with open(f'{attr}.json', 'w') as f:
        json.dump(data, f)

FileNotFoundError: [Errno 2] No such file or directory: '0-100 km/h (s)_by_make_model_combustible_plazas_cambio_potencia.json'

In [25]:
import pandas as pd
import json
import numpy as np
import re

# Load the dataset
df = pd.read_csv('clean_cochesdotcom_detail_pages_2024-05-13.csv')

# Helper function to convert data types for JSON serialization
def convert_for_json(item):
    if isinstance(item, np.ndarray):
        return item.tolist()  # Convert numpy arrays to Python list
    elif isinstance(item, (np.int64, np.int32)):
        return int(item)  # Convert numpy integers to Python int
    elif isinstance(item, (np.float64, np.float32)):
        return float(item)  # Convert numpy floats to Python float
    return item  # Return item as is for other types

# Function to sanitize attribute names for file names
def sanitize_filename(name):
    return re.sub(r'[^\w\s-]', '', name).strip().replace(' ', '_').replace('-', '_')

# Define the grouping columns
grouping_cols = ['make', 'model', 'Combustible', 'Plazas (plazas)', 'Cambio', 'Potencia (cv)']
attributes = df.columns.difference(['make', 'model', 'Combustible', 'Plazas (plazas)', 'Cambio', 'Potencia (cv)', 'cash', 'Kms', 'Año', 'color'])
attribute_dict = {}

for attr in attributes:
    # Get the mode for each group
    mode = df.groupby(grouping_cols)[attr].apply(lambda x: x.mode()[0] if not x.empty else np.nan).reset_index(name='mode')

    # Prepare data for JSON serialization
    mode['group_key'] = mode[grouping_cols].apply(lambda x: '_'.join(x.map(str)), axis=1)
    mode_dict = dict(zip(mode['group_key'], mode['mode'].apply(convert_for_json)))

    attribute_dict[sanitize_filename(f'{attr}_by_make_model_combustible_plazas_cambio_potencia')] = mode_dict

# Write each attributes dictionary to JSON file
for attr, data in attribute_dict.items():
    with open(f'{attr}.json', 'w') as f:
        json.dump(data, f)
