# Model training

In [170]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [171]:
df = pd.read_excel("data/dataset_cleaned.xlsx")
df

In [None]:
df.prcp.isnull().sum()

1282

In [None]:
df.Cantidad_Parque_Vehicular.value_counts()

Cantidad_Parque_Vehicular
621291     1728
419442     1503
569102     1500
449918     1441
487157     1368
           ... 
2695457       1
21213         1
2573494       1
17018         1
3063704       1
Name: count, Length: 61, dtype: int64

In [None]:
# groupby_columns = ['Fe_Ocurrencia','De_Causa_Siniestro', 'Cd_Sexo', 'Relacion_Reclamante', 'In_Lista_Negra',
#                    'Marca', 'Modelo', 'Año', 'Clase',
#                    'Tipo_Dia', 'dia_feriado', 'latitude', #'Cantidad_Parque_Vehicular', 
#                    'longitude', 'PROVINCIA', 'Clima_Clear', 'Clima_Cloudy', 'Clima_Fair',
#                    'Clima_Fog', 'Clima_Rain', 'Clima_Thunderstorm', 'tavg', 'prcp', 'wdir',
#                    'wspd', 'pres', 'month_sin', 'month_cos', 'day_sin', 'day_cos']

# # Get the frequency of occurrences for each group
# frequency_data = df.groupby(groupby_columns).size().reset_index(name='frequency')
# frequency_data

In [None]:
df[['longitude','latitude','tavg','dia_feriado','Clima_Fog','Clima_Thunderstorm','pres','prcp']].dtypes

longitude             float64
latitude              float64
tavg                  float64
dia_feriado             int64
Clima_Fog               int64
Clima_Thunderstorm      int64
pres                  float64
prcp                   object
dtype: object

In [None]:
# Group in a weekly format
groupby_columns = ['PROVINCIA','ano','month','week','De_Causa_Siniestro',
                   'Marca', 'Modelo', 'Año', 'Clase']

def mode(x):
    '''Get the mode of the specified column.'''
    return stats.mode(x)[0]

tranformations = {  'latitude':'median',
                    'longitude':'median',
                    'tavg':'median',
                    'prcp':'median',
                    'wdir':'median',
                    'wspd':'median',
                    'pres':'median',
                    'dia_feriado':mode,
                    'Clima_Clear':mode,
                    'Clima_Cloudy':mode,
                    'Clima_Fair':mode,
                    'Clima_Fog':mode,
                    'Clima_Rain':mode,
                    'Clima_Thunderstorm':mode
}

# Get the frequency of occurrences for each group
frequency_data = df.groupby(groupby_columns).agg(tranformations).reset_index()
frequency_data

In [None]:
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

frequency_data = encode(frequency_data, 'month', 12)
frequency_data = encode(frequency_data, 'week', 5)

In [151]:
frequency_data.De_Causa_Siniestro.value_counts()

De_Causa_Siniestro
Accidente/Choque                            7083
Rotura de Vidrios y/o Parabrisas            2235
Responsabilidad Civil Personas o Cosas      1708
Robo o Hurto Accesorios/ Partes o Piezas     329
No Registrado                                196
Daños Maliciosos al Vehículo/APOV             95
Hechos de la Naturaleza                       50
Robo o Hurto  Vehiculo                        21
Name: count, dtype: int64

In [141]:
frequency_data.dtypes

De_Causa_Siniestro     object
Tipo_Dia               object
dia_feriado             int64
PROVINCIA              object
Clima_Clear             int64
Clima_Cloudy            int64
Clima_Fair              int64
Clima_Fog               int64
Clima_Rain              int64
Clima_Thunderstorm      int64
tavg                  float64
prcp                   object
wdir                  float64
wspd                  float64
pres                  float64
month_sin             float64
month_cos             float64
day_sin               float64
day_cos               float64
ano                     int64
month                   int64
day                     int64
frequency               int64
dtype: object

### LabelEncoder for categorical variables and test train split

In [None]:
scaler = StandardScaler()
scaler.fit(X)
x = scaler.transform(X)
x.shape

In [146]:
# Split the data into training and test sets
X = frequency_data.drop('frequency', axis=1)
y = frequency_data['frequency']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

# Define the numerical and categorical columns
num_columns = X.select_dtypes(include=['float64', 'int64']).columns
cat_columns = X.select_dtypes(include=['object']).columns

# Define the numerical and categorical transformers
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

for col in X_train.columns:
    if X_train[col].dtype == object:
        X_train[col] = X_train[col].astype(str)

mixed_type_columns = [col for col in X_train.columns if len(X_train[col].apply(type).value_counts()) > 1]
col_transformer = make_column_transformer(
    (num_transformer, num_columns),
    (cat_transformer, cat_columns),
    remainder='passthrough'
)

# Fit and transform the training data
X_train_processed = col_transformer.fit_transform(X_train)

# Transform the test data
X_test_processed = col_transformer.transform(X_test)

print(X_train_processed.shape)

(8201, 319)


In [147]:
print(X_train.dtypes)

De_Causa_Siniestro     object
Tipo_Dia               object
dia_feriado             int64
PROVINCIA              object
Clima_Clear             int64
Clima_Cloudy            int64
Clima_Fair              int64
Clima_Fog               int64
Clima_Rain              int64
Clima_Thunderstorm      int64
tavg                  float64
prcp                   object
wdir                  float64
wspd                  float64
pres                  float64
month_sin             float64
month_cos             float64
day_sin               float64
day_cos               float64
ano                     int64
month                   int64
day                     int64
dtype: object


## Training

In [148]:
# Lista de modelos de regresión
models = []
models.append(('LR', LinearRegression()))
models.append(('Ridge', Ridge()))
models.append(('Lasso', Lasso()))
models.append(('ElasticNet', ElasticNet()))
models.append(('SVR', SVR()))
models.append(('DecisionTree', DecisionTreeRegressor()))
models.append(('ExtraTrees', ExtraTreesRegressor()))
models.append(('AdaBoost', AdaBoostRegressor()))
models.append(('Bagging', BaggingRegressor()))
models.append(('BayesianRidge', BayesianRidge()))
models.append(('GradientBoosting', GradientBoostingRegressor()))
models.append(('RandomForest', RandomForestRegressor()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('XGBoost', XGBRegressor()))
models.append(('LightGBM', LGBMRegressor()))
models.append(('CatBoost', CatBoostRegressor()))

In [149]:
# Guardamos los resultados del modelo
resultados = []
nombres = []
scoring = 'neg_root_mean_squared_error'
# como métrica de puntuación en Scikit-learn, estás expresando tu deseo de maximizar el valor negativo del MSE, lo que equivale a minimizar el MSE real.

for nombre, model in models:
    kfold = KFold(n_splits=10, random_state=None)
    cv_resultados = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    resultados.append(cv_resultados)
    nombres.append(nombre)
    msg = ('{}: {} ({})'.format(nombre, cv_resultados.mean(), cv_resultados.std()))
    print(msg)

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\linear_model\_base.py", line 678, in fit
    X, y = self._validate_data(
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\pandas\core\generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'Daños Maliciosos al Vehículo/APOV'

--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\linear_model\_base.py", line 678, in fit
    X, y = self._validate_data(
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "c:\Users\Aneur\Miniconda3\lib\site-packages\pandas\core\generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'Responsabilidad Civil Personas o Cosas'


In [None]:
print(X_train.dtypes)


Fe_Ocurrencia         datetime64[ns]
De_Causa_Siniestro           float64
Tipo_Dia                     float64
dia_feriado                  float64
PROVINCIA                    float64
Clima_Clear                  float64
Clima_Cloudy                 float64
Clima_Fair                   float64
Clima_Fog                    float64
Clima_Rain                   float64
Clima_Thunderstorm           float64
tavg                         float64
prcp                         float64
wdir                         float64
wspd                         float64
pres                         float64
month_sin                    float64
month_cos                    float64
day_sin                      float64
day_cos                      float64
dtype: object


In [None]:
#  Graficamos para tener mejor ilustracion
fig = plt.figure()
fig.suptitle('Comparación de algoritmos')
ax = fig.add_subplot(111)
plt.boxplot(resultados)
ax.set_xticklabels(nombres)
plt.show()

Lime, Shap values, impulso-respuesta