# <center> Modelamiento LASSO <center>
$$\text{minimizar  }\frac{1}{2 n_{samples}} ||y - Xw||^2_2 + \alpha * ||w||_1$$

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML
# !(pip install chainladder)
import chainladder as cl # librería para realizar triángulos de pérdidas

#cargamos los modelos lineales con LASSO
from sklearn import linear_model
from sklearn.model_selection import train_test_split #separamos train de test
from sklearn.metrics import mean_squared_error #halla# <center> LASSO <center>rmos el mse con los datos de testeo$$\text{minimizar  }\frac{1}{2 n_{samples}} ||y - Xw||^2_2 + \alpha * ||w||_1$$


print("pandas: " + pd.__version__)
print("numpy: " + np.__version__)
print("chainladder: " + cl.__version__)
# print("sklearn.linear_model: " + sklearn.__version__)

pandas: 1.4.2
numpy: 1.22.0
chainladder: 0.8.18


## Ingreso de los Datos

In [2]:
data=pd.read_csv('wkcomp_pos.csv') # cargar o abrir un csv
display(data.head())

Unnamed: 0,GRCODE,GRNAME,AccidentYear,DevelopmentYear,DevelopmentLag,IncurLoss_D,CumPaidLoss_D,BulkLoss_D,EarnedPremDIR_D,EarnedPremCeded_D,EarnedPremNet_D,Single,PostedReserve97_D
0,86,Allstate Ins Co Grp,1988,1988,1,367404,70571,127737,400699,5957,394742,0,281872
1,86,Allstate Ins Co Grp,1988,1989,2,362988,155905,60173,400699,5957,394742,0,281872
2,86,Allstate Ins Co Grp,1988,1990,3,347288,220744,27763,400699,5957,394742,0,281872
3,86,Allstate Ins Co Grp,1988,1991,4,330648,251595,15280,400699,5957,394742,0,281872
4,86,Allstate Ins Co Grp,1988,1992,5,354690,274156,27689,400699,5957,394742,0,281872


Creamos un triángulo con los datos de entrada, para el análisis vamos a tomar únicamente las variables de pérdida

In [3]:
triangle = cl.Triangle(
    data,
    origin="AccidentYear", # filas
    development="DevelopmentYear", # columnas
    columns=data.columns[5:], # tomamos en cuenta solamente las columnas de ganancias y pérdidas
    index=["GRNAME"], # nombres de las compañías
    cumulative=False, # triángulo de pérdidas acumulado o no
)
triangle #objeto de pandas

Unnamed: 0,Triangle Summary
Valuation:,2006-12
Grain:,OYDY
Shape:,"(132, 8, 19, 19)"
Index:,[GRNAME]
Columns:,"[IncurLoss_D, CumPaidLoss_D, BulkLoss_D, EarnedPremDIR_D, EarnedPremCeded_D, EarnedPremNet_D, Single, PostedReserve97_D]"


## Cálculo del triángulo total

Inicialmente, trabajaremos con el triángulo total para observar el comportamiento global de las pérdidas acumuladas.

In [4]:
total_cum=triangle['CumPaidLoss_D'].sum()
print('age-to-age factors:')
ratios=total_cum.link_ratio
display(ratios.heatmap(cmap='Reds'))

age-to-age factors:


Unnamed: 0,12-24,24-36,36-48,48-60,60-72,72-84,84-96,96-108,108-120,120-132,132-144,144-156,156-168,168-180,180-192,192-204,204-216,216-228
1988,2.2342,1.3548,1.1517,1.0883,1.045,1.0323,1.0232,1.027,1.0102,,,,,,,,,
1989,2.2233,1.3404,1.1621,1.083,1.0489,1.0326,1.0333,1.0133,1.0122,,,,,,,,,
1990,2.3662,1.3428,1.1493,1.077,1.0448,1.0403,1.0192,1.0163,1.0099,,,,,,,,,
1991,2.2847,1.3271,1.1482,1.0807,1.057,1.024,1.0229,1.0127,1.0093,,,,,,,,,
1992,2.2783,1.2989,1.1556,1.0929,1.0366,1.0317,1.0211,1.0142,1.0127,,,,,,,,,
1993,2.1713,1.2902,1.1607,1.0667,1.0472,1.0277,1.0186,1.0155,1.0114,,,,,,,,,
1994,2.1375,1.3243,1.1219,1.0709,1.0414,1.0264,1.0207,1.0147,1.0145,,,,,,,,,
1995,2.2353,1.2518,1.1308,1.0701,1.0396,1.0279,1.0168,1.0142,1.0139,,,,,,,,,
1996,1.9294,1.2743,1.1269,1.0702,1.0456,1.0305,1.0173,1.0165,1.0111,,,,,,,,,
1997,2.1487,1.2874,1.1337,1.0711,1.0432,1.0263,1.0209,1.0112,1.0116,,,,,,,,,


## Preparación de los datos para ingresarlos al modelo LASSO

In [17]:
#separamos los datos 
total_cum_list=ratios.values[0][0]#[0]
# total_cum_list=total_cum_list.reshape(-1)
np.shape(total_cum_list)
total_cum.origin

PeriodIndex(['1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
             '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003',
             '2004', '2005', '2006'],
            dtype='period[A-DEC]', name='origin')

In [21]:
#requerimos que los valores de X de entrada de la variable independiente tengan el mismo número de columnas
max_dev=max(total_cum.development)
X=np.array([[i.year,j] for i in total_cum.origin[:-1] for j in total_cum.development[1:]])#range(12,max_dev+12,12)])
print(np.shape(X))

(324, 2)


In [7]:
# (np.where(np.isnan(total_cum_list), total_cum_list, 0))
total_cum_list2=np.where(np.isnan(total_cum_list),0,total_cum_list)
total_cum_list2=total_cum_list2.reshape(-1)#[0]
# X[:19,:]
# X=X.reshape((19,19,2))
#X[:,1])
# for i in range(12,max_dev+12,12):
#     print(i)
print(np.shape(total_cum_list2))
# [x for x in total_cum_list2 if x!=0]
# total_cum.development.loc[2:]
np.shape(X)

(324,)


(324, 2)

In [29]:
X_train, X_test, y_train, y_test=train_test_split(X,total_cum_list2)
np.shape(X_train)
np.shape(y_train)

(243,)

## Implementación del modelo

In [34]:
clf=linear_model.Lasso(alpha=0.1)
clf.fit(X_train,y_train)
# np.shape(total_cum_list2)
print(clf.coef_)
print(clf.intercept_)
clf.predict(X_test)

[-0.04428967 -0.00496492]
89.37049509947867


array([ 0.30978506,  0.31451988, -0.07511094,  0.74370556,  0.98202151,
        0.65512622,  0.20749468,  0.7574166 , -0.20797996,  0.58025791,
        0.87815285,  0.37409887,  0.10204774,  0.27023021,  0.38623164,
       -0.39884969,  0.49167857,  0.41523199,  0.46425649,  1.15918019,
        0.37567714,  0.42894303,  0.72683797,  0.0883367 ,  0.06091462,
        0.28394125,  1.0247329 , -0.04453231,  0.69783761,  0.163205  ,
       -0.02766472,  0.2122295 ,  0.16478328, -0.32555966,  0.67199381,
        0.55125756,  0.53438997,  0.10678257,  1.20346986,  0.68412657,
        0.4473889 ,  0.13262637, -0.23540204,  0.80486282,  0.72999452,
       -0.42942832,  0.86286353, -0.20640168, -0.08882199, -0.19111237,
        0.6398369 ,  0.12049361,  0.44896717,  0.89186389,  0.41996682,
        0.53754652,  0.10362602,  0.45952166,  0.69941589, -0.10095475,
       -0.14524442,  0.24122985, -0.029243  ,  0.02875771,  0.4046775 ,
        0.14633741,  0.98044323, -0.414139  ,  0.28551953, -0.32

In [12]:
# print('age-to-age factors:')
# display(pd.DataFrame(X).heatmap(cmap='Reds'))

array([[1988,   24],
       [1988,   36],
       [1988,   48],
       [1988,   60],
       [1988,   72],
       [1988,   84],
       [1988,   96],
       [1988,  108],
       [1988,  120],
       [1988,  132],
       [1988,  144],
       [1988,  156],
       [1988,  168],
       [1988,  180],
       [1988,  192],
       [1988,  204],
       [1988,  216],
       [1988,  228],
       [1989,   24],
       [1989,   36],
       [1989,   48],
       [1989,   60],
       [1989,   72],
       [1989,   84],
       [1989,   96],
       [1989,  108],
       [1989,  120],
       [1989,  132],
       [1989,  144],
       [1989,  156],
       [1989,  168],
       [1989,  180],
       [1989,  192],
       [1989,  204],
       [1989,  216],
       [1989,  228],
       [1990,   24],
       [1990,   36],
       [1990,   48],
       [1990,   60],
       [1990,   72],
       [1990,   84],
       [1990,   96],
       [1990,  108],
       [1990,  120],
       [1990,  132],
       [1990,  144],
       [1990,

In [61]:
prediction_table=pd.DataFrame(clf.predict(X).reshape(18,18),index=total_cum.origin[:-1],columns=total_cum.development[1:])
prediction_table.style.background_gradient(cmap='Reds')

development,24,36,48,60,72,84,96,108,120,132,144,156,168,180,192,204,216,228
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1988,1.20347,1.143891,1.084312,1.024733,0.965154,0.905575,0.845996,0.786417,0.726838,0.667259,0.60768,0.548101,0.488522,0.428943,0.369364,0.309785,0.250206,0.190627
1989,1.15918,1.099601,1.040022,0.980443,0.920864,0.861285,0.801706,0.742127,0.682548,0.622969,0.56339,0.503811,0.444232,0.384653,0.325074,0.265495,0.205916,0.146337
1990,1.114891,1.055312,0.995733,0.936154,0.876575,0.816996,0.757417,0.697838,0.638259,0.57868,0.519101,0.459522,0.399943,0.340364,0.280785,0.221206,0.161627,0.102048
1991,1.070601,1.011022,0.951443,0.891864,0.832285,0.772706,0.713127,0.653548,0.593969,0.53439,0.474811,0.415232,0.355653,0.296074,0.236495,0.176916,0.117337,0.057758
1992,1.026311,0.966732,0.907153,0.847574,0.787995,0.728416,0.668837,0.609258,0.549679,0.4901,0.430521,0.370942,0.311363,0.251784,0.192205,0.132626,0.073047,0.013468
1993,0.982022,0.922443,0.862864,0.803285,0.743706,0.684127,0.624548,0.564969,0.50539,0.445811,0.386232,0.326653,0.267074,0.207495,0.147916,0.088337,0.028758,-0.030821
1994,0.937732,0.878153,0.818574,0.758995,0.699416,0.639837,0.580258,0.520679,0.4611,0.401521,0.341942,0.282363,0.222784,0.163205,0.103626,0.044047,-0.015532,-0.075111
1995,0.893442,0.833863,0.774284,0.714705,0.655126,0.595547,0.535968,0.476389,0.41681,0.357231,0.297652,0.238073,0.178494,0.118915,0.059336,-0.000243,-0.059822,-0.119401
1996,0.849152,0.789574,0.729995,0.670416,0.610837,0.551258,0.491679,0.4321,0.372521,0.312942,0.253363,0.193784,0.134205,0.074626,0.015047,-0.044532,-0.104111,-0.16369
1997,0.804863,0.745284,0.685705,0.626126,0.566547,0.506968,0.447389,0.38781,0.328231,0.268652,0.209073,0.149494,0.089915,0.030336,-0.029243,-0.088822,-0.148401,-0.20798


In [62]:
y_pred=clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.16472598095591537


Se observa que los valores de los development factores $\lambda$ obtenidos mediante la técnica LASSO aumentan en una dirección en particular (hacia el origen y development iniciales). Esto se debe a que al hacer el modelamiento con LASSO la salida es la de un plano tridimensional de la forma:

$$ax+by+cz=d$$

Siendo $(a,b,c)$ el vector normal, la proyección (a,b) nos indica la dirección de mayor crecimiento de z en el mapa de calor realizado, qque como deciamos antes es hacia la esquina superior izquierda.