# QUIZ 2 - Ciencia de Datos Aplicada
Juan Felipe Palacios - 201616389

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Load the insurance dataset directly from the course repository on GitHub.
df_insurance = pd.read_csv(
    'https://raw.githubusercontent.com/jufepalacios/Ciencia_Datos_Aplicada/main/Quiz_2/insurance.csv'
)

In [3]:
# Preview the first five rows to check the columns and their value formats.
df_insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Dimensions of the loaded dataset as (rows, columns).
df_insurance.shape

(1338, 7)

In [5]:
# Remove rows that are exact duplicates (the first occurrence is kept),
# then display the shape again to see how many rows were dropped.
df_insurance = df_insurance.drop_duplicates(keep='first')
df_insurance.shape

(1337, 7)

In [6]:
# List the distinct values of 'sex' to decide how to encode the column.
df_insurance['sex'].unique()

array(['female', 'male'], dtype=object)

In [7]:
# List the distinct values of 'smoker' to decide how to encode the column.
df_insurance['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [8]:
# List the distinct values of 'region' to decide how to encode the column.
df_insurance['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [9]:
def change_sex(row):
    """Encode a single 'sex' value as a binary integer.

    Parameters
    ----------
    row : object
        One cell value of the 'sex' column (despite the name, not a whole row).

    Returns
    -------
    int
        1 when the value equals 'female', 0 for anything else.
    """
    return 1 if row == 'female' else 0

def change_smoker(row):
    """Encode a single 'smoker' value as a binary integer.

    Parameters
    ----------
    row : object
        One cell value of the 'smoker' column (despite the name, not a whole row).

    Returns
    -------
    int
        0 when the value equals 'no', 1 for anything else.
    """
    return 0 if row == 'no' else 1

In [10]:
# Replace the two binary categorical columns with their 0/1 encodings.
# NOTE: this cell is not idempotent. Running it a second time on the already
# encoded frame sends every value through the `else` branch of the helpers
# (sex becomes all 0, smoker becomes all 1), so re-run from the load cell instead.
df_insurance = df_insurance.assign(
    sex=df_insurance['sex'].apply(change_sex),
    smoker=df_insurance['smoker'].apply(change_smoker),
)
df_insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,southwest,16884.924
1,18,0,33.77,1,0,southeast,1725.5523
2,28,0,33.0,3,0,southeast,4449.462
3,33,0,22.705,0,0,northwest,21984.47061
4,32,0,28.88,0,0,northwest,3866.8552


In [11]:
# One-hot encode 'region': build one indicator column per region and swap
# them in for the original text column (indicators are appended at the end).
# NOTE(review): all four indicators are kept, so together they are perfectly
# collinear with the model intercept; consider drop_first=True if the
# individual region coefficients need to be interpreted.
df_regiones = pd.get_dummies(df_insurance['region'])
df_insurance = df_insurance.drop(columns='region').join(df_regiones)

In [12]:
# Preview the fully encoded frame (binary columns plus region indicators).
df_insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,charges,northeast,northwest,southeast,southwest
0,19,1,27.9,0,1,16884.924,0,0,0,1
1,18,0,33.77,1,0,1725.5523,0,0,1,0
2,28,0,33.0,3,0,4449.462,0,0,1,0
3,33,0,22.705,0,0,21984.47061,0,1,0,0
4,32,0,28.88,0,0,3866.8552,0,1,0,0


In [13]:
# Separate the target ('charges') from the predictors (every other column).
Y = df_insurance['charges']
X = df_insurance.drop(columns=['charges'])

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [15]:
# Standardize the features. The scaler is fitted on the training split only,
# and the fitted scaler is then applied to both splits so the test data
# never influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Baseline: plain linear regression (no regularization) on the standardized
# training features. fit() returns the estimator itself, so `model` and
# `lin_reg` refer to the same fitted object.
model = lin_reg = LinearRegression().fit(X_train_scaled, Y_train)

# Show the fitted parameters; coefficients follow the column order of X.
print('intercept: {}'.format(model.intercept_))
print('coefficients: {}'.format(model.coef_))

intercept: 13631.418931682883
coefficients: [3579.52257131  260.80993236 2324.12976783  694.23180037 9784.79290277
  296.8140035    92.84819927 -297.93896863  -82.54735287]


In [17]:
# Predict on both splits with the baseline model so train and test errors
# can be compared.
preds_train, preds_test = (
    model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

In [18]:
# First printed line: MAE as (train, test). Second line: RMSE as (train, test).
_eval_pairs = ((Y_train, preds_train), (Y_test, preds_test))
print(*(mean_absolute_error(y_true, y_hat) for y_true, y_hat in _eval_pairs))
print(*(np.sqrt(mean_squared_error(y_true, y_hat)) for y_true, y_hat in _eval_pairs))


4362.80036112836 3924.636038021783
6145.32833487281 5689.308291189106


In [19]:
# Summary statistics of the target; these give a reference scale for judging
# the MAE/RMSE values printed above.
df_insurance['charges'].describe()

count     1337.000000
mean     13279.121487
std      12110.359656
min       1121.873900
25%       4746.344000
50%       9386.161300
75%      16657.717450
max      63770.428010
Name: charges, dtype: float64

Entrene un primer modelo de regresión lineal sin aplicar ningún tipo de regularización. Evalúe dicho modelo y
concluya:

1.   ¿Es aceptable el error obtenido?
2.   ¿Hay evidencia de overfitting?

1. Los errores MAE y RMSE para el conjunto de datos de prueba son de $3924$ y $5689$ dólares, respectivamente. Comparados con la media ($13279$) y la desviación estándar ($12110$) de `charges`, se considera que los errores obtenidos son bajos y que el modelo obtenido es aceptable.

2. Como los errores obtenidos para el conjunto de datos de prueba son menores que los de entrenamiento, se concluye que en este caso no hay evidencia de overfitting.

Aplique una transformación polinomial a los datos de entrada y regularización Ridge o Lasso al modelo de
regresión. Pruebe con al menos 2 grados diferentes del polinomio y con al menos 3 valores de alpha para la
regularización. Evalúe dichos modelos y concluya:


In [20]:
# Polynomial expansions (degree 2 and degree 3) of the standardized features.
# include_bias=False leaves out the constant all-ones column.
poly_features_2 = PolynomialFeatures(degree=2, include_bias=False)
poly_features_3 = PolynomialFeatures(degree=3, include_bias=False)

# Fit each transformer on the training split only.
X_poly_2_train = poly_features_2.fit_transform(X_train_scaled)
X_poly_3_train = poly_features_3.fit_transform(X_train_scaled)

# Apply the already-fitted transformers to the test split. Calling
# fit_transform here would refit on test data, which breaks the
# train/test separation (and leaks information for data-dependent transformers).
X_poly_2_test = poly_features_2.transform(X_test_scaled)
X_poly_3_test = poly_features_3.transform(X_test_scaled)

In [21]:
# Regularization strengths to compare.
alphas = [0.1, 0.3, 0.5, 0.7, 0.9]

# Poly 2: fit one Ridge model per alpha on the degree-2 features and report
# MAE / RMSE on both splits.
for alpha in alphas:
    print('Ridge para un alpha = {}'.format(alpha))
    # fit() returns the estimator itself, so ridge_reg is the fitted model.
    ridge_reg = Ridge(alpha=alpha, solver='cholesky').fit(X_poly_2_train, Y_train)

    preds_train, preds_test = (
        ridge_reg.predict(features) for features in (X_poly_2_train, X_poly_2_test)
    )

    mae_train = mean_absolute_error(Y_train, preds_train)
    mae_test = mean_absolute_error(Y_test, preds_test)
    rmse_train = np.sqrt(mean_squared_error(Y_train, preds_train))
    rmse_test = np.sqrt(mean_squared_error(Y_test, preds_test))

    print('MAE train = {}'.format(mae_train))
    print('MAE test = {}'.format(mae_test))
    print('RMSE train = {}'.format(rmse_train))
    print('RMSE test = {}'.format(rmse_test))
    print('\n')

Ridge para un alpha = 0.1
MAE train = 2977.3932244331936
MAE test = 2667.7969280340867
RMSE train = 4769.566080355489
RMSE test = 4666.116779306431


Ridge para un alpha = 0.3
MAE train = 2977.5238324126144
MAE test = 2667.919640965315
RMSE train = 4769.56646593429
RMSE test = 4665.9855212762395


Ridge para un alpha = 0.5
MAE train = 2977.65684172988
MAE test = 2668.042415997211
RMSE train = 4769.567236656331
RMSE test = 4665.854738997182


Ridge para un alpha = 0.7
MAE train = 2977.794249522499
MAE test = 2668.1652530269193
RMSE train = 4769.56839206766
RMSE test = 4665.7244319253505


Ridge para un alpha = 0.9
MAE train = 2977.9316551286765
MAE test = 2668.2917010789006
RMSE train = 4769.569931714896
RMSE test = 4665.594599517543




In [22]:
# Poly 3: same comparison as for degree 2, now on the degree-3 features.
# Relies on `alphas` defined in the previous cell.
for alpha in alphas:
    print('Ridge para un alpha = {}'.format(alpha))
    # fit() returns the estimator itself, so ridge_reg is the fitted model.
    ridge_reg = Ridge(alpha=alpha, solver='cholesky').fit(X_poly_3_train, Y_train)

    preds_train, preds_test = (
        ridge_reg.predict(features) for features in (X_poly_3_train, X_poly_3_test)
    )

    mae_train = mean_absolute_error(Y_train, preds_train)
    mae_test = mean_absolute_error(Y_test, preds_test)
    rmse_train = np.sqrt(mean_squared_error(Y_train, preds_train))
    rmse_test = np.sqrt(mean_squared_error(Y_test, preds_test))

    print('MAE train = {}'.format(mae_train))
    print('MAE test = {}'.format(mae_test))
    print('RMSE train = {}'.format(rmse_train))
    print('RMSE test = {}'.format(rmse_test))
    print('\n')

Ridge para un alpha = 0.1
MAE train = 2865.869896944559
MAE test = 2999.9348123878626
RMSE train = 4560.335138068824
RMSE test = 4932.484059650112


Ridge para un alpha = 0.3
MAE train = 2865.877811929538
MAE test = 2999.7581369064387
RMSE train = 4560.335278015868
RMSE test = 4932.2499576906


Ridge para un alpha = 0.5
MAE train = 2865.885745257692
MAE test = 2999.5815888431384
RMSE train = 4560.335557532631
RMSE test = 4932.016243256458


Ridge para un alpha = 0.7
MAE train = 2865.8936968776375
MAE test = 2999.4051680054345
RMSE train = 4560.335976226767
RMSE test = 4931.782915437062


Ridge para un alpha = 0.9
MAE train = 2865.9016667382184
MAE test = 2999.2288742013347
RMSE train = 4560.336533707375
RMSE test = 4931.549973324832




1. ¿Fue posible mejorar el error? ¿Qué hiper-parámetros tiene el modelo que produce el menor error?
2. ¿Qué atributos parecen ser los más importantes para realizar la predicción?

1. Después de la transformación y la regularización sí se obtuvieron menores errores. También se puede observar que, a medida que aumenta el alpha en la regularización, el RMSE de prueba disminuye ligeramente para una misma transformación polinomial, mientras que el MAE prácticamente no cambia.

  Por otra parte, para un mismo valor de alpha, se obtienen menores errores de prueba con una transformación polinomial de grado 2 que con una de grado 3.

  Finalmente, los hiperparámetros del modelo con menor error (RMSE de prueba) son: un polinomio de grado 2 y un alpha de 0.9 para la regularización.


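2. Los atributos que parecen ser los más importantes para realizar la predicción son: si fuma o no, la edad y el bmi (en orden de mayor a menor importancia, según la magnitud de los coeficientes del modelo lineal ajustado sobre los datos estandarizados).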