In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge

In [None]:
# Seed NumPy's legacy global random generator so np.random draws repeat across runs.
SEED = 20201010
np.random.seed(SEED)

In [None]:
# Load the raw diabetes dataset into a DataFrame.
# The CSV location can be overridden with the DIABETES_CSV environment variable so the
# notebook can run on other machines; when the variable is unset the original
# absolute path is used unchanged.
import os

DIABETES_CSV = os.environ.get(
    "DIABETES_CSV",
    "/home/silil/Documents/itam/mineria_datos_licenciatura/data/diabetes_india/diabetes.csv",
)
diabetes = pd.read_csv(DIABETES_CSV)

In [None]:
# Preview the first rows of the freshly loaded frame.
diabetes.head()

In [None]:
# Lower-case every column label in place so columns can be reached as
# attributes such as `diabetes.insulin`.
diabetes.rename(columns=str.lower, inplace=True)

In [None]:
# Summary statistics of the frame, used to spot implausible values.
diabetes.describe()

In [None]:
# Dimensions of the subset of rows whose insulin reading is exactly zero
# (presumably placeholders for missing measurements -- TODO confirm).
diabetes.loc[diabetes["insulin"].eq(0)].shape

In [None]:
# Keep only the rows with strictly positive insulin and glucose readings.
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning or depend on view-versus-copy semantics.
diabetes_insulin = diabetes[(diabetes.insulin > 0) & (diabetes.glucose > 0)].copy()

In [None]:
# Rows and columns remaining after the positive insulin/glucose filter.
diabetes_insulin.shape

In [None]:
# Pairwise relationships between the columns of the filtered frame,
# with histograms on the diagonal.
sns.pairplot(diabetes_insulin, diag_kind="hist")

In [None]:
# Add the square root of glucose as a feature. NOTE: despite the "_2" suffix the
# column holds sqrt(glucose), not glucose squared.
# assign() binds a new frame instead of writing through .loc into a frame that was
# produced by boolean filtering, which avoids SettingWithCopyWarning and keeps the
# cell safe to re-run.
diabetes_insulin = diabetes_insulin.assign(glucose_2=np.sqrt(diabetes_insulin.glucose))

In [None]:
# Target: insulin, kept as a single-column DataFrame.
y = diabetes_insulin.loc[:, ['insulin']]
# Features: every column except the raw glucose reading and the target itself.
X = diabetes_insulin.drop(columns=['glucose', 'insulin'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
splits = train_test_split(X, y, test_size=0.3, random_state=111)
X_train, X_test, y_train, y_test = splits

# Report the shapes of each (features, target) pair.
for label, (features, target) in (
    ("\nX_train, y_train: ", (X_train, y_train)),
    ("\nX_test, y_test: ", (X_test, y_test)),
):
    print(label, (features.shape, target.shape))

### Ridge

In [None]:
from sklearn.linear_model import RidgeCV

In [None]:
# Candidate regularisation strengths to compare by cross-validation.
ridge_alphas = [0.001, 0.1, 1.0, 10.0, 100.0]
# Ridge regression with 5-fold CV; candidates are scored by negated RMSE.
rcv = RidgeCV(alphas=ridge_alphas, scoring="neg_root_mean_squared_error", cv=5)

In [None]:
# Fit the cross-validated Ridge model on the training split, passing the target
# as the 1-D insulin column.
m1 = rcv.fit(X_train, y_train["insulin"])

In [None]:
# Display the fitted estimator and its configuration.
m1

In [None]:
# Regularisation strength selected by cross-validation.
m1.alpha_

In [None]:
## RMSE
# best_score_ is reported on the "neg_root_mean_squared_error" scale requested when
# the RidgeCV was built, i.e. it is the negated RMSE. Flip the sign so the value
# shown matches the RMSE label.
-m1.best_score_

In [None]:
# Fitted coefficients, one per training feature.
m1.coef_

In [None]:
# Feature names, for matching against the coefficient array.
X.columns.values

In [None]:
# Pair each coefficient with the name of the feature it belongs to.
ridge_betas = m1.coef_.tolist()
ridge_feature_names = X.columns.values
feature_importances = pd.DataFrame(
    {'betas': ridge_betas, 'features': ridge_feature_names}
)

In [None]:
# Rank features by signed coefficient, largest first; strongly negative
# coefficients therefore appear at the bottom of the table.
feature_importances.sort_values(by="betas", ascending=False)

In [None]:
# Intercept term of the fitted model.
m1.intercept_

**Predicciones**

In [None]:
# Predict insulin for the held-out test features with the model currently bound to m1.
predictions = m1.predict(X_test)

**Métricas de desempeño**

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Mean absolute error on the test set.
# Arguments follow scikit-learn's (y_true, y_pred) order, and the target is passed
# as the 1-D insulin Series so its shape matches `predictions`.
mae = mean_absolute_error(y_test.insulin, predictions)
mae

In [None]:
# Root mean squared error on the test set.
# The `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4
# and later removed, so take the square root of the MSE explicitly; this works on
# every scikit-learn version. Arguments are in (y_true, y_pred) order.
rmse = np.sqrt(mean_squared_error(y_test.insulin, predictions))
rmse

**Residuales**

In [None]:
# Residual = observed insulin minus predicted insulin
# (a positive value means the model under-predicted).
residuals = y_test.insulin - predictions

In [None]:
# Two-column frame (residuals, predictions) indexed like the test target,
# ready for plotting.
residuals_df = residuals.to_frame("residuals").assign(predictions=predictions)

In [None]:
# Residuals against predictions; the red horizontal line marks zero error.
a = sns.scatterplot(data=residuals_df, x="predictions", y="residuals")
a.axhline(y=0, color="r")

**Q-Q plot**

In [None]:
import scipy.stats as stats
import pylab

In [None]:
# Normal Q-Q plot of the residuals, drawn through the pylab interface.
qq_sample = residuals_df["residuals"]
stats.probplot(qq_sample, dist="norm", plot=pylab)
pylab.show()

### Lasso

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
# Cross-validated Lasso with every setting left at scikit-learn's defaults.
lcv = LassoCV()

In [None]:
# Fit the cross-validated Lasso on the training split.
# NOTE: this rebinds `m1`, which previously held the Ridge model.
m1 = lcv.fit(X_train, y_train["insulin"])

In [None]:
# Regularisation strength selected by cross-validation.
m1.alpha_

In [None]:
# Number of solver iterations reported by the fitted estimator.
m1.n_iter_

In [None]:
# Fitted coefficients, one per training feature.
m1.coef_

In [None]:
# Pair each Lasso coefficient with the name of the feature it belongs to.
lasso_betas = m1.coef_.tolist()
lasso_feature_names = X_train.columns.values
feature_importances_lasso = pd.DataFrame(
    {'betas': lasso_betas, 'features': lasso_feature_names}
)

In [None]:
# Rank features by signed coefficient, largest first; strongly negative
# coefficients therefore appear at the bottom of the table.
feature_importances_lasso.sort_values(by="betas", ascending=False)

In [None]:
# Intercept term of the fitted model.
m1.intercept_

**Predicciones**

In [None]:
# Predict insulin for the held-out test features with the model currently bound to m1;
# this overwrites the predictions of the previous section.
predictions = m1.predict(X_test)

**Métricas de desempeño**

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Mean absolute error on the test set.
# Arguments follow scikit-learn's (y_true, y_pred) order, and the target is passed
# as the 1-D insulin Series so its shape matches `predictions`.
mae = mean_absolute_error(y_test.insulin, predictions)
mae

In [None]:
# Root mean squared error on the test set.
# The `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4
# and later removed, so take the square root of the MSE explicitly; this works on
# every scikit-learn version. Arguments are in (y_true, y_pred) order.
rmse = np.sqrt(mean_squared_error(y_test.insulin, predictions))
rmse

In [None]:
# Peek at the first observed test targets.
y_test.head()

**Residuales**

In [None]:
# Residual = observed insulin minus predicted insulin
# (a positive value means the model under-predicted).
residuals = y_test.insulin - predictions

In [None]:
# Two-column frame (residuals, predictions) indexed like the test target,
# ready for plotting.
residuals_df = residuals.to_frame("residuals").assign(predictions=predictions)

In [None]:
# Residuals against predictions; the red horizontal line marks zero error.
a = sns.scatterplot(data=residuals_df, x="predictions", y="residuals")
a.axhline(y=0, color="r")

**Q-Q plot**

In [None]:
# Normal Q-Q plot of the residuals, drawn through the pylab interface.
qq_sample = residuals_df["residuals"]
stats.probplot(qq_sample, dist="norm", plot=pylab)
pylab.show()

### ElasticNet

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
# Cross-validated ElasticNet with every setting left at scikit-learn's defaults.
ecv = ElasticNetCV()

In [None]:
# Fit the cross-validated ElasticNet on the training split.
# NOTE: this rebinds `m1`, which previously held the Lasso model.
m1 = ecv.fit(X_train, y_train["insulin"])

In [None]:
# Regularisation strength selected by cross-validation.
m1.alpha_

In [None]:
# Fitted coefficients, one per training feature.
m1.coef_

In [None]:
# Pair each ElasticNet coefficient with the name of the feature it belongs to.
# NOTE: this reuses the name `feature_importances`, replacing the Ridge table
# built earlier in the notebook.
enet_betas = m1.coef_.tolist()
enet_feature_names = X_train.columns.values
feature_importances = pd.DataFrame(
    {'betas': enet_betas, 'features': enet_feature_names}
)

In [None]:
# Rank features by signed coefficient, largest first; strongly negative
# coefficients therefore appear at the bottom of the table.
feature_importances.sort_values(by="betas", ascending=False)

In [None]:
# Intercept term of the fitted model.
m1.intercept_

**Predicciones**

In [None]:
# Predict insulin for the held-out test features with the model currently bound to m1;
# this overwrites the predictions of the previous section.
predictions = m1.predict(X_test)

**Métricas de desempeño**

In [None]:
# Mean absolute error on the test set, with arguments in scikit-learn's
# documented (y_true, y_pred) order.
mae = mean_absolute_error(y_test.insulin, predictions)
mae

In [None]:
# Root mean squared error on the test set.
# The `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4
# and later removed, so take the square root of the MSE explicitly; this works on
# every scikit-learn version. Arguments are in (y_true, y_pred) order.
rmse = np.sqrt(mean_squared_error(y_test.insulin, predictions))
rmse

**Residuales**

In [None]:
# Residual = observed insulin minus predicted insulin
# (a positive value means the model under-predicted).
residuals = y_test.insulin - predictions

In [None]:
# Two-column frame (residuals, predictions) indexed like the test target,
# ready for plotting.
residuals_df = residuals.to_frame("residuals").assign(predictions=predictions)

In [None]:
# Residuals against predictions; the red horizontal line marks zero error.
a = sns.scatterplot(data=residuals_df, x="predictions", y="residuals")
a.axhline(y=0, color="r")

**Q-Q plot**

In [None]:
# Normal Q-Q plot of the residuals, drawn through the pylab interface.
qq_sample = residuals_df["residuals"]
stats.probplot(qq_sample, dist="norm", plot=pylab)
pylab.show()