In [None]:
import os 
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, lasso_path, lars_path, enet_path
# Execute script.py inside the notebook's namespace. `Regressor`, used further
# down, is not imported above, so it is presumably defined there - TODO confirm.
%run script.py

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./data/train.csv')
data.head(n=5)

In [None]:
# Target: the 'critical_temp' column.
y = data.loc[:, 'critical_temp']
# Predictors: every column whose name contains the substring 'wtd'.
X = data[[name for name in data.columns if 'wtd' in name]]

In [None]:
# data.isnull().sum()
# data.info()
# data.describe()

In [None]:
# Absolute pairwise correlations between the predictors, drawn as a matrix image.
cor_mat = abs(X.corr())
plt.matshow(cor_mat)
plt.show()

In [None]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes
# the diagonal) so each feature pair is inspected exactly once.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper_tri = cor_mat.where(np.triu(np.ones(cor_mat.shape, dtype=bool), k=1))
# Flag every column whose absolute correlation with an earlier column exceeds 0.8.
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.8).any()]
# Reduced feature matrix with the flagged columns removed.
X1 = X.drop(columns=to_drop)

In [None]:
# Table of variance inflation factors: one row per column of X, ordered from
# the smallest VIF to the largest.
vif_data = pd.DataFrame(
    {
        "feature": X.columns,
        "VIF": [
            variance_inflation_factor(X.values, col_idx)
            for col_idx in range(X.shape[1])
        ],
    }
).sort_values(by="VIF")
vif_data

In [None]:
# First 19 rows of the VIF table (positional slice of the 'feature' column).
top_19 = vif_data["feature"].iloc[:19]
# Count how many of those features also appear in the correlation-based drop list.
sum(feature in to_drop for feature in top_19)

In [None]:
# Split the reduced feature matrix and target 80/20 without shuffling,
# then report the shape of each piece.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    test_size=0.2,
    shuffle=False,
)
print(f'X_train: {X_train.shape}\ny_train: {y_train.shape}')
print(f'X_test: {X_test.shape}\ny_test: {y_test.shape}')

In [None]:
# Fit an ordinary least-squares model on the training split and display its coefficients.
mlr_model = LinearRegression()
mlr_model.fit(X_train, y_train)
mlr_model.coef_

In [None]:
# y_pred = mlr_model.predict(X_test)
# rss = ((y_test - y_pred)**2).sum();rss
# tss = ((y_test - y_test.mean())**2).sum();tss
# ((tss-rss)/tss)*100

In [None]:
# Score the fitted OLS model on the held-out split.
mlr_model.score(X=X_test, y=y_test)

In [None]:
# Manual R^2 (as a percentage) of the OLS model on the test split.
# The original called `model.predict`, but no `model` is defined in the visible
# cells, so a fresh Restart & Run All would raise NameError (unless script.py
# happens to define one). The fitted OLS estimator here is `mlr_model`.
y_pred = mlr_model.predict(X_test)
rss = ((y_test - y_pred) ** 2).sum()         # residual sum of squares
tss = ((y_test - y_test.mean()) ** 2).sum()  # total sum of squares
((tss - rss) / tss) * 100

In [None]:
# Lasso with alpha=0.9: fit on the training split, then score on the test split.
lasso_model = Lasso(alpha=0.9).fit(X_train, y_train)
lasso_model.score(X=X_test, y=y_test)

In [None]:
# Ridge with alpha=0.8: fit on the training split, then score on the test split.
ridge_model = Ridge(alpha=0.8).fit(X_train, y_train)
ridge_model.score(X=X_test, y=y_test)

In [None]:
# Scratch check: 20 evenly spaced values from 1 to 20 (1.0, 2.0, ..., 20.0).
np.linspace(1,20,20)

In [None]:
# Compare the fitted coefficients of the OLS, Lasso and Ridge models on one axis.
plt.figure(figsize=(16, 9))
# One x position per coefficient (1..n). Deriving n from the fitted model
# replaces the hard-coded 20, which made plt.plot fail with a shape mismatch
# whenever the training matrix did not have exactly 20 columns.
xx = np.arange(1, len(mlr_model.coef_) + 1)
plt.plot(xx, mlr_model.coef_, label='OLS', color='red')
plt.plot(xx, lasso_model.coef_, label='Lasso')
plt.plot(xx, ridge_model.coef_, label='Ridge', linestyle='dashed', color='green')
plt.xlabel('')
plt.ylabel('Coefficient')
plt.title('')
plt.legend(loc='best')
plt.show()

In [None]:
# Compute the coefficient path with lasso_path on the training split.
alpha, coefs, _ = lasso_path(np.array(X_train), np.array(y_train))
# Transpose so each row is one step of the path and each column one feature.
coefs = pd.DataFrame(coefs.T, columns=X_train.columns)

# x-axis: L1 norm of the coefficients at each step, scaled by the last step's norm.
xx = np.abs(coefs.to_numpy()).sum(axis=1)
xx = xx / xx[-1]

plt.figure(figsize=(16, 9))
plt.plot(xx, coefs)
plt.legend(X_train.columns, bbox_to_anchor=(1, 1))
plt.title("LASSO Path")
plt.axis("tight")
plt.show()

In [None]:
# Compute the coefficient path with lars_path in "lasso" mode (verbose output on).
_, _, coefs = lars_path(
    np.array(X_train),
    np.array(y_train),
    method="lasso",
    verbose=True,
)
# Transpose so each row is one step of the path and each column one feature.
coefs = pd.DataFrame(coefs.T, columns=X_train.columns)

# x-axis: L1 norm of the coefficients at each step, scaled by the last step's norm.
xx = np.abs(coefs.to_numpy()).sum(axis=1)
xx = xx / xx[-1]

plt.figure(figsize=(16, 9))
plt.plot(xx, coefs)
plt.legend(X_train.columns, bbox_to_anchor=(1, 1))
# Dashed vertical guide at every step of the path, spanning the current y-limits.
ymin, ymax = plt.ylim()
plt.vlines(xx, ymin, ymax, linestyle="dashed")
plt.xlabel("|coef| / max|coef|")
plt.ylabel("Coefficients")
plt.title("LASSO Path")
plt.axis("tight")
plt.show()

In [None]:
# Compute the coefficient path with lars_path using its default method.
_, _, coefs = lars_path(
    np.array(X_train),
    np.array(y_train),
)
# Transpose so each row is one step of the path and each column one feature.
coefs = pd.DataFrame(coefs.T, columns=X_train.columns)

# x-axis: L1 norm of the coefficients at each step, scaled by the last step's norm.
xx = np.abs(coefs.to_numpy()).sum(axis=1)
xx = xx / xx[-1]

plt.figure(figsize=(16, 9))
plt.plot(xx, coefs)
plt.legend(X_train.columns, bbox_to_anchor=(1, 1))
# Dashed vertical guide at every step of the path, spanning the current y-limits.
ymin, ymax = plt.ylim()
plt.vlines(xx, ymin, ymax, linestyle="dashed")
plt.xlabel("|coef| / max|coef|")
plt.ylabel("Coefficients")
plt.title("LASSO LARs Path")
plt.axis("tight")
plt.show()

In [None]:
# Fit the custom Regressor with its batch `fit` and report R^2 on both splits.
# NOTE(review): `Regressor` is not defined in the visible cells; presumably it
# is provided by `%run script.py` - confirm.
print(f"X_train:{X_train.shape}\ny_train:{y_train.shape}")

batch_regressor = Regressor(normalize = True)
batch_regressor.fit(X_train,y_train)

# r2_score's signature is (y_true, y_pred) and the metric is not symmetric, so
# the ground truth must come first. The original passed the predictions first,
# which reports a different (incorrect) score.
train_score = r2_score(y_train, batch_regressor.predict(X_train))
test_score = r2_score(y_test, batch_regressor.predict(X_test))

print("Train Score:", train_score)
print("Test Score: ",test_score)
batch_regressor.plot()

In [None]:
# Fit the custom Regressor with its `seq_fit` method and report R^2 on both splits.
# NOTE(review): `Regressor` is not defined in the visible cells; presumably it
# is provided by `%run script.py` - confirm.
print(f"X_train:{X_train.shape}\ny_train:{y_train.shape}")

seq_regressor = Regressor(normalize = True)
# seq_fit is given plain NumPy arrays rather than the pandas objects.
seq_regressor.seq_fit(np.array(X_train),np.array(y_train))

# r2_score's signature is (y_true, y_pred) and the metric is not symmetric, so
# the ground truth must come first. The original passed the predictions first,
# which reports a different (incorrect) score.
train_score = r2_score(y_train, seq_regressor.predict(X_train))
test_score = r2_score(y_test, seq_regressor.predict(X_test))

print("Train Score:", train_score)
print("Test Score: ",test_score)
seq_regressor.plot()