<a href="https://colab.research.google.com/github/hazrakeruboO/DS-Colabs/blob/main/Copy_of_Python_Programming_Elastic_Net_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font color="green">*To start working on this notebook, or any other notebook that we will use in the Moringa Data Science Course, we will need to save our own copy of it. We can do this by clicking File > Save a Copy in Drive. We will then be able to make edits to our own copy of this notebook.*</font>

# Python Programming: Elastic Net Regression 

## Example

In [None]:
# Example 1
# ---
# Use the fair dataset from the pydataset library to predict marriage satisfaction based on the given variables.
# ---
# 
!pip install pydataset



In [None]:
# Importing our libraries
# 
from pydataset import data
import numpy as np
import pandas as pd
# Widen pandas' display limits so that large frames are printed without truncation.
for _option, _value in (
    ('display.max_rows', 5000),
    ('display.max_columns', 5000),
    ('display.width', 10000),
):
    pd.set_option(_option, _value)

from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

ModuleNotFoundError: ignored

In [None]:
# Data preparation
#
# Load the Fair dataset and recode the two categorical columns as 0/1 integers.
# The recoding goes through .map instead of writing integers into the string
# columns with .loc, so it does not depend on the columns tolerating mixed types.
df = pd.DataFrame(data('Fair')).assign(
    sex=lambda frame: frame['sex'].map({'male': 0, 'female': 1}).astype(int),
    child=lambda frame: frame['child'].map({'no': 0, 'yes': 1}).astype(int),
)

# Predictors and target ('rate') shared by the models below.
X = df[['religious', 'age', 'sex', 'ym', 'education', 'occupation', 'nbaffairs']]
y = df['rate']

In [None]:
# Creating our linear regression model for the purpose of comparison
#
regression = LinearRegression().fit(X, y)
baseline_predictions = regression.predict(X)
first_model = mean_squared_error(y, baseline_predictions)
print(first_model)

# This mean squared error score of 1.05 is our benchmark for determining
# if the elastic net model will be better or worse.

In [None]:
# Below are the coefficients of this first model. zip pairs every feature name
# with its fitted coefficient and dict collects the pairs into a lookup table.
#
coef_dict_baseline = dict(zip(X.columns, regression.coef_))
coef_dict_baseline

In [None]:
# Elastic Net Model
# Elastic net, just like ridge and lasso regression, requires normalized data.
# The normalize=True argument that ElasticNet used to accept was removed in
# scikit-learn 1.2, so the scaling is done by a StandardScaler step chained in
# front of the estimator. Keeping the scaler inside the pipeline also means that
# each cross-validation fold is scaled using its own training rows only.
# The second thing we need to do is create our grid. Parameters of a pipeline
# step are addressed as '<step name>__<parameter>', hence the 'elasticnet__' prefix.
#
elastic = make_pipeline(StandardScaler(), ElasticNet())
param_grid = {
    'elasticnet__alpha': np.logspace(-5, 2, 8),
    'elasticnet__l1_ratio': [.2, .4, .6, .8],
}
search = GridSearchCV(
    estimator=elastic,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    refit=True,
    cv=10,
)

In [None]:
# We will now fit our model and display the best parameters and the best results we can get with that setup.
#
search.fit(X, y)
# Only the last expression of a cell is rendered, so both results are returned
# together as one tuple. The search scores with 'neg_mean_squared_error', so the
# absolute value of best_score_ is the cross-validated MSE.
search.best_params_, abs(search.best_score_)

In [None]:
# The best hyperparameters were an alpha of 0.001 and an l1_ratio of 0.8.
# With these settings we got an MSE of 1.08. This is above the MSE of 1.05 for the baseline model,
# which means that elastic net is doing worse than linear regression.
# For clarity, we will set our hyperparameters to the recommended values and run on the data.
# NOTE(review): the l1_ratio used below is 0.75, not the 0.8 quoted above, and the quoted figures
# presumably come from a run that used ElasticNet(normalize=True) - re-check them after re-running.
#
# ElasticNet no longer accepts normalize=True (removed in scikit-learn 1.2), so the
# features are standardized by a StandardScaler step placed in front of the estimator.
elastic_pipeline = make_pipeline(
    StandardScaler(),
    ElasticNet(alpha=0.001, l1_ratio=0.75),
)
elastic_pipeline.fit(X, y)

# Keep the fitted ElasticNet step available under the original name so that
# elastic.coef_ still works. Its coefficients refer to the standardized features,
# so predictions must go through elastic_pipeline, not elastic itself.
elastic = elastic_pipeline[-1]
second_model = mean_squared_error(y, elastic_pipeline.predict(X))
print(second_model)

In [None]:
# Below are the coefficients of the elastic net model. They get their own name so that
# coef_dict_baseline (the linear regression coefficients) stays available for comparison.
#
coef_dict_elastic = dict(zip(X.columns, elastic.coef_))
coef_dict_elastic

# The coefficients are mostly the same.
# Notice that occupation was completely removed from the model in the elastic net version.
# This means that this value was no good to the algorithm. Traditional regression cannot do this.

## Challenges

### <font color="green">Challenge 1</font>

In [None]:
# Challenge 1
# ---
# Question: Using the given housing dataset, create a regression model to predict
# the value of prices of a house using the given features.
# ---
# Dataset url = http://bit.ly/BostonHousingDataset
# ---
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
bostona=pd.read_csv('http://bit.ly/BostonHousingDataset')
# The CSV carries the row number as an extra 'Unnamed: 0' column. The cells that
# follow work on `boston`, which holds the same rows without that column.
boston = bostona.drop(columns=['Unnamed: 0'], errors='ignore')
bostona.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [None]:
# NOTE(review): load_boston was removed from scikit-learn in version 1.2, so on newer
# releases this import raises ImportError - confirm the installed version.
# None of the cells shown in this notebook call load_boston.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [None]:
# List the column labels of the housing data.
boston.keys()

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat', 'medv'],
      dtype='object')

In [None]:
# Show the dtype of every column.
boston.dtypes

crim       float64
zn         float64
indus      float64
chas         int64
nox        float64
rm         float64
age        float64
dis        float64
rad          int64
tax          int64
ptratio    float64
b          float64
lstat      float64
medv       float64
dtype: object

In [None]:
# `boston` is a pandas DataFrame, which has no `feature_names` attribute: the
# feature names are all of its column labels except the target column 'medv'.
boston.columns.drop('medv')

In [None]:
# `boston` is a pandas DataFrame, which has no `target` attribute: the value to
# predict is read from its 'medv' column.
boston['medv']

In [None]:
# Feature matrix as a NumPy array: every column of the DataFrame except the target 'medv'
# (a DataFrame has no `.data` attribute).
# NOTE: the name `data` also shadows the `data` loader imported from pydataset at the top of the notebook.
data = boston.drop(columns='medv').to_numpy()
type(data)

In [None]:
# Dimensions of `data`.
data.shape

In [None]:
# Wrap the feature array in a DataFrame, labelling its columns with the feature names,
# i.e. all columns of `boston` except the target 'medv'.
data = pd.DataFrame(data=data, columns=boston.columns.drop('medv'))
data.head()

In [None]:
# Add the price column, taken from the 'medv' column of `boston`.
# to_numpy() assigns the values by position, so differing index labels cannot misalign rows.
data['price'] = boston['medv'].to_numpy()
data.head()

In [None]:
# Understanding the data: summary statistics for every numeric column.
# describe must be called; without the parentheses only the method object is shown.
data.describe()

In [None]:
# Print a concise summary of the frame.
data.info()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Next step is data visualization: pairwise plots of all columns.
# The trailing semicolon keeps the PairGrid repr out of the cell output.
sns.pairplot(data);

<seaborn.axisgrid.PairGrid at 0x7f3818308290>

In [None]:
# To find the correlation of the features with price, we plot a correlation matrix.
# High correlation means there is a high chance of predictability with price.
# Next is plotting a distribution plot.
# If features are skewed, they are not going to help us with prediction, hence we need a distribution plot to check this.
# Skewed features will increase the complexity of the model and reduce its accuracy.

In [None]:
# Draw the distribution of every column of `data` on a 2 x 7 grid of axes.
rows = 2
cols = 7
# plt.subplots (plural) creates the figure together with the grid of axes.
fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 4))

# ax.flat walks the grid row by row; zip hands each axes one column name.
for axis, column in zip(ax.flat, data.columns):
    # histplot is the axes-level plot; displot is figure-level and cannot draw
    # into an axes that already exists.
    sns.histplot(data[column], kde=True, ax=axis)

plt.tight_layout()

In [None]:
# You can remove features which are highly skewed and keep the ones which are normally distributed.
# A negative correlation shows that if x increases, y decreases.
corrmat=data.corr()
corrmat



In [None]:
# Heatmap of the full correlation matrix, annotated with the coefficient values.
# plt.subplots (plural) returns the (figure, axes) pair unpacked here.
fig, ax = plt.subplots(figsize=(18, 10))
# annot_kws takes a dict of text properties; the matrix computed earlier is `corrmat`.
sns.heatmap(corrmat, annot=True, annot_kws={'size': 32}, ax=ax)

In [None]:
# Row labels of the correlation matrix as an array.
corrmat.index.values

In [None]:
# Selecting the features that are highly correlated with our target variable, which is the price.
def getcorrelatedfeature(corrdata,threshold):
    """Return the entries of a correlation Series whose magnitude exceeds a threshold.

    Parameters
    ----------
    corrdata : pandas.Series
        Correlation coefficients indexed by feature name, e.g. one column of a
        correlation matrix.
    threshold : float
        Entries are kept when their absolute value is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        One row per kept feature (indexed by feature name, in the original
        order) with a single column named 'corr value'. Empty when nothing
        passes the threshold.
    """
    # Compare magnitudes so strong negative correlations are kept as well.
    strong = corrdata[corrdata.abs() > threshold]
    return strong.to_frame(name='corr value')


In [None]:
# Selecting features intelligently before fitting the model (if you fit on the corr values you will get
# a very low training error but a very high testing error).
threshold = 0.50
# The helper defined in this notebook is named getcorrelatedfeature.
corr_value = getcorrelatedfeature(corrmat['price'], threshold)
corr_value

In [None]:
# Names of the selected features (the attribute is `.values`, plural).
corr_value.index.values

In [None]:
# Keep only the selected columns. A DataFrame is indexed with square brackets;
# calling it like a function raises TypeError.
correlated_data = data[corr_value.index]
correlated_data.head()

In [None]:
# Pairplot and correlation matrix of the selected data
sns.pairplot(correlated_data)
plt.tight_layout()

In [None]:
# Heatmap of the correlations between the selected columns.
# annot_kws takes a dict of text properties.
sns.heatmap(correlated_data.corr(), annot=True, annot_kws={'size': 12})

In [None]:
# Split the selected columns into the predictors X and the target Y ('price').
X = correlated_data.drop(columns=['price'])
Y = correlated_data['price']
# head must be called; without the parentheses only the method object is shown.
X.head()

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
# The target of this challenge is `Y` (the 'price' column); `y` still holds the target of the
# Fair example. The test features are named X_test, the name the later cells read.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
# Shapes of the training and test feature sets.
X_train.shape,X_test.shape

In [None]:
# Training our model
model = LinearRegression()
# Fit our model on the training split (the split cell names these X_train and y_train).
model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features.
y_predict=model.predict(X_test)

In [None]:
# Compare y_predict with y_test.
y_predict,y_test

In [None]:
# Predictions and true values side by side, one labelled column each.
# A separate name is used so the `df` frame of the Fair example is not overwritten;
# np.asarray drops the index of y_test so the two columns are paired by position.
comparison = pd.DataFrame({'predicted': y_predict, 'actual': np.asarray(y_test)})
comparison

In [None]:
# Defining performance metrics with the help of R² (R squared).
# If R² is low or close to zero, that is a bad model; if R² = 1, that is a very good model.

In [None]:
# Regression evaluation metrics:
# MAE, MSE, RMSE

In [None]:
# The metric functions live in sklearn.metrics.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

score = r2_score(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
# RMSE is the square root of the MSE, which puts it in the same units as the target.
rmse = mse ** 0.5

print('r_score', score)
print('mae', mae)
print('mse', mse)
print('rmse', rmse)

### <font color="green">Challenge 2</font>

In [None]:
# Challenge 2
# ---
# Question: Using the Ames Housing dataset, create a regression model to predict the sales price of home
# applying elastic net regression.
# ---
# Dataset Source = http://bit.ly/HousePricesDataset
#
# OUR CODE GOES HERE

### <font color="green">Challenge 3</font>

In [None]:
# Challenge 3
# ---
# Question: Given the medical cost personal dataset, accurately predict insurance cost using a regression model.
# ---
# Dataset Source = https://bit.ly/insurance-_dataset
#
# OUR CODE GOES HERE

### <font color="green">Challenge 4</font>

In [None]:
# Challenge 4
# ---
# Question: Use ElasticNet regression to build a model that is able to accurately predict the profits of a startup.
# ---
# Dataset Source = http://bit.ly/StartupsDataset
# ---
#
# OUR CODE STARTS HERE

### <font color="green">Challenge 5</font>

In [None]:
# Challenge 5
# ---
# Question: Build a prediction model to predict duration for any combination of country, operator,
# services and category given the genre, language and number of units.
# Apply ElasticNet regression while building your model.
# ---
# Dataset Source = https://bit.ly/Audio_content_consumption
# ---
#
# OUR CODE STARTS HERE