Linear regression, Ridge and Lasso

In [92]:
## House Pricing dataset
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
df = fetch_california_housing()


In [20]:
type(df)

sklearn.utils._bunch.Bunch

In [21]:
df

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [22]:
## Convert df into a proper dataframe
dataset = pd.DataFrame(df.data)

In [23]:
## Adding the column names to the dataframe using the 'feature_names' from the original loaded df(note: below are only the independent variables)
dataset.columns = df.feature_names
dataset

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [24]:
## Retrieving the target/dependent variables from the originial loaded, df 'target', and add them into a new column called price into the dataframe
dataset['Price'] = df.target
dataset.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [53]:
# Independent and dependent variables
X = dataset.iloc[:, :-1]  # keep original column names
y = dataset['Price']

In [54]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [None]:

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [95]:
# Splitting the data into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [99]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [100]:
## Linear regression with cross validation
lin_reg = LinearRegression()
neg_mse_scores = cross_val_score(lin_reg, X_train_scaled,y_train,scoring='neg_mean_squared_error',cv = 5)
mean_mse = np.mean(neg_mse_scores)
std_mse = np.std(neg_mse_scores)
print(f"Mean CV MSE: {mean_mse:.4f}")
print(f"STD of CV MSE: {std_mse:.4f}")

Mean CV MSE: -0.5193
STD of CV MSE: 0.0149


In [106]:
# Converting mean mse to rmse to interpret the magnitude of the error

mean_rmse = np.sqrt(-mean_mse)
print(mean_rmse)

0.720600583640735


In [103]:
# Create a summary table that shows corresponding feature coefficients
lin_reg.fit(X_train_scaled,y_train)    # must fit first before accessing the coeffcients or intercepts
summary = pd.DataFrame({
    'Feature': X.columns,  # feature names
    'Coefficient': lin_reg.coef_
})
summary

Unnamed: 0,Feature,Coefficient
0,MedInc,0.854383
1,HouseAge,0.122546
2,AveRooms,-0.29441
3,AveBedrms,0.339259
4,Population,-0.002308
5,AveOccup,-0.040829
6,Latitude,-0.896929
7,Longitude,-0.869842


In [105]:
## Add cross-validated MSE info
summary['CV_Neg_MSE'] = np.mean(neg_mse_scores)  # same mean for all rows
summary['CV_Neg_MSE_STD'] = np.std(neg_mse_scores)  # same std for all rows
summary

Unnamed: 0,Feature,Coefficient,CV_Neg_MSE,CV_Neg_MSE_STD
0,MedInc,0.854383,-0.519265,0.014894
1,HouseAge,0.122546,-0.519265,0.014894
2,AveRooms,-0.29441,-0.519265,0.014894
3,AveBedrms,0.339259,-0.519265,0.014894
4,Population,-0.002308,-0.519265,0.014894
5,AveOccup,-0.040829,-0.519265,0.014894
6,Latitude,-0.896929,-0.519265,0.014894
7,Longitude,-0.869842,-0.519265,0.014894


In [107]:
# Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [111]:
# Ridge Regression with GridSearchCV
ridge = Ridge()
ridge_params = {'alpha':[1e-3, 1e-2, 0.1, 1, 5, 10, 20, 50, 100]}
ridge_grid = GridSearchCV(ridge, ridge_params, scoring='neg_mean_squared_error', cv=15)
ridge_grid.fit(X_train_scaled, y_train)

In [112]:
print(ridge_grid.best_params_)
print(ridge_grid.best_score_)

{'alpha': 1}
-0.5197972422122379


In [116]:
# Create a DataFrame with feature names and their coefficients
ridge = Ridge(alpha=1.0)  # choose alpha
ridge.fit(X_train_scaled, y_train)    # fit the model
coef_table = pd.DataFrame({
    'Feature': X.columns,       # column names
    'Coefficient': ridge.coef_  # fitted coefficients
})
coef_table

Unnamed: 0,Feature,Coefficient
0,MedInc,0.854327
1,HouseAge,0.122624
2,AveRooms,-0.29421
3,AveBedrms,0.339008
4,Population,-0.002282
5,AveOccup,-0.040833
6,Latitude,-0.896168
7,Longitude,-0.869071


Since the ridge regression score of -0.5197 is very very similar to the linear regression score of 0.5193, which means ridge did not really improve as much. So will try Lasso regression next

In [61]:
# Lasso Regression
from sklearn.linear_model import Lasso

In [117]:
# Lasso Regression with GridSearchCV
lasso = Lasso()
lasso_params = {'alpha':[1e-10, 1e-2, 0.1, 1, 5, 10, 20,30,35,40,45,50,55,100]}
lasso_grid = GridSearchCV(lasso, lasso_params, scoring='neg_mean_squared_error', cv=15)
lasso_grid.fit(X_train_scaled, y_train)

In [118]:
print(lasso_grid.best_params_)
print(lasso_grid.best_score_)

{'alpha': 1e-10}
-0.5197976842098386


In [119]:
# Create a DataFrame with feature names and their coefficients
lasso= Lasso(alpha=1e-10)  # choose alpha
lasso.fit(X_train_scaled, y_train)    # fit the model
coef_table = pd.DataFrame({
    'Feature': X.columns,       # column names
    'Coefficient': lasso.coef_  # fitted coefficients
})
coef_table

Unnamed: 0,Feature,Coefficient
0,MedInc,0.854383
1,HouseAge,0.122546
2,AveRooms,-0.29441
3,AveBedrms,0.339259
4,Population,-0.002308
5,AveOccup,-0.040829
6,Latitude,-0.896929
7,Longitude,-0.869842


From all of the above, it looks like that linear regression performs slightly better than ridge and lasso

In [126]:
y_pred = lin_reg.predict(X_test_scaled)

In [124]:
from sklearn.metrics import r2_score

In [127]:
r2_score1 = r2_score(y_pred, y_test)
print(r2_score1)

0.33767016589309884


In [128]:
## Logistic regression
from sklearn.linear_model import LogisticRegression

In [129]:
from sklearn.datasets import load_breast_cancer

In [132]:
df = load_breast_cancer()
type(df)


sklearn.utils._bunch.Bunch

In [138]:
df.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [134]:
### Independent Features
X = pd.DataFrame(df['data'],columns = df['feature_names'])
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [137]:
### Dependent Features

y = pd.DataFrame(df['target'], columns = ["Target"])
y

Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0
...,...
564,0
565,0
566,0
567,0
