In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
import os
# Directory holding the CSV files that later cells read by bare file name
# (Boston.csv, Housing.csv, Exp_Salaries.csv).
# The original location remains the default; set the ML_DATASETS_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get("ML_DATASETS_DIR", "D:/Training/Academy/ML(Python)/Datasets")
os.chdir(DATA_DIR)

In [3]:
# Load the Boston data and separate the 'medv' target from the feature columns.
boston = pd.read_csv("Boston.csv")
y = boston['medv']
X = boston.drop(columns='medv')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
)

In [7]:
# Fit an ordinary least-squares model on the training split.
lr = LinearRegression().fit(X_train, y_train)

# Display the fitted intercept together with the per-feature coefficients.
(lr.intercept_, lr.coef_)

(38.10422593184809,
 array([-1.15254380e-01,  5.05163648e-02,  2.73670712e-02,  7.30471064e-01,
        -1.89833026e+01,  4.03133215e+00, -4.82896709e-03, -1.54999896e+00,
         3.67470486e-01, -1.66700680e-02, -9.85758737e-01,  7.44544822e-03,
        -4.59169535e-01]))

In [9]:
# Ridge regression with the estimator's default settings, fitted on the training split.
# The fitted parameters are available as ridge.intercept_ and ridge.coef_.
ridge = Ridge().fit(X_train, y_train)

# Predict the held-out rows; the cell output is the mean absolute error on them.
y_pred = ridge.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.071006801570943

Hyper-parameter Tuning

In [11]:
# Candidate regularisation strengths for the manual search.
alphas = [0.001, 0.01, 0.1, 1, 1.5, 2.5, 5, 10]

# One [alpha, MAE] row per candidate. Each model is fitted on the training split
# and scored on the test split, so the ranking below is based on test-set error.
scores = []
for a in alphas:
    ridge = Ridge(alpha=a).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    scores.append([a, mean_absolute_error(y_test, y_pred)])

# Tabulate the results and list them from lowest to highest error.
df_scores = pd.DataFrame(data=scores, columns=['alpha', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,alpha,score
3,1.0,3.071007
4,1.5,3.073661
5,2.5,3.082563
2,0.1,3.100479
6,5.0,3.102838
1,0.01,3.115621
0,0.001,3.117373
7,10.0,3.124733


#### Housing

In [15]:
# Load the Housing data; 'price' is the target and every other column is a feature.
housing = pd.read_csv("Housing.csv")
y = housing['price']
X = housing.drop(columns='price')

In [17]:
# Encoder for the categorical columns: the first level of each column is dropped
# and the result comes back as a dense pandas DataFrame.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform='pandas')

# Encode only the object-dtype columns; every other column passes through
# unchanged, and output column names carry no transformer prefix.
col_trnf = ColumnTransformer(
    transformers=[('OHE', ohe, make_column_selector(dtype_include=object))],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# NOTE: X is replaced by its encoded form here, so this cell expects the raw X
# produced by the loading cell.
X = col_trnf.fit_transform(X)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
)

In [18]:
# Same coarse alpha grid as before, now evaluated on the Housing split.
alphas = [0.001, 0.01, 0.1, 1, 1.5, 2.5, 5, 10]

# Collect [alpha, MAE] pairs; models are fitted on the training split and
# scored on the test split.
scores = []
for a in alphas:
    ridge = Ridge(alpha=a)
    y_pred = ridge.fit(X_train, y_train).predict(X_test)
    scores.append([a, mean_absolute_error(y_test, y_pred)])

# Results table, ordered from lowest to highest error.
df_scores = pd.DataFrame(data=scores, columns=['alpha', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,alpha,score
6,5.0,11540.704019
7,10.0,11547.050792
5,2.5,11561.705728
4,1.5,11574.321775
3,1.0,11581.329165
2,0.1,11598.734394
1,0.01,11600.720198
0,0.001,11600.919547


In [19]:
# Fixed-step grid: values start at 0.001 and advance by 0.5 while below 15.
np.arange(start=0.001, stop=15, step=0.5)

array([1.0000e-03, 5.0100e-01, 1.0010e+00, 1.5010e+00, 2.0010e+00,
       2.5010e+00, 3.0010e+00, 3.5010e+00, 4.0010e+00, 4.5010e+00,
       5.0010e+00, 5.5010e+00, 6.0010e+00, 6.5010e+00, 7.0010e+00,
       7.5010e+00, 8.0010e+00, 8.5010e+00, 9.0010e+00, 9.5010e+00,
       1.0001e+01, 1.0501e+01, 1.1001e+01, 1.1501e+01, 1.2001e+01,
       1.2501e+01, 1.3001e+01, 1.3501e+01, 1.4001e+01, 1.4501e+01])

In [22]:
# Fixed-count grid: 20 evenly spaced values from 0.001 to 15, both ends included.
np.linspace(start=0.001, stop=15, num=20)

array([1.00000000e-03, 7.90421053e-01, 1.57984211e+00, 2.36926316e+00,
       3.15868421e+00, 3.94810526e+00, 4.73752632e+00, 5.52694737e+00,
       6.31636842e+00, 7.10578947e+00, 7.89521053e+00, 8.68463158e+00,
       9.47405263e+00, 1.02634737e+01, 1.10528947e+01, 1.18423158e+01,
       1.26317368e+01, 1.34211579e+01, 1.42105789e+01, 1.50000000e+01])

In [25]:
# Finer search: 30 evenly spaced alpha values between 0.001 and 15.
alphas = np.linspace(start=0.001, stop=15, num=30)

# Collect [alpha, MAE] pairs; models are fitted on the training split and
# scored on the test split.
scores = []
for a in alphas:
    ridge = Ridge(alpha=a).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    scores.append([a, mean_absolute_error(y_test, y_pred)])

# Results table, ordered from lowest to highest error.
df_scores = pd.DataFrame(data=scores, columns=['alpha', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,alpha,score
13,6.72469,11533.141773
14,7.241897,11533.594498
12,6.207483,11534.960073
15,7.759103,11535.836907
11,5.690276,11537.338372
16,8.27631,11538.09068
10,5.173069,11539.838648
17,8.793517,11540.473661
9,4.655862,11542.469585
18,9.310724,11543.069034


#### Exp_Salaries

In [27]:
# Load the salaries data; 'Salary' is the target and every other column is a feature.
sals = pd.read_csv("Exp_Salaries.csv")
y = sals['Salary']
X = sals.drop(columns='Salary')

In [28]:
# Encoder for the categorical columns: the first level of each column is dropped
# and the result comes back as a dense pandas DataFrame.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform='pandas')

# Encode only the object-dtype columns; every other column passes through
# unchanged, and output column names carry no transformer prefix.
col_trnf = ColumnTransformer(
    transformers=[('OHE', ohe, make_column_selector(dtype_include=object))],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# NOTE: X is replaced by its encoded form here, so this cell expects the raw X
# produced by the loading cell.
X = col_trnf.fit_transform(X)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
)

In [29]:
# 30 evenly spaced alpha values between 0.001 and 15, evaluated on the salaries split.
alphas = np.linspace(start=0.001, stop=15, num=30)

# Collect [alpha, MAE] pairs; models are fitted on the training split and
# scored on the test split.
scores = []
for a in alphas:
    ridge = Ridge(alpha=a)
    y_pred = ridge.fit(X_train, y_train).predict(X_test)
    scores.append([a, mean_absolute_error(y_test, y_pred)])

# Results table, ordered from lowest to highest error.
df_scores = pd.DataFrame(data=scores, columns=['alpha', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,alpha,score
0,0.001,3035.260687
1,0.518207,3060.466741
2,1.035414,3078.726901
3,1.552621,3092.79629
4,2.069828,3104.036051
5,2.587034,3113.225194
6,3.104241,3120.855344
7,3.621448,3132.731545
8,4.138655,3144.43301
9,4.655862,3153.783976


`pd.get_dummies()`

In [32]:
# Dummy-encode the salaries frame directly with pandas, dropping the first
# level of each encoded column.
pd.get_dummies(data=sals, drop_first=True)

Unnamed: 0,Salary,Years_Previous_Experience,Years Employed,Years_Education,Number_Supervised,Gender_Male,Department_B,Department_C,Department_D
0,32782,1,0,7,0,True,False,False,False
1,32920,3,15,9,4,False,False,False,False
2,29548,6,5,1,0,True,False,False,False
3,39828,6,18,5,5,False,False,False,False
4,31528,11,3,3,6,True,False,False,False
5,38985,7,18,9,5,True,False,False,False
6,41889,16,22,7,7,True,False,False,False
7,38791,4,21,5,9,True,False,False,False
8,28985,1,0,4,4,False,False,False,False
9,24749,2,6,0,1,False,False,False,False
