In [222]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [223]:
def date_to_semester_fraction(date):
    """Map a ``'day/month/year'`` string to its position within its half-year.

    The calendar year is divided into two semesters: January-June and
    July-December. The result is the 1-based day number inside the semester
    divided by the longest possible length of that semester (182 days for the
    first one, reached in leap years, and 184 days for the second one).

    Parameters
    ----------
    date : str
        Date formatted as ``'d/m/yyyy'`` (day first), e.g. ``'16/12/2006'``.

    Returns
    -------
    float
        A value in ``(0, 1]``.

    Raises
    ------
    ValueError
        If the string does not contain three integer fields, or if the month
        or the day is outside the valid range for that year.
    """
    day, month, year = map(int, date.split('/'))
    if not 1 <= month <= 12:
        raise ValueError(f"month must be in 1..12, got {month} in {date!r}")

    # February gains a day in leap years, which shifts every later date.
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    days_in_month = [31, 29 if is_leap else 28, 31, 30, 31, 30,
                     31, 31, 30, 31, 30, 31]
    if not 1 <= day <= days_in_month[month - 1]:
        raise ValueError(f"day {day} is out of range for month {month} in {date!r}")

    # Index of the semester's first month in days_in_month, and its maximum length.
    if month <= 6:
        first_month_idx, semester_length = 0, 182
    else:
        first_month_idx, semester_length = 6, 184

    day_of_semester = sum(days_in_month[first_month_idx:month - 1]) + day
    return day_of_semester / semester_length

In [224]:
# Load the readings, turn the '?' placeholders into NaN and discard every row
# that contains one.
df = (
    pd.read_csv('household_power_consumption.csv')
    .replace('?', np.nan)
    .dropna()
)

# Time of day ('hh:mm:ss') expressed as a fraction of a 24-hour day.
hms = df['Time'].str.split(':', expand=True).astype(int)
df['day_frac'] = (hms[0] * 3600 + hms[1] * 60 + hms[2]) / 86400

# Calendar date expressed as a fraction of the semester.
df['semester_frac'] = df['Date'].map(date_to_semester_fraction)

# The raw index/date/time columns are no longer needed once encoded.
df = df.drop(columns=['index', 'Date', 'Time'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 256869 entries, 0 to 260639
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Global_active_power    256869 non-null  object 
 1   Global_reactive_power  256869 non-null  object 
 2   Voltage                256869 non-null  object 
 3   Global_intensity       256869 non-null  object 
 4   Sub_metering_1         256869 non-null  object 
 5   Sub_metering_2         256869 non-null  object 
 6   Sub_metering_3         256869 non-null  float64
 7   day_frac               256869 non-null  float64
 8   semester_frac          256869 non-null  float64
dtypes: float64(3), object(6)
memory usage: 19.6+ MB


In [225]:
# Convert every column (several are still string-typed `object`) to float64
# with a single whole-frame cast.
df = df.astype(float)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 256869 entries, 0 to 260639
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Global_active_power    256869 non-null  float64
 1   Global_reactive_power  256869 non-null  float64
 2   Voltage                256869 non-null  float64
 3   Global_intensity       256869 non-null  float64
 4   Sub_metering_1         256869 non-null  float64
 5   Sub_metering_2         256869 non-null  float64
 6   Sub_metering_3         256869 non-null  float64
 7   day_frac               256869 non-null  float64
 8   semester_frac          256869 non-null  float64
dtypes: float64(9)
memory usage: 19.6 MB


In [226]:
# Replace the active/reactive power pair by a single column holding their
# Euclidean norm, sqrt(active^2 + reactive^2).
df = (
    df
    .assign(
        Global_apparent_power=lambda frame: np.sqrt(
            frame['Global_active_power'] ** 2 + frame['Global_reactive_power'] ** 2
        )
    )
    .drop(columns=['Global_active_power', 'Global_reactive_power'])
)

In [227]:
# Display the frame as it stands after Global_apparent_power replaced the
# active/reactive power columns (the notebook renders a truncated preview).
df

Unnamed: 0,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,day_frac,semester_frac,Global_apparent_power
0,241.97,10.6,0.0,0.0,0.0,0.000000,0.005495,2.583582
1,241.75,10.4,0.0,0.0,0.0,0.000694,0.005495,2.553958
2,241.64,10.4,0.0,0.0,0.0,0.001389,0.005495,2.551960
3,241.71,10.4,0.0,0.0,0.0,0.002083,0.005495,2.551960
4,241.98,10.4,0.0,0.0,0.0,0.002778,0.005495,2.555957
...,...,...,...,...,...,...,...,...
260635,239.01,12.0,0.0,0.0,18.0,0.996528,0.994505,2.902413
260636,238.86,12.2,0.0,0.0,17.0,0.997222,0.994505,2.914074
260637,239.05,12.0,0.0,0.0,18.0,0.997917,0.994505,2.895570
260638,238.98,11.2,0.0,0.0,18.0,0.998611,0.994505,2.675762


In [228]:
# Feature matrix: every column except the target. Target: Global_apparent_power.
X = df.drop(columns='Global_apparent_power').to_numpy()
y = df.loc[:, 'Global_apparent_power'].to_numpy()

# Sanity check: one target value per feature row.
X.shape, y.shape

((256869, 7), (256869,))

In [229]:
from itertools import product

from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold

# Exhaustive search over Lasso hyper-parameters, scored by 5-fold
# cross-validated mean absolute error (MAE).
#
# Result: `kf_scores`, one row per (alpha, max_iter, tol) combination with
#   mae     - MAE averaged over the folds
#   n_zeros - number of coefficients that are exactly zero in *every* fold,
#             i.e. features the model consistently discards. (Taking the value
#             from whichever fold happened to run last would make it depend on
#             an arbitrary fold while `mae` is a fold average.)

# Seed the shuffle so the folds, and therefore all scores, are reproducible
# after a kernel restart.
CV_SEED = 42
kf = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0]
max_iters = [100, 1000, 10000]
tols = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8]

# The split does not depend on the hyper-parameters: compute it once so every
# configuration is evaluated on exactly the same folds.
folds = list(kf.split(X))

# Collect plain rows and build the frame once at the end instead of growing a
# DataFrame row by row inside the loop.
records = []
for alpha, max_iter, tol in product(alphas, max_iters, tols):
    scores = []
    zero_masks = []
    for train_index, test_index in folds:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = Lasso(alpha=alpha, max_iter=max_iter, tol=tol)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        mae = np.mean(np.abs(preds - y_test))
        scores.append(mae)
        zero_masks.append(model.coef_ == 0)
    # A coefficient counts as eliminated only if it is zero in all folds.
    n_zeros = int(np.all(zero_masks, axis=0).sum())
    records.append([alpha, max_iter, tol, np.mean(scores), n_zeros])

# Cast to float so the column dtypes match what the row-wise construction
# produced (all float64); downstream cells compare n_zeros with == 0, 2, ...
kf_scores = pd.DataFrame(
    records, columns=['alpha', 'max_iter', 'tol', 'mae', 'n_zeros']
).astype(float)

In [230]:
# Rank all evaluated hyper-parameter combinations by their fold-averaged MAE,
# lowest (best) first. `a` is the ranked table used by the cells that follow.
a = kf_scores.sort_values(by='mae')
a

Unnamed: 0,alpha,max_iter,tol,mae,n_zeros
23,0.00001,10000.0,1.000000e-08,0.026435,0.0
6,0.00001,100.0,1.000000e-07,0.026435,0.0
13,0.00001,1000.0,1.000000e-06,0.026435,0.0
22,0.00001,10000.0,1.000000e-07,0.026435,0.0
7,0.00001,100.0,1.000000e-08,0.026435,0.0
...,...,...,...,...,...
135,1.00000,1000.0,1.000000e-08,0.159742,5.0
141,1.00000,10000.0,1.000000e-06,0.159742,5.0
125,1.00000,100.0,1.000000e-06,0.159743,5.0
132,1.00000,1000.0,1.000000e-05,0.159743,5.0


In [231]:
# Distinct n_zeros values that occur in the results, i.e. which sparsity
# levels are available to pick a best configuration from.
print(np.unique(a['n_zeros']))

[0. 2. 3. 4. 5.]


In [232]:
# Lowest-MAE configuration among the runs recorded with n_zeros == 0.
best_0_zeros = (
    a.loc[a['n_zeros'].eq(0)]
    .sort_values(by='mae')
    .iloc[0]
)
best_0_zeros

alpha       1.000000e-05
max_iter    1.000000e+04
tol         1.000000e-08
mae         2.643486e-02
n_zeros     0.000000e+00
Name: 23, dtype: float64

In [233]:
# Lowest-MAE configuration among the runs recorded with n_zeros == 2.
best_2_zeros = (
    a.loc[a['n_zeros'].eq(2)]
    .sort_values(by='mae')
    .iloc[0]
)
best_2_zeros

alpha         0.001000
max_iter    100.000000
tol           0.000001
mae           0.026449
n_zeros       2.000000
Name: 53, dtype: float64

In [234]:
# Lowest-MAE configuration among the runs recorded with n_zeros == 4.
best_4_zeros = (
    a.loc[a['n_zeros'].eq(4)]
    .sort_values(by='mae')
    .iloc[0]
)
best_4_zeros

alpha          0.010000
max_iter    1000.000000
tol            0.001000
mae            0.026694
n_zeros        4.000000
Name: 82, dtype: float64

In [235]:
# Lowest-MAE configuration among the runs recorded with n_zeros == 5.
best_5_zeros = (
    a.loc[a['n_zeros'].eq(5)]
    .sort_values(by='mae')
    .iloc[0]
)
best_5_zeros

alpha          0.100000
max_iter    1000.000000
tol            0.100000
mae            0.032235
n_zeros        5.000000
Name: 104, dtype: float64