In [141]:
import numpy as np
import matplotlib.pyplot as plt

In [142]:
def lr_schadule(t, a=1, b=1000):
    return a/(b+t)

def linear_predict(X, w):
    return X@w

def mse_loss(y_hat, y):
    error = y_hat-y
    loss = np.sum(error*error)/y.shape[0]
    return error, loss

def calc_gradient(X, error):
    return X.T@error

def update_weights(w, lr, gradient):
    return w-(lr*gradient)

def SGD(X, y, w, lr, n_epochs):
    m = len(X)
    for epoch in range(n_epochs):
        for iteration in range(m):
            idx = np.random.randint(0,m)
            xi = X[idx].reshape(1,-1)
            yi = y[idx].reshape(1,-1)
            y_hat = linear_predict(xi, w)
            error, loss = mse_loss(y_hat, yi)
            gradient = calc_gradient(xi, error)
            lr = lr_schadule(epoch*iteration)
            w = update_weights(w, lr, gradient)
    return w


In [143]:
from sklearn.model_selection import train_test_split
import pandas as pd
import itertools

In [144]:
df = pd.read_csv('../data/fuelConsumption.csv')
y = df[['CO2EMISSIONS']].values

In [145]:
cols = ['FUELCONSUMPTION_CITY','FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB']
subsets = []
for r in range(1,len(cols)+1):
    subsets.extend([list(subset) for subset in itertools.combinations(cols, r)])

subsets

[['FUELCONSUMPTION_CITY'],
 ['FUELCONSUMPTION_HWY'],
 ['FUELCONSUMPTION_COMB'],
 ['FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY'],
 ['FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_COMB'],
 ['FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB'],
 ['FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB']]

In [146]:
np.random.seed(7)
n_epochs = 100
lr = 0.001
result = []
for subset in subsets:
    x = df[subset].values
    x0 = np.ones((x.shape[0],1))
    X = np.concatenate((x0,x), axis=1)
    w= np.random.rand(X.shape[1],1)
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    w = SGD(x_train, y_train, w, lr, n_epochs)
    y_pred = linear_predict(x_test,w)
    _, test_loss_2 = mse_loss(y_pred, y_test)

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    w = SGD(x_train, y_train, w, lr, n_epochs)
    y_pred = linear_predict(x_test,w)
    _, test_loss_3 = mse_loss(y_pred, y_test)
    result.append((','.join(subset), test_loss_2, test_loss_3))
result = pd.DataFrame(result, columns=['cols','MSE_0.2','MSE_0.3'])
result = result.sort_values(by='MSE_0.2').reset_index(drop=True)
result
        

Unnamed: 0,cols,MSE_0.2,MSE_0.3
0,FUELCONSUMPTION_CITY,733.20094,756.078211
1,FUELCONSUMPTION_COMB,810.100965,822.323909
2,"FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY",846.574652,815.668272
3,"FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB",855.822186,767.170271
4,"FUELCONSUMPTION_CITY,FUELCONSUMPTION_COMB",901.852296,680.035297
5,FUELCONSUMPTION_HWY,924.745252,1033.791466
6,"FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELC...",1003.270142,872.344819
