In [192]:
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [193]:
df = pd.read_csv('FuelConsumption.csv')
df.head(1)

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196


In [194]:
cols = ['FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB','FUELCONSUMPTION_COMB_MPG']
combinations = []
for i in range(1, len(cols)+1):
    combinations.extend([list(x) for x in itertools.combinations(cols,i)])

combinations

[['FUELCONSUMPTION_HWY'],
 ['FUELCONSUMPTION_COMB'],
 ['FUELCONSUMPTION_COMB_MPG'],
 ['FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB'],
 ['FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB_MPG'],
 ['FUELCONSUMPTION_COMB', 'FUELCONSUMPTION_COMB_MPG'],
 ['FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB', 'FUELCONSUMPTION_COMB_MPG']]

In [195]:
y = df['CO2EMISSIONS'].values
x0 = np.ones((len(df),1))
n_epochs = 100


In [196]:
def calc_lr(t, a=0.01, b=200):
    return a/(b+t)

def reg_predict(X, w):
    return np.dot(X, w)


def mse_loss(error):
    return np.sum(error*error)/error.shape[0]

def cal_gradient(X, error):
    return (1/len(X))*(np.dot(X.T,error))


def update_weight(w, lr, gradient):
    return w-(lr*gradient)


def train_SGD(X,y,w,n_epochs):
    losses = []
    W = []
    m = len(X)
    for epoch in range(n_epochs):
        for iteration in range(m):
            lr = calc_lr(epoch+iteration)
            i = np.random.randint(0,m)
            xi = X[i:i+1]
            yi = y[i:i+1]
            y_hat = reg_predict(xi, w)
            error = y_hat-yi
            loss = mse_loss(error)
            gradient = cal_gradient(xi, error)
            w = update_weight(w, lr, gradient)
            W.append(w)
            losses.append(loss)
    return losses[-1], W[-1]

def train_MGD(X,y,w,n_epochs,batch_size):
    losses = []
    W = []
    m = len(X)
    for epoch in range(n_epochs):
        new_index = np.random.permutation(len(X))
        new_x = X[new_index]
        new_y = y[new_index]
        for iteration in range(0,m,batch_size):
            lr = calc_lr(epoch+iteration/batch_size)
            xi = new_x[iteration:iteration+batch_size]
            yi = new_y[iteration:iteration+batch_size]
            y_hat = reg_predict(xi, w)
            error = y_hat-yi
            loss = mse_loss(error)
            gradient = cal_gradient(xi, error)
            w = update_weight(w, lr, gradient)
            W.append(w)
            losses.append(loss)
    return losses, W

<div dir="rtl">
تمرین یک:
بررسی ترکیبات سه ستون و پیاده سازی sgd  با دو تست سایز متفاوت

In [197]:
y = df['CO2EMISSIONS'].values.reshape(-1, 1)

errors = []
for col in combinations:
  X = df[col].values
  n = len(col)
  x_aug = np.concatenate((x0, X), axis=1)

  x_train, x_test, y_train, y_test = train_test_split(x_aug,y, test_size=0.2)
  w = np.random.rand(n+1, 1)
  _, w_last = train_SGD(x_train,y_train,w,n_epochs)
  y_predict = reg_predict(x_test,w_last)
  loss_02 =np.round(mse_loss(y_predict-y_test),1)

  x_train, x_test, y_train, y_test = train_test_split(x_aug,y, test_size=0.3)
  w = np.random.rand(n+1, 1)
  _, w_last = train_SGD(x_train,y_train,w,n_epochs)
  y_predict = reg_predict(x_test,w_last)
  loss_03 =np.round(mse_loss(y_predict-y_test),1)
 
  errors.append((','.join(col),loss_02,loss_03))

errors_df = pd.DataFrame(errors,columns=['col','loss_0.2','loss_0.3'])
errors_df = errors_df.sort_values(by='loss_0.2')
errors_df.reset_index(drop=True, inplace=True)
errors_df


Unnamed: 0,col,loss_0.2,loss_0.3
0,"FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG",863.1,989.5
1,"FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELC...",914.4,1095.7
2,FUELCONSUMPTION_COMB,994.2,1227.9
3,FUELCONSUMPTION_HWY,1211.0,1203.6
4,"FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB_MPG",1302.7,1263.8
5,"FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB",1321.3,1077.8
6,FUELCONSUMPTION_COMB_MPG,14302.7,12830.8


<div dir="rtl">
تمرین دو:

انتخاب یک ستون از دیتا و اعمال mgd بر روی آن

In [198]:
y = df['CO2EMISSIONS'].values.reshape(-1, 1)
selected_col = ['FUELCONSUMPTION_COMB_MPG']
X = df[selected_col].values
x_aug = np.concatenate((x0, X), axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_aug,y, test_size=0.2)
w = np.random.rand(2, 1)
losses, W = train_MGD(x_train,y_train,w,n_epochs,30)
y_predict = reg_predict(x_test,W[-1])
loss =np.round(mse_loss(y_predict-y_test),1)
loss

np.float64(15975.0)