In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as pl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import SGDRegressor

class Data:
  def __init__(self, loc):
    self.data = pd.read_csv(loc)
    header = ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors','body-style','drive-wheels','engine-location','wheel-base','length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']
    self.data.columns = header

  def getData(self):
    return self.data


  def getFeaturesAndTarget(self, Data):  
    X = Data[["highway-mpg", "curb-weight", "horsepower", "engine-size"]]
    Y = Data[["price"]]
    Y = Y.values.ravel()
    s = StandardScaler()
    X = pd.DataFrame(s.fit(X).fit_transform(X))
    return (X, Y)

  def getDataSplit(self, X, Y):  
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)
    return (X_train, X_test, Y_train, Y_test)



class Model:
  model = SGDRegressor()
  def __init__(self, X, Y):
    self.model.fit(X,Y)

  def pred(self, X):  
    return self.model.predict(X);

  def rmseAndR2(self, expected, predicted):
    rmse = (np.sqrt(mean_squared_error(expected, predicted)))
    r2 = r2_score(expected, predicted)
    return (rmse,r2)




if __name__ == "__main__":
    data = Data('imports-85.data'); 
    df = data.getData();
    df = data.processData(df);
    X, Y = data.getFeaturesAndTarget(df)
    X_train, X_test, Y_train, Y_test = data.getDataSplit(X, Y);
    sgdLinear = Model(X_train, Y_train)
    Y_pred_test = sgdLinear.pred(X_test);
    rmse, r2 = sgdLinear.rmseAndR2(Y_test, Y_pred_test);
    print(rmse, r2)


In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as pl
#from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.metrics import mean_squared_error
#from sklearn.metrics import r2_score
#from sklearn.linear_model import SGDRegressor

class Data:
  def __init__(self, loc):
    self.data = pd.read_csv(loc)
    header = ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors','body-style','drive-wheels','engine-location','wheel-base','length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']
    self.data.columns = header

  def getData(self):
    return self.data

  def processData(self):

    self.data.replace('?',np.nan, inplace=True)
    self.data.dropna(inplace=True)

    self.data['normalized-losses'] = self.data['normalized-losses'].astype('int')
    self.data['bore'] = self.data['bore'].astype('float')
    self.data['stroke'] = self.data['stroke'].astype('float')
    self.data['horsepower'] = self.data['horsepower'].astype('float')
    self.data['peak-rpm'] = self.data['peak-rpm'].astype('int')
    self.data['price'] = self.data['price'].astype('int')
    self.data = self.data.select_dtypes(exclude=['object'])

  def setFeatureAndTarget(self):  
    self.data = self.data[["highway-mpg", "curb-weight", "horsepower", "engine-size", "price"]]
    #Y = self.data[["price"]]
    #Y = Y.values.ravel()
    #s = StandardScaler()
    #X = pd.DataFrame(s.fit(X).fit_transform(X))
    #return (X, Y)

  def getDataSplit(self):  
    # Define a size for your train set 
    train_size = int(0.7 * len(self.data))
    
    # Split your dataset 
    train_set = self.data[:train_size]
    test_set = self.data[train_size:]
    X_train = train_set.iloc[:, :-1]
    X_test = test_set.iloc[:, :-1]
    Y_train = train_set.iloc[:, -1]
    Y_test = test_set.iloc[:, -1]

    X_train = X_train.to_numpy() 
    X_test = X_test.to_numpy() 
    Y_train = Y_train.to_numpy() 
    Y_test = Y_test.to_numpy() 

    #X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)
    return (X_train, X_test, Y_train, Y_test)





# if __name__ == "__main__":
#     data = Data('imports-85.data'); 
#     df = data.getData();
#     df = data.processData(df);
#     X, Y = data.getFeaturesAndTarget(df)
#     X_train, X_test, Y_train, Y_test = data.getDataSplit(X, Y);
#     sgdLinear = Model(X_train, Y_train)
#     Y_pred_test = sgdLinear.pred(X_test);
#     rmse, r2 = sgdLinear.rmseAndR2(Y_test, Y_pred_test);
#     print(rmse, r2)


In [56]:
data = Data('imports-85.data')

In [57]:
data.getData()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,177.3,66.3,53.1,2507,ohc,five,136,mpfi,3.19,3.40,8.5,110,5500,19,25,15250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,2952,ohc,four,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
200,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.8,55.5,3049,ohc,four,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
201,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3012,ohcv,six,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
202,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3217,ohc,six,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [58]:
data.processData()

In [59]:
data.getData()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
2,2,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,10.0,102.0,5500,24,30,13950
3,2,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,8.0,115.0,5500,18,22,17450
5,1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.40,8.5,110.0,5500,19,25,17710
7,1,158,105.8,192.7,71.4,55.9,3086,131,3.13,3.40,8.3,140.0,5500,17,20,23875
9,2,192,101.2,176.8,64.8,54.3,2395,108,3.50,2.80,8.8,101.0,5800,23,29,16430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,-1,95,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,9.5,114.0,5400,23,28,16845
200,-1,95,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,8.7,160.0,5300,19,25,19045
201,-1,95,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,8.8,134.0,5500,18,23,21485
202,-1,95,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,23.0,106.0,4800,26,27,22470


In [60]:
data.setFeatureAndTarget()

In [61]:
data.getData()

Unnamed: 0,highway-mpg,curb-weight,horsepower,engine-size,price
2,30,2337,102.0,109,13950
3,22,2824,115.0,136,17450
5,25,2844,110.0,136,17710
7,20,3086,140.0,131,23875
9,29,2395,101.0,108,16430
...,...,...,...,...,...
199,28,2952,114.0,141,16845
200,25,3049,160.0,141,19045
201,23,3012,134.0,173,21485
202,27,3217,106.0,145,22470


In [62]:
X_train, X_test, Y_train, Y_test = data.getDataSplit()

In [63]:
X_train.shape

(111, 4)

In [64]:
Y_train.shape

(111,)

In [65]:
X_train

array([[  30., 2337.,  102.,  109.],
       [  22., 2824.,  115.,  136.],
       [  25., 2844.,  110.,  136.],
       [  20., 3086.,  140.,  131.],
       [  29., 2395.,  101.,  108.],
       [  29., 2395.,  101.,  108.],
       [  28., 2710.,  121.,  164.],
       [  28., 2765.,  121.,  164.],
       [  53., 1488.,   48.,   61.],
       [  43., 1874.,   70.,   90.],
       [  43., 1909.,   70.,   90.],
       [  41., 1876.,   68.,   90.],
       [  38., 1876.,   68.,   90.],
       [  30., 2128.,  102.,   98.],
       [  38., 1967.,   68.,   90.],
       [  38., 1989.,   68.,   90.],
       [  38., 1989.,   68.,   90.],
       [  30., 2535.,   88.,  122.],
       [  24., 2811.,  145.,  156.],
       [  54., 1713.,   58.,   92.],
       [  38., 1819.,   76.,   92.],
       [  42., 1837.,   60.,   79.],
       [  34., 1940.,   76.,   92.],
       [  34., 1956.,   76.,   92.],
       [  34., 2010.,   76.,   92.],
       [  34., 2024.,   76.,   92.],
       [  33., 2236.,   86.,  110.],
 

In [66]:
Y_train

array([13950, 17450, 17710, 23875, 16430, 16925, 20970, 21105,  5151,
        6295,  6575,  5572,  6377,  7957,  6229,  6692,  7609,  8921,
       12964,  6479,  6855,  5399,  6529,  7129,  7295,  7295,  7895,
        9095,  8845, 10295, 12945, 10345, 32250,  5195,  6095,  6795,
        6695,  7395,  8845,  8495, 10595, 10245, 11245, 18280, 25552,
       28248, 28176, 31600, 35056,  5389,  6189,  6669,  7689,  9959,
        8499,  6989,  8189,  9279,  9279,  5499,  7099,  6649,  6849,
        7349,  7299,  7799,  7499,  7999,  8249,  8949,  9549, 13499,
       14399, 13499, 17199, 19699, 18399, 11900, 13200, 15580, 16900,
       16630, 17950, 18150,  5572,  7957,  6229,  6692,  7609,  8921,
       22018, 11850, 12170, 15040, 15510, 18150, 18620,  5118,  7053,
        7603,  7126,  7775,  9960,  9233, 11259,  7463, 10198,  8013,
       11694,  5348,  6338])

In [67]:
X_test

array([[  38., 2015.,   62.,   92.],
       [  37., 2280.,   62.,   92.],
       [  32., 2290.,   62.,   92.],
       [  32., 3110.,   62.,   92.],
       [  37., 2081.,   70.,   98.],
       [  37., 2109.,   70.,   98.],
       [  36., 2275.,   56.,  110.],
       [  47., 2275.,   56.,  110.],
       [  47., 2094.,   70.,   98.],
       [  34., 2122.,   70.,   98.],
       [  34., 2140.,   70.,   98.],
       [  34., 2169.,   70.,   98.],
       [  34., 2204.,   70.,   98.],
       [  29., 2265.,  112.,   98.],
       [  29., 2300.,  112.,   98.],
       [  30., 2540.,  116.,  146.],
       [  30., 2536.,  116.,  146.],
       [  30., 2551.,  116.,  146.],
       [  30., 2679.,  116.,  146.],
       [  30., 2714.,  116.,  146.],
       [  30., 2975.,  116.,  146.],
       [  34., 2326.,   92.,  122.],
       [  33., 2480.,   73.,  110.],
       [  32., 2414.,   92.,  122.],
       [  32., 2414.,   92.,  122.],
       [  32., 2458.,   92.,  122.],
       [  24., 2976.,  161.,  171.],
 

In [68]:
Y_test

array([ 6488,  6918,  7898,  8778,  6938,  7198,  7898,  7788,  7738,
        8358,  9258,  8058,  8238,  9298,  9538,  8449,  9639,  9989,
       11199, 11549, 17669,  8948, 10698,  9988, 10898, 11248, 16558,
       15998, 15690,  7775,  7975,  7995,  8195,  8495,  9495,  9995,
        9980, 12940, 13415, 15985, 16515, 18420, 18950, 16845, 19045,
       21485, 22470, 22625])

In [75]:
class Model:
  def __init__(self):
    self.weights = [] 
    self.bias = 0

  def update(self, X, Y, Y_pred, alpha):  
    n = len(Y)
    w = 0.0
    b = 0.0
    for i in range(n):
      print("input: ", X[i])
      print("expected: ", Y[i], "predicted: ", Y_pred[i])
      w += -X[i].T*(Y[i]-Y_pred[i])
      b += -(Y[i]-Y_pred[i])
    print("w: ",w)  
    print("b: ",b)  
    self.weights = self.weights - (alpha * ((2*w/float(n))))  
    self.bias = self.bias - (alpha * ((2*b)/float(n)))

  def train(self, X, Y, alpha, epoch):  
    self.weights  = np.random.randn(X.shape[1])
    for _ in range(epoch):
      print(self.weights)
      Y_pred = self.predict(X)
      self.update(X, Y, Y_pred, alpha)
      print(self.weights)
      if _%200 == 0:
        print("epoch: ", _, "loss: ", self.loss(Y, Y_pred))
           
  def  predict(self, X):
    Y_pred=[]
    n = len(X)
    for i in range(n):
        Y_pred.append(X[i].dot(self.weights)+self.bias) 
    print(self.bias)    
    print(Y_pred)    
    return np.array(Y_pred)      

  def loss(self, Y, Y_pred):
      n = len(Y)
      l = 0
      for i in range(n):
          l+=(Y[i]-Y_pred[i])**2
      return (1/n)*l

In [76]:
linearRegression = Model()

In [78]:
linearRegression.train(X_train, Y_train, 0.001, 200)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
expected:  7349 predicted:  1.0802054962510714e+214
input:  [  37. 1951.   69.   97.]
expected:  7299 predicted:  1.0413978037728712e+214
input:  [  37. 2028.   69.   97.]
expected:  7799 predicted:  1.0823319451539865e+214
input:  [  37. 1971.   69.   97.]
expected:  7499 predicted:  1.0520300482874466e+214
input:  [  37. 2037.   69.   97.]
expected:  7999 predicted:  1.0871164551855457e+214
input:  [  37. 2008.   69.   97.]
expected:  8249 predicted:  1.0716997006394112e+214
input:  [  34. 2324.   97.  120.]
expected:  8949 predicted:  1.240858048402257e+214
input:  [  34. 2302.   97.  120.]
expected:  9549 predicted:  1.2291625794362242e+214
input:  [  22. 3095.  152.  181.]
expected:  13499 predicted:  1.6534003911215923e+214
input:  [  22. 3296.  152.  181.]
expected:  14399 predicted:  1.7602544484930753e+214
input:  [  25. 3060.  152.  181.]
expected:  13499 predicted:  1.6348134934329447e+214
input:  [  25. 3071. 

  del sys.path[0]


8499 predicted:  1.218170948725149e+304
input:  [  32. 2365.   88.  122.]
expected:  6989 predicted:  1.2374502615184196e+304
input:  [  32. 2405.   88.  122.]
expected:  8189 predicted:  1.258292761835469e+304
input:  [  30. 2403.  116.  110.]
expected:  9279 predicted:  1.2575070905129803e+304
input:  [  30. 2403.  116.  110.]
expected:  9279 predicted:  1.2575070905129803e+304
input:  [  37. 1889.   69.   97.]
expected:  5499 predicted:  9.884256372928813e+303
input:  [  50. 2017.   55.  103.]
expected:  7099 predicted:  1.0550699820186136e+304
input:  [  37. 1918.   69.   97.]
expected:  6649 predicted:  1.003536450022742e+304
input:  [  37. 1938.   69.   97.]
expected:  6849 predicted:  1.0139577001812667e+304
input:  [  37. 2024.   69.   97.]
expected:  7349 predicted:  1.0587690758629226e+304
input:  [  37. 1951.   69.   97.]
expected:  7299 predicted:  1.0207315127843077e+304
input:  [  37. 2028.   69.   97.]
expected:  7799 predicted:  1.0608533258946276e+304
input:  [  37. 19



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
expected:  20970 predicted:  nan
input:  [  28. 2765.  121.  164.]
expected:  21105 predicted:  nan
input:  [  53. 1488.   48.   61.]
expected:  5151 predicted:  nan
input:  [  43. 1874.   70.   90.]
expected:  6295 predicted:  nan
input:  [  43. 1909.   70.   90.]
expected:  6575 predicted:  nan
input:  [  41. 1876.   68.   90.]
expected:  5572 predicted:  nan
input:  [  38. 1876.   68.   90.]
expected:  6377 predicted:  nan
input:  [  30. 2128.  102.   98.]
expected:  7957 predicted:  nan
input:  [  38. 1967.   68.   90.]
expected:  6229 predicted:  nan
input:  [  38. 1989.   68.   90.]
expected:  6692 predicted:  nan
input:  [  38. 1989.   68.   90.]
expected:  7609 predicted:  nan
input:  [  30. 2535.   88.  122.]
expected:  8921 predicted:  nan
input:  [  24. 2811.  145.  156.]
expected:  12964 predicted:  nan
input:  [  54. 1713.   58.   92.]
expected:  6479 predicted:  nan
input:  [  38. 1819.   76.   92.]
expected