In [44]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
# Alias for the sklearn estimator: the notebook later defines its own class named
# LinearRegression, which shadows the un-aliased import above.
from sklearn.linear_model import LinearRegression as SklearnLinearRegression

In [37]:
class LinearRegression:
    """Linear regression trained with per-sample (stochastic) gradient descent.

    The model is ``y_hat = dot(x, w) + b``. Fitted parameters are NOT stored on
    the instance: ``fit`` returns ``(w, b)`` and the caller passes them back to
    ``predict`` / ``r2_score``.

    NOTE: this class has the same name as ``sklearn.linear_model.LinearRegression``
    and shadows that import once this definition has been executed.
    """

    def step_gradient(self, x, y, learning_rate, w, b):
        """Run one pass over the data, updating the parameters after every sample.

        Args:
            x: 2-D array-like of features, one row per sample.
            y: 1-D array-like of targets aligned with the rows of ``x``.
            learning_rate: step size applied to every update.
            w: current weights, one per feature. The caller's object is not modified.
            b: current intercept.

        Returns:
            ``(w, b)``: the updated weights as a new float ndarray and the
            updated intercept.
        """
        x = np.asarray(x, dtype=float)
        # Positional values only, so a pandas Series with a custom index is safe.
        y = np.asarray(y, dtype=float)
        # Work on a float copy: the caller's weights stay untouched, and an
        # integer-typed input cannot silently truncate the fractional updates.
        w = np.array(w, dtype=float)
        for i in range(len(x)):
            # The error is computed once per sample, before any parameter moves,
            # so the weight and intercept updates all use the same prediction.
            error = y[i] - (np.dot(x[i], w) + b)
            w += learning_rate * error * x[i]
            b += learning_rate * error
        return w, b

    def cost(self, x, y, w, b):
        """Return the mean squared error of the model ``(w, b)`` on ``(x, y)``.

        Raises:
            ValueError: if ``x`` contains no samples (the mean is undefined).
        """
        predictions = self.predict(x, w, b)
        if predictions.size == 0:
            raise ValueError("cost is undefined for an empty dataset")
        residuals = np.asarray(y, dtype=float) - predictions
        return np.mean(residuals ** 2)

    def fit(self, x_train, y_train, learning_rate, maxItr = 100):
        """Fit the weights and intercept by repeated passes of ``step_gradient``.

        Prints the mean squared error before the first and after the last pass.

        Args:
            x_train: 2-D array-like of shape (n_samples, n_features).
            y_train: 1-D array-like of targets.
            learning_rate: step size forwarded to ``step_gradient``.
            maxItr: number of full passes over the training data.

        Returns:
            ``(w, b)``: weights as a float ndarray of length n_features, and the intercept.
        """
        x_train = np.asarray(x_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)
        w = np.zeros(x_train.shape[1])
        b = 0.0
        print('START: ', self.cost(x_train, y_train, w, b))
        for _ in range(maxItr):
            w, b = self.step_gradient(x_train, y_train, learning_rate, w, b)
        print('FINAL: ', self.cost(x_train, y_train, w, b))
        return w, b

    def predict(self, x_test, w, b):
        """Return ``dot(x, w) + b`` for every row of ``x_test`` as a 1-D ndarray."""
        x = np.asarray(x_test, dtype=float)
        if x.size == 0:
            # No samples: nothing to multiply, return an empty prediction vector.
            return np.array([])
        return np.dot(x, np.asarray(w, dtype=float)) + b

    def r2_score(self, x_test, y_test, w, b):
        """Return the coefficient of determination R^2 of the model on ``(x_test, y_test)``.

        ``R^2 = 1 - SS_res / SS_tot``. When ``y_test`` is constant (``SS_tot == 0``)
        the ratio is undefined; in that case 1.0 is returned for perfect
        predictions and 0.0 otherwise, instead of nan / -inf. With no samples
        at all the score is nan.
        """
        y_true = np.asarray(y_test, dtype=float)
        if y_true.size == 0:
            return float('nan')
        y_pred = self.predict(x_test, w, b)
        res = ((y_true - y_pred) ** 2).sum()
        tot = ((y_true - y_true.mean()) ** 2).sum()
        if tot == 0:
            return 1.0 if res == 0 else 0.0
        return 1 - (res / tot)

In [24]:
# Read the pre-split training and test sets; both CSV paths are relative to the
# notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer='AIR QUALITY - TRAIN.csv')
df_test = pd.read_csv(filepath_or_buffer='AIR QUALITY - TEST.csv')

In [34]:
# Split each frame into a feature matrix (one row per sample) and a target vector.
# Features are selected by dropping the `target` column by name, not by assuming
# it is the last column, so the label can never end up among the features.

# training data
x_train = np.array(df_train.drop('target', axis=1))
y_train = np.array(df_train['target'])

# test data
x_test = np.array(df_test.drop('target', axis=1))
y_test = np.array(df_test['target'])

# Both splits must expose the same number of features for the fitted weights to apply.
assert x_train.shape[1] == x_test.shape[1], "train and test feature counts differ"

### Using our custom-built Linear Regression

In [38]:
# Train the custom gradient-descent model on the training split built above
# (learning rate 0.0001, default of 100 passes). The original call used `x` and `y`,
# which are never defined in this notebook and only existed as leftover kernel state.
clf = LinearRegression()
w, b = clf.fit(x_train, y_train, 0.0001)

START:  12256.130414
FINAL:  415.558306116


In [39]:
# R^2 of the custom model on the test split, using the (w, b) returned by fit.
print(clf.r2_score(x_test, y_test, w=w, b=b))

0.969139950243


### Using Linear Regression from sklearn

In [51]:
# Use the aliased sklearn estimator: the bare name `LinearRegression` refers to the
# custom class defined earlier in this notebook, whose fit() requires a learning rate,
# so calling it here fails on a fresh top-to-bottom run.
clf = SklearnLinearRegression()
clf.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [50]:
# R^2 of the sklearn model on the same test split, for comparison with the custom one.
print(clf.score(X=x_test, y=y_test))

0.969148840473
