In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
class LinearRegression:
    """Linear regression fitted by (mini-)batch gradient descent on squared error.

    A bias column of ones is appended to the inputs, so ``weights[:-1]`` are
    the feature coefficients and ``weights[-1]`` is the intercept.

    Parameters
    ----------
    lr : float
        Step size applied to every gradient update.
    epochs : int
        Maximum number of passes over the training data.
    tol : float
        Training stops early once the absolute change in training loss
        between two consecutive epochs is strictly below ``tol``
        (``tol = 0`` therefore never triggers early stopping).

    Attributes
    ----------
    weights : np.ndarray or None
        Current parameter vector (``None`` until ``fit`` is called).
    epoch, train_loss, val_loss, train_weight : list
        Per-epoch history recorded by the most recent ``fit`` call.
    """

    def __init__(self, lr, epochs , tol):
        self.epoch = []
        self.train_loss = []
        self.val_loss = []
        self.weights = None
        self.train_weight = []
        self.lr = lr
        self.epochs = epochs
        self.tol = tol

    def _gradient_step(self, X, y):
        """Apply one in-place gradient update using the rows in ``X`` / ``y``."""
        residual = np.dot(X, self.weights) - y
        grad = np.dot(X.T, residual) / len(X)
        self.weights -= self.lr * grad

    def fit(self, X, y, x_val, y_val, lr=0.001, epochs=100, batch_size=None, tol = 1.0E-05):
        """Train the model and record per-epoch training / validation loss.

        Parameters
        ----------
        X, y : array-like
            Training inputs of shape (n, m) and targets of length n.
        x_val, y_val : array-like
            Validation inputs / targets, used only for the loss history.
        lr, epochs, tol :
            Accepted for backward compatibility but ignored: the values
            passed to the constructor are the ones actually used.
        batch_size : int or None
            ``None`` performs one full-batch update per epoch. Otherwise the
            data is swept in consecutive chunks of ``batch_size`` rows; the
            final chunk may be shorter.

        Raises
        ------
        ValueError
            If ``batch_size`` is given but smaller than 1.
        """
        if batch_size is not None and batch_size < 1:
            raise ValueError("batch_size must be a positive integer or None")

        n = X.shape[0]  # number of samples
        # Append the bias column so the intercept is learned as the last weight.
        X = np.hstack((X, np.ones((n, 1))))
        x_val = np.hstack((x_val, np.ones((x_val.shape[0], 1))))

        # Start every fit with a clean history so repeated calls do not
        # concatenate curves from earlier runs.
        self.epoch = []
        self.train_loss = []
        self.val_loss = []
        self.train_weight = []

        # Random initialisation of the weights (including the bias term).
        self.weights = np.random.randn(X.shape[1])

        for epoch in range(self.epochs):

            if batch_size is not None:
                # Step through every row, including a shorter trailing batch;
                # floor division here would silently skip those rows (and do
                # no update at all when batch_size > n).
                for start in range(0, n, batch_size):
                    stop = start + batch_size
                    self._gradient_step(X[start:stop], y[start:stop])
            else:
                self._gradient_step(X, y)

            self.epoch.append(epoch)
            self.train_loss.append(self.get_loss(X, y))
            # Store a snapshot: the weights are updated in place, so appending
            # the array itself would make every history entry alias the final
            # weights. Note this keeps one small array per epoch in memory.
            self.train_weight.append(self.weights.copy())
            self.val_loss.append(self.get_loss(x_val, y_val))

            if epoch > 3 and abs(self.train_loss[-1] - self.train_loss[-2]) < self.tol:
                print(f'Stop in {epoch} steps')
                break

    def get_loss(self, X, y):
        """Mean squared error for inputs that already contain the bias column."""
        y_pred = np.dot(X, self.weights)
        return np.mean((y_pred - y)**2)

    def predict(self, X):
        """Predict targets for raw inputs ``X`` (the bias column is added here)."""
        X = np.hstack((X, np.ones((X.shape[0], 1))))
        return np.dot(X, self.weights)

    def evaluate(self, X, y):
        """Mean squared error of ``predict(X)`` against ``y``."""
        y_pred = self.predict(X)
        return np.mean((y_pred - y)**2)

    def plot_curve(self):
        """Plot the recorded training loss against the epoch index."""
        plt.plot(self.epoch, self.train_loss)
        plt.xlabel('Epoch')
        plt.ylabel('Mean Squared Error Loss')
        plt.title('Training Curve')
        plt.show()


Load data and map categorical features
=

In [3]:
# Load the train / validation / test splits. `pd.read_csv` already returns a
# DataFrame, so no extra `pd.DataFrame(...)` wrapper is needed.
df_train = pd.read_csv("./regression_train.csv")
df_val   = pd.read_csv("./regression_val.csv")
df_test  = pd.read_csv("./regression_test.csv")

# Integer codes for the categorical columns.
smoker_map = {"no" : 0, "yes" : 1}
sex_map = {"female" : 0, "male" : 1}
region_map = {"northeast" : 0,
              "northwest" : 1,
              "southeast" : 2,
              "southwest" : 3 }

# Apply the same encoding to every split in one place instead of repeating
# the three `.map` calls per frame. Values missing from a mapping become NaN.
categorical_maps = {"sex": sex_map, "smoker": smoker_map, "region": region_map}
for split_df in (df_train, df_val, df_test):
    for column, mapping in categorical_maps.items():
        split_df[column] = split_df[column].map(mapping)

Single-feature regression
=

In [4]:
# Single-feature setup: `bmi` is the only input column and `charges` is the
# regression target. The test split only needs its inputs here.
x_train = df_train.drop(columns=['charges'])[['bmi']].to_numpy()
y_train = df_train.loc[:, 'charges'].to_numpy()

x_val = df_val.drop(columns=['charges'])[['bmi']].to_numpy()
y_val = df_val.loc[:, 'charges'].to_numpy()

x_test = df_test.drop(columns=['charges'])[['bmi']].to_numpy()

In [5]:
# Fit the single-feature model. With tol = 0 the early-stopping test
# (loss change strictly below tol) can never fire, so all epochs are run.
Single_Feature = LinearRegression(lr=1.0E-3, epochs=300000, tol=0)
Single_Feature.fit(x_train, y_train, x_val, y_val)

# The last weight is the bias term; the preceding ones are the coefficients.
print(
    f'Intercepts {Single_Feature.weights[-1]}',
    f'weights =  {Single_Feature.weights[:-1]}',
    sep='\n',
)

Intercepts 1382.5127641092254
weights =  [380.13540621]


In [6]:
# Report the loss values recorded at the last completed epoch.
print(
    'training loss: {:.2e}'.format(Single_Feature.train_loss[-1]),
    'validation loss: {:.2e}'.format(Single_Feature.val_loss[-1]),
    sep='\n',
)

training loss: 1.40e+08
validation loss: 1.37e+08


In [7]:
# Predict the target for the test inputs with the single-feature model.
test_pred = Single_Feature.predict(x_test)

Multi-feature regression
=

In [8]:
# Multi-feature setup: every column except `charges` is an input and
# `charges` is the regression target.
x_train = df_train.drop(columns=['charges']).to_numpy()
y_train = df_train.loc[:, 'charges'].to_numpy()

x_val = df_val.drop(columns=['charges']).to_numpy()
y_val = df_val.loc[:, 'charges'].to_numpy()

x_test = df_test.drop(columns=['charges']).to_numpy()

In [9]:
# Fit on all feature columns. tol = 0 means the early-stopping check never
# triggers, so the full epoch budget is used.
Multi_Feature = LinearRegression(lr=5.0E-4, epochs=1000000, tol=0)
Multi_Feature.fit(x_train, y_train, x_val, y_val)

# Bias is the last weight; report it with the coefficients and final losses.
print(
    f'Intercepts {Multi_Feature.weights[-1]}',
    f'weights =  {Multi_Feature.weights[:-1]}',
    'training loss: {:.2e}'.format(Multi_Feature.train_loss[-1]),
    'validation loss: {:.2e}'.format(Multi_Feature.val_loss[-1]),
    sep='\n',
)
test_pred = Multi_Feature.predict(x_test)

Intercepts -11857.04933116656
weights =  [  259.85082668  -383.5457614    333.33233366   442.55734611
 24032.22066281  -416.01454027]
training loss: 3.47e+07
validation loss: 4.20e+07


My Model
=

In [10]:
# Re-read the raw CSVs so this section starts from unmodified frames.
# `pd.read_csv` already returns a DataFrame; no extra wrapper is required.
df_train = pd.read_csv("./regression_train.csv")
df_val   = pd.read_csv("./regression_val.csv")
df_test  = pd.read_csv("./regression_test.csv")

# Only the two binary columns are integer-coded in this cell.
smoker_map = {"no" : 0, "yes" : 1}
sex_map = {"female" : 0, "male" : 1}

# Same encoding for every split; values missing from a mapping become NaN.
binary_maps = {"sex": sex_map, "smoker": smoker_map}
for split_df in (df_train, df_val, df_test):
    for column, mapping in binary_maps.items():
        split_df[column] = split_df[column].map(mapping)

One-Hot Encoding, Normalization, and Conversion to NumPy
=

In [11]:
# One-hot encode `region`. dtype=float keeps the indicator columns numeric:
# recent pandas versions default to bool, which makes a later `.to_numpy()`
# on the mixed frame return an object array that breaks the float math.
df_train = pd.get_dummies(df_train, columns=["region"], dtype=float)
df_val = pd.get_dummies(df_val, columns=["region"], dtype=float)
df_test = pd.get_dummies(df_test, columns=["region"], dtype=float)

# Each split is encoded independently, so a region absent from one split would
# yield a different column set. Align val/test to the training columns
# (missing indicators become 0) so all feature matrices share one layout.
df_val = df_val.reindex(columns=df_train.columns, fill_value=0.0)
df_test = df_test.reindex(columns=df_train.columns, fill_value=0.0)

# min max normalization (age)
# for all data min is 18 max is 64
AGE_MIN = 18
AGE_RANGE = 46  # 64 - 18
df_train["age"] = (df_train["age"] - AGE_MIN) / AGE_RANGE
df_val["age"] = (df_val["age"] - AGE_MIN) / AGE_RANGE
df_test["age"] = (df_test["age"] - AGE_MIN) / AGE_RANGE

# min max normalization (bmi), using statistics from the training split only
bmi_min = df_train["bmi"].min()
bmi_max = df_train["bmi"].max()

df_train["bmi"] = (df_train["bmi"] - bmi_min) / (bmi_max - bmi_min)
df_val["bmi"] = (df_val["bmi"] - bmi_min) / (bmi_max - bmi_min)
df_test["bmi"] = (df_test["bmi"] - bmi_min) / (bmi_max - bmi_min)

# Disabled experiment, kept as comments so the cell no longer echoes a string:
# min max normalization (charges)
# charges_min = df_train["charges"].min()
# charges_max = df_train["charges"].max()
#
# df_train["charges"] = (df_train["charges"] - charges_min) / (charges_max - charges_min)
# df_val["charges"] = (df_val["charges"] - charges_min) / (charges_max - charges_min)
# df_test["charges"] = (df_test["charges"] - charges_min) / (charges_max - charges_min)

'\n# min max normalization (charges)\ncharges_min = df_train["charges"].min()\ncharges_max = df_train["charges"].max()\n\ndf_train["charges"] = (df_train["charges"] - charges_min) / (charges_max - charges_min)\ndf_val["charges"] = (df_val["charges"] - charges_min) / (charges_max - charges_min)\ndf_test["charges"] = (df_test["charges"] - charges_min) / (charges_max - charges_min)\n'

In [12]:

# Convert the preprocessed frames to arrays: every column except `charges`
# is an input and `charges` is the regression target.
x_train = df_train.drop(columns=['charges']).to_numpy()
y_train = df_train.loc[:, 'charges'].to_numpy()

x_val = df_val.drop(columns=['charges']).to_numpy()
y_val = df_val.loc[:, 'charges'].to_numpy()

x_test = df_test.drop(columns=['charges']).to_numpy()


In [13]:
# Fit on the preprocessed features. tol = 0 means the early-stopping check
# never triggers, so the full epoch budget is used.
MyModel = LinearRegression(lr=5.0E-4, epochs=1000000, tol=0)
MyModel.fit(x_train, y_train, x_val, y_val)

# Bias is the last weight; report it with the coefficients and final losses.
print(
    f'Intercepts {MyModel.weights[-1]}',
    f'weights =  {MyModel.weights[:-1]}',
    'training loss: {:.2e}'.format(MyModel.train_loss[-1]),
    'validation loss: {:.2e}'.format(MyModel.val_loss[-1]),
    sep='\n',
)

Intercepts -2045.3873825040034
weights =  [11909.78031819  -399.73821501 11724.13696648   437.64932999
 24068.28936178   219.36931214  -297.17589767 -1034.30767236
  -929.11629117]
training loss: 3.47e+07
validation loss: 4.21e+07


In [14]:
import numpy as np
# Import scikit-learn's estimator under an alias: importing it as
# `LinearRegression` would shadow the gradient-descent class defined earlier in
# this notebook, and any later `LinearRegression(lr=..., epochs=..., tol=...)`
# call would fail because scikit-learn's estimator has no such parameters.
from sklearn.linear_model import LinearRegression as SkLinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion. By default PolynomialFeatures also emits a
# constant bias column as the first feature.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(x_train)
# Reuse the transformer fitted on the training data rather than re-fitting it
# on the validation split.
X_poly_val = poly.transform(x_val)

# Closed-form least-squares baseline on the expanded training features.
reg = SkLinearRegression().fit(X_poly, y_train)

In [None]:
# Gradient-descent fit on the polynomial features.
# NOTE: `LinearRegression` here must be the notebook's own gradient-descent
# class; scikit-learn's estimator of the same name takes no lr/epochs/tol.
Poly_model = LinearRegression(lr=5.0E-4, epochs=1000000, tol=0)
Poly_model.fit(X=X_poly, y=y_train, x_val=X_poly_val, y_val=y_val)

In [None]:
import numpy as np

def polynomial_features(X, degree):
    """Expand ``X`` with per-feature powers (no cross / interaction terms).

    The result starts with a column of ones. Then, for each power
    ``d = 1 .. degree`` in order, it contains every column of ``X`` raised
    to ``d``, giving ``1 + degree * n_features`` columns in total.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
    degree : int
        Highest power to generate; values below 1 yield only the ones column.

    Returns
    -------
    np.ndarray of shape (n_samples, 1 + degree * n_features)
    """
    n_samples, n_features = X.shape

    # Collect all column blocks first and concatenate once at the end.
    columns = [np.ones((n_samples, 1))]
    for power in range(1, degree + 1):
        columns.extend(
            np.power(X[:, j:j + 1], power) for j in range(n_features)
        )

    return np.hstack(columns)

# Compare scikit-learn's degree-2 expansion on one sample [1, 2]:
# first with interaction terms only, then with the full set of terms.
x = np.array([[1, 2]])
print(x)
for only_interactions in (True, False):
    print(PolynomialFeatures(degree=2, interaction_only=only_interactions).fit_transform(x))