In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split 
import seaborn as sns
from scipy import stats
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.descriptivestats import sign_test
from clint.textui import progress

In [32]:
# dataset = pd.read_csv('data_v1-01.csv')
# X_base = dataset.iloc[:, 0:1].values
# y_base = dataset.iloc[:, 1].values
data = np.genfromtxt('data_v1-01.csv', delimiter=',', skip_header=True)
X_base = data[:, 0]
y_base = data[:, 1]
N = 50

In [38]:
data.shape

(300, 2)

In [34]:
scaler = StandardScaler()
scaler.fit(X_train.reshape(-1, 1))
X_train = scaler.transform(X_train.reshape(-1, 1))
X_test = scaler.transform(X_test.reshape(-1, 1))
target_scaler = StandardScaler()
target_scaler.fit(y_train.reshape(-1, 1))
y_train = target_scaler.transform(y_train.reshape(-1, 1))
y_test = target_scaler.transform(y_test.reshape(-1, 1))

In [135]:
class RegressionModel:
    def __init__(self, model, X_base, y_base, test_size, m):
        self.model = model
        X_train, X_test, y_train, y_test = train_test_split(X_base, y_base, test_size=test_size, random_state=42)
        scaler = StandardScaler()
        scaler.fit(X_train.reshape(-1, 1))
        self.X_train = scaler.transform(X_train.reshape(-1, 1))
        self.X_test = scaler.transform(X_test.reshape(-1, 1))
        target_scaler = StandardScaler()
        target_scaler.fit(y_train.reshape(-1, 1))
        self.y_train = target_scaler.transform(y_train.reshape(-1, 1))
        self.y_test = target_scaler.transform(y_test.reshape(-1, 1))
        self.test_size = test_size
        self.min = np.min(X_train, axis=0)
        self.max = np.max(X_train, axis=0)
        self.m = m
        self.transformed_X_train = self.transform(self.X_train)
        self.transformed_X_test = self.transform(self.X_test)
        self.fit()
    
    def transform(self, X):
        c = np.linspace(self.min, self.max, self.m)
        b = np.full(c.shape, 1)
        transformed = c[0]*X+b
        for i in range(1, self.m):
            transformed = np.c_[transformed, c[i]*(X**i)]
        if (self.m == 1):
            return transformed.reshape(-1, 1)
        return transformed
    
    
    def fit(self):
        self.model.fit(self.transformed_X_train, self.y_train)
    
    def predict(self, X):
        return self.model.predict(X)
    
    def predict_train(self):
        return self.model.predict(self.transformed_X_train)
    
    def predict_test(self):
        return self.model.predict(self.transformed_X_test)
    
    def mse_train(self):
        y_pred = self.predict_train()
        return mean_squared_error(self.y_train, y_pred)
    
    def mse_test(self):
        y_pred = self.predict_test()
        return mean_squared_error(self.y_test, y_pred)
    
    def bias_train(self):
        return self.y_train - self.predict_train()
    
    def bias_test(self):
        return self.y_test - self.predict_test()

In [90]:
def prepare_plot_xy(px, py):
    to_plot = sorted([(x, y) for x, y in zip(px, py)], key=lambda a: a[0][0])
    x_plot = [i[0] for i in to_plot]
    y_plot = [i[1] for i in to_plot]
    return (x_plot, y_plot)

In [136]:
mse_test = []
mse_train = []
for i in range(1, N+1):
    lin_regr = linear_model.LinearRegression()
    lin_model = RegressionModel(lin_regr, X_base, y_base, test_size=0.3, m=i)
    mse_test.append(lin_model.mse_test())
    mse_train.append(lin_model.mse_train())

f = plt.figure()
plt.plot(range(1, N+1), mse_test, label='mse_test', color='blue')
plt.plot(range(1, N+1), mse_train, label='mse_train', color='green')
plt.yscale('log')
#plt.plot(range(2, 9), mse[2], label='mse')
plt.title(f'MSE (number of regressors)')
plt.xlabel('m')
plt.ylabel('Mse log')
plt.legend()
plt.savefig(f'MSE_of_m.png',dpi=300)
plt.close(f)

In [92]:


for m in [1, 4, 15]:
    lin_regr = linear_model.LinearRegression()
    lin_model = RegressionModel(lin_regr, X_base, y_base, test_size=0.3, m=m)
    f = plt.figure()
    plt.scatter(lin_model.X_test, lin_model.y_test, color='red')
    plt.scatter(lin_model.X_train, lin_model.y_train, color='red')
    colors =['blue', 'green', 'purple']
    tmp = pd.DataFrame(np.c_[lin_model.X_test, lin_model.predict_test()], columns=['X', 'y']).sort_values(by=['X'])
    plt.plot(tmp['X'], tmp['y'], color='blue', label='test')
    tmp = pd.DataFrame(np.c_[lin_model.X_train, lin_model.predict_train()], columns=['X', 'y']).sort_values(by=['X'])
    plt.plot(tmp['X'], tmp['y'], color='green', label='train')
    plt.title(f'Linear Regression m={m}')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.legend()
    plt.savefig(f'xy_m{m}.png',dpi=300)
    plt.close(f)

In [142]:

for i in [1, 4, 15]:
    mse_test = []
    mse_train = []
    test_sizes = [0.1,0.2,0.3,0.4,0.5]
    for test_size in test_sizes:
        lin_regr = linear_model.LinearRegression()
        lin_model = RegressionModel(lin_regr, X_base, y_base, test_size=test_size, m=i)
        mse_test.append(lin_model.mse_test())
        mse_train.append(lin_model.mse_train())

    f = plt.figure()
    plt.plot(test_sizes, mse_test, label='mse_test', color='blue')
    plt.plot(test_sizes, mse_train, label='mse_train', color='green')
    plt.yscale('log')
    #plt.plot(range(2, 9), mse[2], label='mse')
    plt.title(f'MSE (test_size) m={i}')
    plt.xlabel('test_size')
    plt.ylabel('Mse log')
    plt.legend()
    plt.savefig(f'MSE_of_m{i}.png',dpi=300)
    plt.close(f)

In [143]:

res_var_test = {}
res_var_train = {}
bias_test = {}
bias_train = {}
for i in range(100):
    lin_regr = linear_model.LinearRegression()
    lin_model = RegressionModel(lin_regr, X_base, y_base, test_size=test_size, m=4)
    y_pred_test = lin_model.predict_test()
    for x, y in zip(lin_model.X_test, y_pred_test):
        if x[0] in res_var_test:
            res_var_test[x[0]].append(y[0])
        else:
            res_var_test.update({x[0]:[y[0]]})
    y_pred_train = lin_model.predict_train()
    for x, y in zip(lin_model.X_train, y_pred_train):
        if x[0] in res_var_train:
            res_var_train[x[0]].append(y[0])
        else:
            res_var_train.update({x[0]:[y[0]]})
    
    for x, y in zip(lin_model.X_test, lin_model.bias_test()):
        if x[0] in bias_test:
            bias_test[x[0]].append(y[0])
        else:
            bias_test.update({x[0]:[y[0]]})
            
    for x, y in zip(lin_model.X_train, lin_model.bias_train()):
        if x[0] in bias_train:
            bias_train[x[0]].append(y[0])
        else:
            bias_train.update({x[0]:[y[0]]})

tmp_res = [(x, np.var(y)) for x, y in sorted(res_var_test.items())]
tmp_var_test = pd.DataFrame(tmp_res, columns=['X', 'y']).sort_values(by=['X'])
tmp_res = [(x, np.var(y)) for x, y in sorted(res_var_train.items())]
tmp_var_train = pd.DataFrame(tmp_res, columns=['X', 'y']).sort_values(by=['X'])
tmp_res = [(x, np.average(y)) for x, y in sorted(bias_test.items())]
tmp_bias_test = pd.DataFrame(tmp_res, columns=['X', 'y']).sort_values(by=['X'])
tmp_res = [(x, np.average(y)) for x, y in sorted(bias_train.items())]
tmp_bias_train = pd.DataFrame(tmp_res, columns=['X', 'y']).sort_values(by=['X'])

f = plt.figure()
plt.plot(tmp_var_test['X'], tmp_var_test['y'], label='var_test', color='blue')
plt.plot(tmp_var_train['X'], tmp_var_train['y'], label='var_train', color='green')
plt.yscale('log')
#plt.plot(range(2, 9), mse[2], label='mse')
plt.title(f'Variance (x)')
plt.xlabel('x')
plt.ylabel('Variance log')
plt.legend()
plt.savefig(f'var.png',dpi=300)
plt.close(f)


f = plt.figure()
plt.plot(tmp_bias_test['X'], tmp_bias_test['y'], label='bias_test', color='blue')
plt.plot(tmp_bias_train['X'], tmp_bias_train['y'], label='bias_train', color='green')
#plt.plot(range(2, 9), mse[2], label='mse')
plt.title(f'Bias (x)')
plt.xlabel('x')
plt.ylabel('Bias')
plt.legend()
plt.savefig(f'bias.png',dpi=300)
plt.close(f)


f = plt.figure()
plt.hist(tmp_var_test['y'], 10,density=True, facecolor='blue')
plt.title(f'Variance test')
plt.savefig(f'Bar_var_test.png')
plt.close(f)

f = plt.figure()
plt.hist(tmp_var_train['y'], 10,density=True, facecolor='blue')
plt.title(f'Variance train')
plt.savefig(f'Bar_var_train.png')
plt.close(f)


f = plt.figure()
plt.hist(tmp_bias_test['y'], 10,density=True, facecolor='blue')
plt.title(f'Bias test')
plt.savefig(f'Bar_bias_test.png')
plt.close(f)

f = plt.figure()
plt.hist(tmp_bias_train['y'], 10,density=True, facecolor='blue')
plt.title(f'Bias train')
plt.savefig(f'Bar_bias_train.png')
plt.close(f)

In [141]:
x_num = 40
X = X_base[x_num].reshape(-1,1)
var = []
bias = []
for m in progress.bar(range(1, N+1)):
    print(m)
    y_pred = []
    for i in range(100):
        lin_regr = linear_model.LinearRegression()
        lin_model = RegressionModel(lin_regr, X_base, y_base, test_size=0.3, m=m)
        y_pred.append(lin_model.predict(lin_model.transform(X)))
    var.append(np.var(y_pred))
    bias.append(np.average(y_pred))

    
f = plt.figure()
plt.plot(range(1, N+1), var, color='blue')
plt.yscale('log')
#plt.plot(range(2, 9), mse[2], label='mse')
plt.title(f'Variance (m) x{X}')
plt.xlabel('m')
plt.ylabel('Variance log')
plt.legend()
plt.savefig(f'var_x.png',dpi=300)
plt.close(f)


f = plt.figure()
plt.plot(range(1, N+1), bias, color='blue')
#plt.plot(range(2, 9), mse[2], label='mse')
plt.title(f'Bias (m) x{X}')
plt.xlabel('m')
plt.ylabel('Bias')
plt.legend()
plt.savefig(f'bias_x.png',dpi=300)
plt.close(f)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


No handles with labels found to put in legend.
No handles with labels found to put in legend.
