In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import math

%matplotlib inline

ModuleNotFoundError: No module named 'sklearn'

### Linear Regression

In [None]:
sns.set()

In [None]:
x = np.array([1,2,2,3]).reshape(-1,1)
y = np.array([1,1,2,2])

In [None]:
model = LinearRegression().fit(x, y)

In [None]:
a0 = model.coef_
b0 = model.intercept_

In [None]:
print(a0[0], b0)

In [None]:
def func(a,b):
    """Sum of squared residuals of the line y = a*x + b on the points (1,1), (2,1), (2,2), (3,2)."""
    # One residual per data point, in the same order as the data arrays.
    r1 = 1 - a - b
    r2 = 1 - 2*a - b
    r3 = 2 - 2*a - b
    r4 = 2 - 3*a - b
    return r1**2 + r2**2 + r3**2 + r4**2

In [None]:
def grad(a,b):
    """Gradient of the squared-error loss on (1,1), (2,1), (2,2), (3,2).

    Returns the tuple (d/da, d/db) for the line y = a*x + b.
    """
    # Residuals y_i - a*x_i - b for the four data points.
    r1 = 1 - a - b
    r2 = 1 - 2*a - b
    r3 = 2 - 2*a - b
    r4 = 2 - 3*a - b
    # d/da weights each residual by -2*x_i; d/db weights each by -2.
    d_a = -2*r1 - 4*r2 - 4*r3 - 6*r4
    d_b = -2*r1 - 2*r2 - 2*r3 - 2*r4
    return d_a, d_b

In [None]:
def plotting(iteration, a, b):
    """Draw one gradient-descent frame and save it as ./img/it_<iteration>.png.

    The frame shows the four data points, the current candidate line
    y = a*x + b (dashed green) and the OLS line taken from the notebook-level
    ``a0`` / ``b0`` (red).

    Parameters
    ----------
    iteration : int
        Iteration number; used in the title and in the output file name.
    a, b : float
        Slope and intercept of the current candidate line.
    """
    import os  # local import: only needed to make sure the output folder exists

    fig = plt.figure(figsize=(8,6))
    x_1 = [1,2,2,3]
    y_1 = [1,1,2,2]
    # Each artist carries its own label: a positional label list is matched to
    # artists in matplotlib's internal order, which is not the same in every
    # version, so the legend could end up mislabelled.
    plt.scatter(x_1, y_1, color='b', label='data')
    x = np.linspace(0, 4, 100)
    y = a*x + b
    plt.plot(x, y, linestyle='dashed', color='g', label='$y=ax+b$')
    plt.plot(x, a0*x+b0, color='r', label='OLS regression')
    plt.legend(loc='upper right')
    plt.xlim([0, 4])
    plt.ylim([0, 3])
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Iteration {}'.format(iteration))
    # savefig does not create missing directories.
    os.makedirs('./img', exist_ok=True)
    plt.savefig('./img/it_{}.png'.format(iteration))
    # This is called once per iteration; close the figure so hundreds of
    # figures do not stay open in memory. The saved PNG is the output.
    plt.close(fig)

In [None]:
def GD(initial, alpha, diff, max_iter=None):
    """Minimise ``func`` by fixed-step gradient descent, saving one plot per step.

    Parameters
    ----------
    initial : sequence of two numbers
        Starting point (a, b).
    alpha : float
        Step size (learning rate).
    diff : float
        Stop once the absolute change of ``func`` between two consecutive
        iterates is no longer greater than this value.
    max_iter : int or None, optional
        Upper bound on the number of iterations. ``None`` (default) keeps the
        original behaviour of iterating until the tolerance is met.

    Returns
    -------
    ([a, b], func(a, b), n_iter)
        Final point, loss at that point, and number of iterations performed.
    """
    a_cur, b_cur = initial[0], initial[1]
    loss_cur = func(a_cur, b_cur)
    error = diff + 1  # guarantees the first iteration runs
    n_iter = 0
    while error > diff and (max_iter is None or n_iter < max_iter):
        # Evaluate the gradient once per step and reuse both components.
        grad_a, grad_b = grad(a_cur, b_cur)
        a_cur, b_cur = a_cur - alpha*grad_a, b_cur - alpha*grad_b
        loss_new = func(a_cur, b_cur)
        error = abs(loss_new - loss_cur)
        loss_cur = loss_new
        n_iter += 1
        plotting(n_iter, a=a_cur, b=b_cur)
    return [a_cur, b_cur], loss_cur, n_iter

In [None]:
# Run gradient descent from (a, b) = (-5, 5) and report the rounded result.
result = GD([-5, 5], 0.015, 0.00001)
minimum_point = (round(result[0][0], 2), round(result[0][1], 2))
print("Point of minimum: ", minimum_point,
      "\nMin = ", round(result[1], 2),
      "\nN iterations = ", result[2])

In [None]:
import imageio
# GD() saved one frame per iteration (it_1.png ... it_<n_iter>.png), so take the
# frame count from its result instead of hard-coding it; a different start
# point, step size or tolerance would otherwise break this cell.
n_frames = result[2]
filenames = ['./img/it_{}.png'.format(iteration) for iteration in range(1, n_frames + 1)]
images = [imageio.imread(filename) for filename in filenames]
imageio.mimsave('movie.gif', images)

### Polynomial Regression

In [None]:
X = [1,3,4,2,5,1,5,6]
Y = [1,2,5,5,3,1,1,2]

In [None]:
x = np.array(X).reshape((-1, 1))
y = np.array(Y)

In [None]:
x

In [None]:
y

In [None]:
plt.figure(figsize=(8,6))
plt.scatter(X,Y,color='b')
plt.ylim([-7,7])
plt.show()

In [None]:
scores = pd.DataFrame()

In [None]:
model = LinearRegression().fit(x, y)

In [None]:
y_pred_r1 = model.predict(x)

In [None]:
y_pred_r1

In [None]:
r_sq = model.score(x, y)
r_sq

In [None]:
a = model.coef_
b = model.intercept_

In [None]:
print(a,b)

In [None]:
# Evaluate the fitted line on a coarse grid, then draw it over the training points.
x_r1 = np.linspace(0, 7, num=10)
y_r1 = a*x_r1 + b
plt.figure(figsize=(8, 6))
plt.scatter(X, Y, color='b')
plt.plot(x_r1, y_r1, color='g', linestyle='dashed')
plt.ylim([-7, 7])
plt.show()

In [None]:
math.sqrt(mean_squared_error(y, y_pred_r1))

In [None]:
# Start the score table with the degree-1 fit. Building the frame directly
# (instead of prepending to whatever `scores` currently holds) makes the cell
# idempotent on re-run and gives it a clean 0-based index, matching the
# ignore_index=True appends used for the higher degrees.
scores = pd.DataFrame({'degree': [1], 'train_rmse': [math.sqrt(mean_squared_error(y, y_pred_r1))]})
scores

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
transformer = PolynomialFeatures(degree=2, include_bias=False)
transformer.fit(x)

In [None]:
x_2 = transformer.transform(x)

In [None]:
model2 = LinearRegression().fit(x_2, y)

In [None]:
model2.coef_

In [None]:
model2.intercept_

In [None]:
# coef_ is ordered by increasing power: [x, x^2]; the intercept is stored separately.
b2, a2 = model2.coef_
c2 = model2.intercept_

In [None]:
# Linear vs. quadratic fit over the training points.
# Labels are attached to each artist: a positional label list is matched to
# artists in matplotlib's internal order, which can mislabel the scatter.
plt.figure(figsize=(8,6))
plt.scatter(X,Y,color='b', label='data')
x_r2 = np.linspace(0, 7, 100)
y_r2 = a[0]*x_r2+b
y_2 = a2*(x_r2**2) + b2*x_r2 + c2
plt.plot(x_r2, y_r2, linestyle='dashed', color='g', label='linear regression')
plt.plot(x_r2, y_2, linestyle='dashed', color='red', label='quadratic function')
plt.ylim([-7,7])
plt.legend()
plt.show()

In [None]:
r_sq_2 = model2.score(x_2, y)
r_sq_2

In [None]:
math.sqrt(mean_squared_error(y, model2.predict(x_2)))

In [None]:
scores = pd.concat([scores, pd.DataFrame({'degree': [2], 'train_rmse':[math.sqrt(mean_squared_error(y, model2.predict(x_2)))]})],
                   axis=0, ignore_index=True)
scores

In [None]:
transformer = PolynomialFeatures(degree=5, include_bias=False)
transformer.fit(x)

In [None]:
x_5 = transformer.transform(x)

In [None]:
x_5

In [None]:
model5 = LinearRegression().fit(x_5, y)

In [None]:
model5.coef_

In [None]:
model5.intercept_

In [None]:
# coef_ is ordered by increasing power (x, x^2, ..., x^5); a5 multiplies x^5.
e5, d5, c5, b5, a5 = model5.coef_
f5 = model5.intercept_

In [None]:
# Linear, quadratic and degree-5 fits over the training points.
# Labels are attached to each artist so the legend cannot be mismatched by
# matplotlib's internal artist ordering.
plt.figure(figsize=(8,6))
plt.scatter(X,Y,color='b', label='data')
x_r5 = np.linspace(0, 7, 100)
y_r5 = a[0]*x_r5+b
y_2 = a2*(x_r5**2) + b2*x_r5 + c2
y_5 = a5*(x_r5**5) + b5*(x_r5**4) + c5*(x_r5**3) + d5*(x_r5**2) + e5*x_r5 + f5
plt.plot(x_r5, y_r5, linestyle='dashed', color='g', label='linear regression')
plt.plot(x_r5, y_2, linestyle='dashed', color='red', label='quadratic function')
plt.plot(x_r5, y_5, linestyle='dashed', color='grey', label='polynomial of 5 degree')
plt.legend()
plt.ylim([-7,7])
plt.show()

In [None]:
r_sq_5 = model5.score(x_5, y)
r_sq_5

In [None]:
math.sqrt(mean_squared_error(y, model5.predict(x_5)))

In [None]:
scores = pd.concat([scores, pd.DataFrame({'degree': [5], 'train_rmse':[math.sqrt(mean_squared_error(y, model5.predict(x_5)))]})],
                   axis=0, ignore_index=True)
scores

In [None]:
import statsmodels.api as sm

In [None]:
# Prepend the constant (intercept) column expected by statsmodels' OLS.
# The earlier x.copy() was dead code: its result was overwritten immediately.
x_stat = sm.add_constant(x)

In [None]:
model_stat = sm.OLS(y,x_stat)

In [None]:
results = model_stat.fit()

In [None]:
results.summary()

In [None]:
model_r = Ridge(alpha=10).fit(x_5, y)

In [None]:
model_r.coef_

In [None]:
model_r.intercept_

In [None]:
# Ridge coefficients by increasing power (x ... x^5); ar multiplies x^5.
er, dr, cr, br, ar = model_r.coef_
fr = model_r.intercept_

In [None]:
model_l = Lasso(alpha=10).fit(x_5, y)

In [None]:
model_l.coef_

In [None]:
model_l.intercept_

In [None]:
# Lasso coefficients by increasing power (x ... x^5); al multiplies x^5.
el, dl, cl, bl, al = model_l.coef_
fl = model_l.intercept_

In [None]:
# Unregularised fits (degree 1, 2, 5) against Ridge and Lasso on degree-5 features.
# Labels are attached to each artist so the legend cannot be mismatched by
# matplotlib's internal artist ordering.
plt.figure(figsize=(8,6))
plt.scatter(X,Y,color='b', label='data')
x_r5 = np.linspace(0, 7, 100)
y_r5 = a[0]*x_r5+b
y_2 = a2*(x_r5**2) + b2*x_r5 + c2
y_5 = a5*(x_r5**5) + b5*(x_r5**4) + c5*(x_r5**3) + d5*(x_r5**2) + e5*x_r5 + f5
y_r = ar*(x_r5**5) + br*(x_r5**4) + cr*(x_r5**3) + dr*(x_r5**2) + er*x_r5 + fr
y_l = al*(x_r5**5) + bl*(x_r5**4) + cl*(x_r5**3) + dl*(x_r5**2) + el*x_r5 + fl
plt.plot(x_r5, y_r5, linestyle='dashed', color='g', label='linear regression')
plt.plot(x_r5, y_2, linestyle='dashed', color='red', label='quadratic function')
plt.plot(x_r5, y_5, linestyle='dashed', color='grey', label='polynomial of 5 degree')
plt.plot(x_r5, y_r, linestyle='dashed', color='magenta', label='Ridge')
plt.plot(x_r5, y_l, linestyle='dashed', color='orange', label='Lasso')
plt.legend()
plt.ylim([-7,7])
plt.show()

In [None]:
r_sq_r = model_r.score(x_5, y)
r_sq_r

In [None]:
x_test = np.array([1, 3, 6.5]).reshape(-1,1)
y_test = np.array([4, 6, -2])

In [None]:
# All fitted curves together with the training points and the held-out test points.
# Labels are attached to each artist so the legend cannot be mismatched by
# matplotlib's internal artist ordering.
plt.figure(figsize=(8,6))
plt.scatter(X,Y,color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
x_r5 = np.linspace(0, 7, 100)
y_r5 = a[0]*x_r5+b
y_2 = a2*(x_r5**2) + b2*x_r5 + c2
y_5 = a5*(x_r5**5) + b5*(x_r5**4) + c5*(x_r5**3) + d5*(x_r5**2) + e5*x_r5 + f5
y_r = ar*(x_r5**5) + br*(x_r5**4) + cr*(x_r5**3) + dr*(x_r5**2) + er*x_r5 + fr
y_l = al*(x_r5**5) + bl*(x_r5**4) + cl*(x_r5**3) + dl*(x_r5**2) + el*x_r5 + fl
plt.plot(x_r5, y_r5, linestyle='dashed', color='g', label='linear regression')
plt.plot(x_r5, y_2, linestyle='dashed', color='red', label='quadratic function')
plt.plot(x_r5, y_5, linestyle='dashed', color='grey', label='polynomial of 5 degree')
plt.plot(x_r5, y_r, linestyle='dashed', color='magenta', label='Ridge')
plt.plot(x_r5, y_l, linestyle='dashed', color='orange', label='Lasso')
plt.legend()
plt.ylim([-7,8])
plt.show()

### Problem

1) In a loop, find the _train_ and _test_ _**rmse**_ for the training data and test data defined above.  
2) Draw a graph of _train_ vs _test_ _**rmse**_ with complexity of model (polynomial degree) as independent variable.  
3) Fit Ridge, Lasso and ElasticNet regression and compare results.  
4) Make conclusions.  
See https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html  
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html    
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html  
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html  

In [None]:
# Train/test RMSE for every (polynomial degree, regressor) pair.
# Rows are collected in a list and turned into a DataFrame once, instead of
# re-concatenating the whole table on every iteration.
records = []
for degree in range(1, 6):
    transformer = PolynomialFeatures(degree=degree, include_bias=False)
    x_ = transformer.fit_transform(x)
    x_t = transformer.transform(x_test)
    # Fresh estimators for every degree.
    # NOTE(review): ElasticNet's alpha is left at the library default while
    # Ridge and Lasso use alpha=10 -- confirm this asymmetry is intended.
    regressors = {'lr': LinearRegression(), 'ridge': Ridge(alpha=10), 'lasso': Lasso(alpha=10),
                  'elasticnet': ElasticNet(l1_ratio=0.5)}
    for Name, Regressor in regressors.items():
        model = Regressor.fit(x_, y)
        train_pr = model.predict(x_)
        test_pr = model.predict(x_t)
        records.append({'complexity': degree,
                        'train_rmse': math.sqrt(mean_squared_error(y, train_pr)),
                        'test_rmse': math.sqrt(mean_squared_error(y_test, test_pr)),
                        'model': Name})
res = pd.DataFrame(records, columns=['complexity', 'train_rmse', 'test_rmse', 'model'])

In [None]:
res

In [None]:
def plot_rmse(res=None, mod='lr'):
    """Plot train and test RMSE against polynomial degree for one model.

    Parameters
    ----------
    res : pandas.DataFrame or None, optional
        Table with columns ``model``, ``complexity``, ``train_rmse`` and
        ``test_rmse``. When omitted, the notebook-level ``res`` is looked up at
        call time, so the plot follows the table if it is rebuilt (a
        ``res=res`` default would stay bound to the object that existed when
        this function was defined).
    mod : str, optional
        Value of the ``model`` column to plot; also used, upper-cased, in the title.
    """
    if res is None:
        res = globals()['res']
    rows = res[res.model == mod]
    plt.figure(figsize=(8,6))
    # '-o' only: the colour comes from the keyword. The former '-ok' also
    # requested black, which matplotlib reports as a redundant definition.
    plt.plot(rows.complexity.values, rows.train_rmse.values, '-o', color='b')
    plt.plot(rows.complexity.values, rows.test_rmse.values, '-o', color='r')
    plt.ylim([0,10])
    plt.xlabel('Polynomial degree')
    plt.ylabel('RMSE')
    plt.legend(['train', 'test'])
    plt.title('Performance of the {} on train and test data'.format(mod.upper()))
    plt.show()

In [None]:
plot_rmse()

In [None]:
plot_rmse(mod='ridge')

In [None]:
plot_rmse(mod='lasso')

In [None]:
plot_rmse(mod='elasticnet')

### K-nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(x, y) 

In [None]:
test_pr_knn = neigh.predict(x_test)
train_knn = neigh.predict(x)

In [None]:
math.sqrt(mean_squared_error(y_test,test_pr_knn))

In [None]:
math.sqrt(mean_squared_error(y,train_knn))

In [None]:
# Train/test RMSE of k-NN regression for k = 1..4.
# Rows are collected first and the DataFrame is built once.
knn_records = []
for i in range(1,5):
    model = KNeighborsRegressor(n_neighbors=i)
    model.fit(x, y)
    train_pr = model.predict(x)
    test_pr = model.predict(x_test)
    knn_records.append({'k': i,
                        'train_rmse': math.sqrt(mean_squared_error(y, train_pr)),
                        'test_rmse': math.sqrt(mean_squared_error(y_test, test_pr))})
res_knn = pd.DataFrame(knn_records, columns=['k', 'train_rmse', 'test_rmse'])

In [None]:
res_knn

In [None]:
# Prediction curves of k-NN regressors (k = 1..4) on a dense grid.
# The fitted models live in a dict rather than in model1..model4, so the
# quadratic LinearRegression stored in `model2` earlier is not overwritten.
x_linsp = np.linspace(0,7,100).reshape(-1,1)
knn_by_k = {k: KNeighborsRegressor(n_neighbors=k).fit(x, y) for k in (1, 2, 3, 4)}
y_knn_1 = knn_by_k[1].predict(x_linsp)
y_knn_2 = knn_by_k[2].predict(x_linsp)
y_knn_3 = knn_by_k[3].predict(x_linsp)
y_knn_4 = knn_by_k[4].predict(x_linsp)

In [None]:
# k-NN prediction curves over the training and test points.
# Labels are attached to each artist so the legend cannot be mismatched by
# matplotlib's internal artist ordering.
plt.figure(figsize=(8,6))
plt.scatter(X,Y,color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
x_knn = np.linspace(0, 7, 100)
plt.plot(x_knn, y_knn_1, linestyle='dashed', color='orange', label='knn-1')
plt.plot(x_knn, y_knn_2, linestyle='dashed', color='green', label='knn-2')
plt.plot(x_knn, y_knn_3, linestyle='dashed', color='red', label='knn-3')
plt.plot(x_knn, y_knn_4, linestyle='dashed', color='grey', label='knn-4')
plt.legend()
plt.ylim([-5,8])
plt.show()

In [None]:
# Train vs. test RMSE of k-NN as a function of k.
# '-o' only: the colour comes from the keyword ('-ok' also requested black,
# which matplotlib reports as a redundant definition).
plt.figure(figsize=(8,6))
plt.plot(res_knn.k.values, res_knn.train_rmse.values, '-o', color='b')
plt.plot(res_knn.k.values, res_knn.test_rmse.values, '-o', color='r')
plt.ylim([0,5])
plt.xlabel('# of neighbors (k)')
plt.ylabel('RMSE')
plt.legend(['train', 'test'])
plt.title('Performance of the KNN on train and test data')
plt.show()

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Depth-2 regression tree: fit on the training points, then report train/test RMSE.
dt = DecisionTreeRegressor(max_depth=2).fit(x, y)
train_pred = dt.predict(x)
test_pred = dt.predict(x_test)
rmse_train = math.sqrt(mean_squared_error(y, train_pred))
rmse_test = math.sqrt(mean_squared_error(y_test, test_pred))
print('rmse on train', rmse_train)
print('rmse on test', rmse_test)

In [None]:
# Prediction curves of regression trees with max_depth 2, 3 and 4 on a dense grid.
# The fitted trees live in a dict rather than in model1..model3, so models
# stored under those names by earlier cells are not overwritten.
x_linsp = np.linspace(0,7,100).reshape(-1,1)
dt_by_depth = {depth: DecisionTreeRegressor(max_depth=depth).fit(x, y) for depth in (2, 3, 4)}
y_dt_1 = dt_by_depth[2].predict(x_linsp)
y_dt_2 = dt_by_depth[3].predict(x_linsp)
y_dt_3 = dt_by_depth[4].predict(x_linsp)

In [None]:
# Decision-tree prediction curves over the training and test points.
# Labels are attached to each artist so the legend cannot be mismatched by
# matplotlib's internal artist ordering.
plt.figure(figsize=(8,6))
plt.scatter(X,Y,color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
x_dt = np.linspace(0, 7, 100)
plt.plot(x_dt, y_dt_1, linestyle='dashed', color='orange', label='max_depth=2')
plt.plot(x_dt, y_dt_2, linestyle='dashed', color='green', label='max_depth=3')
plt.plot(x_dt, y_dt_3, linestyle='dashed', color='red', label='max_depth=4')
plt.legend()
plt.ylim([-5,8])
plt.show()

In [None]:
# Train/test RMSE of regression trees for max_depth = 1..4.
# Rows are collected first and the DataFrame is built once.
dt_records = []
for i in range(1,5):
    model = DecisionTreeRegressor(max_depth=i)
    model.fit(x, y)
    train_pr = model.predict(x)
    test_pr = model.predict(x_test)
    dt_records.append({'max_depth': i,
                       'train_rmse': math.sqrt(mean_squared_error(y, train_pr)),
                       'test_rmse': math.sqrt(mean_squared_error(y_test, test_pr))})
res_dt = pd.DataFrame(dt_records, columns=['max_depth', 'train_rmse', 'test_rmse'])

In [None]:
# Train vs. test RMSE of the decision tree as a function of max_depth.
# '-o' only: the colour comes from the keyword ('-ok' also requested black,
# which matplotlib reports as a redundant definition).
plt.figure(figsize=(8,6))
plt.plot(res_dt.max_depth.values, res_dt.train_rmse.values, '-o', color='b')
plt.plot(res_dt.max_depth.values, res_dt.test_rmse.values, '-o', color='r')
plt.ylim([0,5])
plt.xlabel('max_depth')
plt.ylabel('RMSE')
plt.legend(['train', 'test'])
plt.title('Performance of the DT on train and test data')
plt.show()

### Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seeded 6-tree random forest: fit on the training points, then report train/test RMSE.
rf = RandomForestRegressor(n_estimators=6, random_state=40).fit(x, y)
train_pred = rf.predict(x)
test_pred = rf.predict(x_test)
rmse_train = math.sqrt(mean_squared_error(y, train_pred))
rmse_test = math.sqrt(mean_squared_error(y_test, test_pred))
print('rmse on train', rmse_train)
print('rmse on test', rmse_test)

In [None]:
# Prediction curves of depth-3 random forests with 4, 5 and 6 trees.
# Each forest is seeded (same seed as the `rf` model fitted earlier) so the
# curves are reproducible across runs.
rf1 = RandomForestRegressor(n_estimators=4, max_depth=3, random_state=40).fit(x,y)
y_rf_1 = rf1.predict(x_linsp)
rf2 = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=40).fit(x,y)
y_rf_2 = rf2.predict(x_linsp)
rf3 = RandomForestRegressor(n_estimators=6, max_depth=3, random_state=40).fit(x,y)
y_rf_3 = rf3.predict(x_linsp)

In [None]:
# Random-forest prediction curves next to a single decision tree (y_dt_2).
# Labels are attached to each artist so the legend cannot be mismatched by
# matplotlib's internal artist ordering.
plt.figure(figsize=(10,8))
plt.scatter(X,Y,color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
x_ = np.linspace(0, 7, 100)
plt.plot(x_, y_rf_1, linestyle='dashed', color='orange', label='RF_4trees')
plt.plot(x_, y_rf_2, linestyle='dashed', color='green', label='RF_5trees')
plt.plot(x_, y_rf_3, linestyle='dashed', color='grey', label='RF_6trees')
plt.plot(x_, y_dt_2, linestyle='-', color='red', label='SingleDT')
plt.legend()
plt.ylim([-5,8])
plt.show()

In [None]:
# Train/test RMSE of random forests of increasing size.
# The forests are seeded so the table is reproducible; rows are collected
# first and the DataFrame is built once.
rf_records = []
for i in (3,5,8,10,20):
    model = RandomForestRegressor(n_estimators=i, random_state=40)
    model.fit(x, y)
    train_pr = model.predict(x)
    test_pr = model.predict(x_test)
    rf_records.append({'n_estimators': i,
                       'train_rmse': math.sqrt(mean_squared_error(y, train_pr)),
                       'test_rmse': math.sqrt(mean_squared_error(y_test, test_pr))})
res_rf = pd.DataFrame(rf_records, columns=['n_estimators', 'train_rmse', 'test_rmse'])

In [None]:
# Train vs. test RMSE of the random forest as a function of n_estimators.
# '-o' only: the colour comes from the keyword ('-ok' also requested black,
# which matplotlib reports as a redundant definition).
plt.figure(figsize=(8,6))
plt.plot(res_rf.n_estimators.values, res_rf.train_rmse.values, '-o', color='b')
plt.plot(res_rf.n_estimators.values, res_rf.test_rmse.values, '-o', color='r')
plt.ylim([0,5])
plt.xlabel('n_estimators')
plt.ylabel('RMSE')
plt.legend(['train', 'test'])
plt.title('Performance of the RF on train and test data')
plt.show()

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting with 50 depth-2 trees and a small learning rate:
# fit on the training points, then report train/test RMSE.
gbr = GradientBoostingRegressor(n_estimators=50, learning_rate=0.02, max_depth=2).fit(x, y)
train_pred = gbr.predict(x)
test_pred = gbr.predict(x_test)
rmse_train = math.sqrt(mean_squared_error(y, train_pred))
rmse_test = math.sqrt(mean_squared_error(y_test, test_pred))
print('rmse on train', rmse_train)
print('rmse on test', rmse_test)

In [None]:
# Gradient-boosted ensembles with 5, 10, 20 and 50 depth-2 trees, each
# evaluated on the dense grid x_linsp for the comparison plot.
gbr1 = GradientBoostingRegressor(max_depth=2, n_estimators=5)
gbr1.fit(x, y)
y_gbr_1 = gbr1.predict(x_linsp)

gbr2 = GradientBoostingRegressor(max_depth=2, n_estimators=10)
gbr2.fit(x, y)
y_gbr_2 = gbr2.predict(x_linsp)

gbr3 = GradientBoostingRegressor(max_depth=2, n_estimators=20)
gbr3.fit(x, y)
y_gbr_3 = gbr3.predict(x_linsp)

gbr4 = GradientBoostingRegressor(max_depth=2, n_estimators=50)
gbr4.fit(x, y)
y_gbr_4 = gbr4.predict(x_linsp)

In [None]:
# Gradient-boosting prediction curves over the training and test points.
# Labels are attached to each artist so the legend cannot be mismatched by
# matplotlib's internal artist ordering.
plt.figure(figsize=(8,6))
plt.scatter(X,Y,color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
x_ = np.linspace(0, 7, 100)
plt.plot(x_, y_gbr_1, linestyle='dashed', color='orange', label='5 estimators')
plt.plot(x_, y_gbr_2, linestyle='dashed', color='green', label='10 estimators')
plt.plot(x_, y_gbr_3, linestyle='dashed', color='grey', label='20 estimators')
plt.plot(x_, y_gbr_4, linestyle='dashed', color='red', label='50 estimators')
plt.legend()
plt.ylim([-5,8])
plt.show()

In [None]:
# Train/test RMSE of gradient boosting for an increasing number of estimators.
# Rows are collected first and the DataFrame is built once.
gb_records = []
for i in (3,5,8,15,20,40,60):
    model = GradientBoostingRegressor(n_estimators=i)
    model.fit(x, y)
    train_pr = model.predict(x)
    test_pr = model.predict(x_test)
    gb_records.append({'n_estimators': i,
                       'train_rmse': math.sqrt(mean_squared_error(y, train_pr)),
                       'test_rmse': math.sqrt(mean_squared_error(y_test, test_pr))})
res_gb = pd.DataFrame(gb_records, columns=['n_estimators', 'train_rmse', 'test_rmse'])

In [None]:
# Train vs. test RMSE of gradient boosting as a function of n_estimators.
# '-o' only: the colour comes from the keyword ('-ok' also requested black,
# which matplotlib reports as a redundant definition).
plt.figure(figsize=(8,6))
plt.plot(res_gb.n_estimators.values, res_gb.train_rmse.values, '-o', color='b')
plt.plot(res_gb.n_estimators.values, res_gb.test_rmse.values, '-o', color='r')
plt.ylim([0,5])
plt.xlabel('n_estimators')
plt.ylabel('RMSE')
plt.legend(['train', 'test'])
plt.title('Performance of the GB on train and test data')
plt.show()