## Simple Linear Regression

In [89]:
# import libraries
import pandas as pd
import numpy as np
filename = '../data/Boligdata.csv'

In [90]:
# read the csv file into DataFrame
df = pd.read_csv(filename, sep=';', decimal=',', header=0, names=None)
df.head()

Unnamed: 0,Antall Rom,Kvadratmeter,Sted,Pris i MNOK
0,1,30,Blystadlia,1
1,2,85,Fjellhamar,3
2,4,125,Kurland,3
3,3,90,Skårer,2
4,5,150,Rasta,5


In [91]:
df.shape

(8, 4)

In [92]:
df.describe()

Unnamed: 0,Antall Rom,Kvadratmeter,Pris i MNOK
count,8.0,8.0,8.0
mean,3.25,96.125,3.5
std,1.832251,41.817418,1.511858
min,1.0,30.0,1.0
25%,1.75,75.0,2.75
50%,3.5,104.5,3.5
75%,4.25,125.0,5.0
max,6.0,150.0,5.0


In [93]:
#create table for input features and ground truth
#we dont use sted since string, needs to be converted into numerical values
cdf = df[['Antall Rom', 'Kvadratmeter', 'Pris i MNOK']]
X =  cdf['Kvadratmeter']
y = cdf['Pris i MNOK']

X = np.array(X).reshape(-1, 1)
y = np.array(y).reshape(-1, 1)
print("shape of feature data", X.shape, "shape of ground truth data", y.shape)

shape of feature data (8, 1) shape of ground truth data (8, 1)


## Linear Regression

In [97]:
# Create the linear regression model.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

model = LinearRegression()# ordinary least Square Regression
model.fit(X_train, y_train)
print ('Coefficients: ', model.coef_)
print ('Intercept: ',model.intercept_)


Coefficients:  [[0.03168071]]
Intercept:  [-0.0307883]


In [98]:
y_pred_dummy = model.predict(X_train)
y_pred = model.predict(X_test)

In [99]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

# The mean absolute error
print('Mean absolute error: %.2f'
      % mean_absolute_error(y_test, y_pred))


# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination(r2): %.2f'
      % r2_score(y_test, y_pred))

Mean squared error: 6.54
Mean absolute error: 1.94
Coefficient of determination(r2): 0.00


## MLR

In [100]:
cdf.columns

Index(['Antall Rom', 'Kvadratmeter', 'Pris i MNOK'], dtype='object')

In [101]:
cdf_MLR = cdf[['Antall Rom', 'Kvadratmeter', 'Pris i MNOK']]
cdf_MLR.head()

Unnamed: 0,Antall Rom,Kvadratmeter,Pris i MNOK
0,1,30,1
1,2,85,3
2,4,125,3
3,3,90,2
4,5,150,5


In [102]:
X_mlr = cdf_MLR.drop("Pris i MNOK", axis = 1)
y_mlr = cdf_MLR['Pris i MNOK']

In [103]:
X_mlr.shape

(8, 2)

In [116]:
X_mlr_train, X_mlr_test, y_mlr_train, y_mlr_test = train_test_split(X_mlr, y_mlr, test_size=0.2, random_state=4)
print ('Train set:', X_mlr_train.shape,  y_mlr_train.shape)
print ('Test set:', X_mlr_test.shape,  y_mlr_test.shape)

Train set: (6, 2) (6,)
Test set: (2, 2) (2,)


In [117]:
ML_regr = LinearRegression()
ML_regr.fit(X_mlr_train, y_mlr_train)
# The coefficients
print ('Coefficients: ', ML_regr.coef_)
print ('Intercept: ', ML_regr.intercept_)

Coefficients:  [0.546574   0.00967684]
Intercept:  0.2523354042819461


In [118]:
y_mlr_pred = ML_regr.predict(X_mlr_test)

In [119]:
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_mlr_test, y_mlr_pred))

# The mean absolute error
print('Mean absolute error: %.2f'
      % mean_absolute_error(y_mlr_test, y_mlr_pred))


# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination(r2): %.2f'
      % r2_score(y_mlr_test, y_mlr_pred))

Mean squared error: 7.25
Mean absolute error: 2.16
Coefficient of determination(r2): 0.00


## Polynomial

In [85]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_mlr_train)
X_test_poly = poly.fit_transform(X_mlr_test)
X_train_poly

array([[1.0000e+00, 1.0000e+00, 3.0000e+01, 1.0000e+00, 3.0000e+01,
        9.0000e+02],
       [1.0000e+00, 6.0000e+00, 1.1900e+02, 3.6000e+01, 7.1400e+02,
        1.4161e+04],
       [1.0000e+00, 4.0000e+00, 1.2500e+02, 1.6000e+01, 5.0000e+02,
        1.5625e+04],
       [1.0000e+00, 1.0000e+00, 4.5000e+01, 1.0000e+00, 4.5000e+01,
        2.0250e+03],
       [1.0000e+00, 2.0000e+00, 8.5000e+01, 4.0000e+00, 1.7000e+02,
        7.2250e+03],
       [1.0000e+00, 3.0000e+00, 9.0000e+01, 9.0000e+00, 2.7000e+02,
        8.1000e+03]])

In [86]:
poly_LR = LinearRegression()
poly_LR.fit(X_train_poly, y_train)
# The coefficients
print ('Coefficients: ', poly_LR.coef_)
print ('Intercept: ',poly_LR.intercept_)

Coefficients:  [[ 0.          5.55495414 -0.37855962  2.73981447 -0.26289285  0.00677492]]
Intercept:  [9.85137586]


In [87]:
y_pred_poly = poly_LR.predict(X_test_poly)

In [88]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred_poly))

# The mean absolute error
print('Mean absolute error: %.2f'
      % mean_absolute_error(y_test, y_pred_poly))


# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination(r2): %.2f'
      % r2_score(y_test, y_pred_poly))

Mean squared error: 0.58
Mean absolute error: 0.70
Coefficient of determination(r2): 0.74
