In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

### Loading Data

In [62]:
raw_data = pd.read_csv('../datasets/real_estate_price_size_year.csv')
raw_data

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009
...,...,...,...
95,252460.400,549.80,2009
96,310522.592,1037.44,2009
97,383635.568,1504.75,2006
98,225145.248,648.29,2015


In [63]:
x = raw_data[['size','year']]
y = raw_data['price']

In [64]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

In [65]:
reg = LinearRegression()
reg.fit(x_scaled,y)

In [66]:
print('R-squared: {}'.format(reg.score(x_scaled,y)))

def adj_r2(x,y):
    r2 = reg.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

print('Adjusted R-squared: {}'.format(adj_r2(x_scaled,y)))

from sklearn.feature_selection import f_regression

f_statistic, p_values = f_regression(x_scaled,y)

print('Intercept: {}'.format(reg.intercept_))

for i in range(len(reg.coef_)):
    print('{} Coefficient: {} | p-value: {}'.format(x.columns.values[i], reg.coef_[i], p_values[i].round(3)))



R-squared: 0.7764803683276793
Adjusted R-squared: 0.77187171612825
Intercept: 292289.4701599997
size Coefficient: 67501.57614152209 | p-value: 0.0
year Coefficient: 13724.397082308173 | p-value: 0.357


'year' is not statistically significant, so we shall remove it from the model.

In [67]:
x = raw_data[['size']]
y = raw_data['price']

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

reg = LinearRegression()
reg.fit(x_scaled,y)

### Summary

In [68]:
print('R-squared: {}'.format(reg.score(x_scaled,y)))

def adj_r2(x,y):
    r2 = reg.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

print('Adjusted R-squared: {}'.format(adj_r2(x_scaled,y)))

from sklearn.feature_selection import f_regression

f_statistic, p_values = f_regression(x_scaled,y)

print('Intercept: {}'.format(reg.intercept_))

for i in range(len(reg.coef_)):
    print('{} Coefficient: {} | p-value: {}'.format(x.columns.values[i], reg.coef_[i], p_values[i].round(3)))



R-squared: 0.7447391865847586
Adjusted R-squared: 0.742134484407052
Intercept: 292289.4701599999
size Coefficient: 66161.00300583152 | p-value: 0.0


### Summary Table

In [74]:
reg_summary = pd.DataFrame([['Bias'],['size']], columns=['Features']) # Intercept == Bias
reg_summary['Weights'] = reg.intercept_, reg.coef_[0]
reg_summary

Unnamed: 0,Features,Weights
0,Bias,292289.47016
1,size,66161.003006


### Predictions

In [70]:
prediction_data = [750]
predictions = reg.predict([prediction_data])
predictions[0]

49913041.72453364