In [None]:
import pandas as pd
import numpy as np

In [None]:
# Raw dataset, addressed by a path relative to the working directory.
file = "kc_house_data.csv"
data = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Everything except the target column.
feature = data.drop('price', axis=1)

In [None]:
# Feature columns that the cells below summarize and correlate against price.
columns = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

## Question 1

In [None]:
# Descriptive statistics, one printed block per feature column.
# Each entry pairs the label shown in the output with the NumPy reducer
# that computes the value.
summary_stats = (
    ('maximum', np.amax),
    ('minimum', np.amin),
    ('average', np.average),
    ('variance', np.var),
)

for name in columns:
    series = feature[name]
    print(f'Column {name} summary:')
    for label, reducer in summary_stats:
        print(f'- {label} {reducer(series)}')
    print('\n')  # print() appends its own newline, so this leaves two blank lines

In [None]:
# Pearson correlation of each numeric column with price.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer
# drops them silently inside DataFrame.corr() and raises instead.
# NOTE(review): presumably `data` carries a string `date` column (the test
# split has a `date` column that is dropped before modelling) -- confirm.
corr = data.select_dtypes(include='number').corr()['price']
for c in columns:
    print(f'Correlation price vs. {c}: {corr[c]}')

In [None]:
from sklearn.linear_model import LinearRegression

# Unfitted linear-regression estimator; it is fitted (and later refitted) in the cells below.
lreg = LinearRegression()

In [None]:
# Show the shape of the target column and of the feature frame.
for frame in (data['price'], feature):
    print(frame.shape)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
feature.info()

In [None]:
# Load the training split and fit the regression on every column except the
# row-number column, the target, and zipcode.
train = pd.read_csv('train.csv')
train_features = train.drop(columns=['Unnamed: 0', 'price', 'zipcode'])
lreg.fit(train_features, train['price'])

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted model on the same rows it was trained on (in-sample metrics).
train_inputs = train.drop(columns=['Unnamed: 0', 'price', 'zipcode'])
actual_price = train['price']
price_predicted = lreg.predict(train_inputs)

mse = mean_squared_error(actual_price, price_predicted)
rse = np.sqrt(mse)  # square root of the mean squared error
r2 = r2_score(actual_price, price_predicted)

for label, value in (('MSE', mse), ('RSE', rse), ('R2', r2)):
    print(f'{label} {value}')

In [None]:
# Learned weights of the fitted model and how many there are.
coefficients = lreg.coef_
print(f'Coefficients for linear regression\n {coefficients}')
print(f'Coeff length {len(coefficients)}')

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardizer reused by the cells below; centering (with_mean) and
# variance scaling (with_std) are both requested explicitly.
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Fit the scaler on every column of `train` (nothing is dropped here); the
# transformed array is only displayed as the cell output, not stored.
scaler.fit_transform(train)

In [None]:
# Per-column means recorded by the most recent scaler fit.
print(scaler.mean_)

In [None]:
# Per-column variances recorded by the most recent scaler fit.
print(scaler.var_)

In [None]:
# Standardize the training features and fit the regression on the
# standardized matrix. The model is later asked to predict on `scaled`, so it
# has to be trained in that same feature space; fitting on the raw `feature`
# frame and predicting on `scaled` produces meaningless predictions.
feature = train.drop(columns=['Unnamed: 0', 'price', 'zipcode'])
scaled = scaler.fit_transform(feature)
lreg.fit(scaled, train['price'])

In [None]:
# Training-set predictions computed from the standardized feature matrix.
predicted = lreg.predict(scaled)

In [None]:
# In-sample metrics for the predictions made from the standardized features.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# but R^2 normalizes by the variance of its first argument, so passing the
# predictions first reports a different (wrong) score.
mse = mean_squared_error(train['price'], predicted)
rse = np.sqrt(mse)  # square root of the mean squared error
r2 = r2_score(train['price'], predicted)

print(f'Scaled MSE {mse}')
print(f'Scaled rse {rse}')
print(f'Scaled R2 {r2}')

In [None]:
# Interactive documentation lookup for LinearRegression.fit.
# NOTE(review): looks like leftover exploration -- consider removing from the final notebook.
help(lreg.fit)

In [None]:
# Held-out split plus two fresh estimators: one for raw features and one for
# standardized features. Note that `scaled` is rebound here from the
# standardized array of the earlier cell to an estimator.
test = pd.read_csv('test.csv')
nonScaled, scaled = LinearRegression(), LinearRegression()

# Training target and design matrix shared by both estimators.
targettedY = train['price']
dropped = train.drop(columns=['Unnamed: 0', 'zipcode', 'price'])

In [None]:
from sklearn.pipeline import make_pipeline

# Baseline: ordinary least squares on the raw training features.
nonScaled.fit(dropped, targettedY)

# Standardized variant. The scaler and the regressor are bundled into one
# pipeline so that predict() applies the training-set mean/variance to any
# frame it is given. Fitting a bare LinearRegression on pre-scaled training
# data and then scoring it on raw test features (as printMetrics does) mixes
# two feature spaces and yields meaningless scores.
# The existing `scaler` object is reused, so it is still refit on `dropped`.
scaled = make_pipeline(scaler, LinearRegression())
scaled.fit(dropped, targettedY)

In [None]:
def printMetrics(linreg):
    """Print test-set MSE, its square root, and R^2 for a fitted regressor.

    Args:
        linreg: Fitted estimator exposing ``predict``. It is called on the
            notebook-level ``test`` frame with the 'Unnamed: 0', 'zipcode',
            'price', 'date' and 'id' columns removed.

    Returns:
        None. The three scores are written to stdout.
    """
    testDrop = test.drop(columns=['Unnamed: 0', 'zipcode', 'price', 'date', 'id'])
    targettedYTest = test['price']
    testYPredict = linreg.predict(testDrop)

    # scikit-learn metrics take (y_true, y_pred). MSE is symmetric, but R^2
    # normalizes by the variance of the first argument, so the ground truth
    # must come first.
    mse = mean_squared_error(targettedYTest, testYPredict)
    r2 = r2_score(targettedYTest, testYPredict)

    print(f'MSE score {mse}')
    print(f'RSE score {np.sqrt(mse)}')
    print(f'R2 score {r2}')

In [None]:
# Test-set scores for the model trained on raw features.
printMetrics(nonScaled)

In [None]:
# Test-set scores for the model trained on standardized features.
# NOTE(review): printMetrics passes the raw test columns to predict(); this is
# only meaningful if `scaled` applies the standardization itself -- confirm.
printMetrics(scaled)

## Question 3: Linear Regression - Closed form

### 3.1 sqft_living and price

In [None]:
# Single predictor (living area) and target for the one-variable regression.
x_train, y_train = train['sqft_living'], train['price']

In [None]:
# Sample means of the predictor and the target. The vectorized Series.mean()
# replaces Python-level sum()/len(), which iterates element by element.
x_hat = x_train.mean()
y_hat = y_train.mean()

In [None]:
# Closed-form simple linear regression:
#   slope     = sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x)^2)
#   intercept = mean_y - slope * mean_x
# Computed with vectorized Series arithmetic instead of Python list
# comprehensions; both Series come from `train`, so they share one index and
# align row for row.
x_dev = x_train - x_hat
y_dev = y_train - y_hat
theta_1 = (x_dev * y_dev).sum() / (x_dev ** 2).sum()
theta_0 = y_hat - theta_1 * x_hat

In [None]:
# Display the fitted slope and intercept as the cell output.
theta_1, theta_0

In [None]:
# Same predictor/target pair, taken from the held-out split.
x_test, y_test = test['sqft_living'], test['price']

In [None]:
# Apply the fitted line (intercept + slope * x) to the test predictor.
y_pred = theta_0 + theta_1 * x_test

In [None]:
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric in its
# arguments, so the ground truth must be passed first.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [None]:
# Report the test-set scores of the one-variable model.
print(f'MSE {mse}')
print(f'RMSE {np.sqrt(mse)}')
# The original f-string interpolated an undefined name (`r2Series(...)`),
# which raises NameError; the R^2 value computed above is what belongs here.
print(f'R2 {r2}')

### 3.2 Multiple linear regression

In [None]:
# Design matrix for the multi-feature fit: the feature columns plus a constant
# `bias` column of ones, so the intercept is learned as an ordinary weight.
x_train = (
    train
    .drop(columns=['Unnamed: 0', 'price', 'zipcode'])
    .assign(bias=1)
)
# Target as an (n, 1) column vector.
y_train = np.array(train['price']).reshape(-1, 1)

In [None]:
# Show the shapes of the design matrix and the target before solving.
print(x_train.shape, y_train.shape)

In [None]:
# Normal equation: theta = pinv(X^T X) X^T y.
# An earlier draft of this cell used np.linalg.inv with np.matmul; the
# pseudo-inverse is used instead -- presumably so a singular or
# ill-conditioned X^T X does not break the solve (TODO confirm).
transpose = x_train.transpose()
gram = transpose.dot(x_train)      # X^T X
gram_pinv = np.linalg.pinv(gram)   # pseudo-inverse of X^T X
theta = gram_pinv.dot(transpose).dot(y_train)

In [None]:
# Shape of the weight vector and of its transpose.
for weights in (theta, theta.T):
    print(weights.shape)

In [None]:
# Test design matrix: drop the non-feature columns and append the same
# constant `bias` column that was added to the training matrix.
x_test = (
    test
    .drop(columns=['Unnamed: 0', 'zipcode', 'price', 'date', 'id'])
    .assign(bias=1)
)
y_test = test['price']

In [None]:
# Predict with the closed-form weights and score against the held-out prices.
y_predict = x_test.dot(theta)
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric in its
# arguments, so the ground truth must be passed first.
mse = mean_squared_error(y_test, y_predict)
rse = np.sqrt(mse)  # square root of the mean squared error
r2 = r2_score(y_test, y_predict)

In [None]:
# Report the test-set scores of the closed-form multi-feature model.
report = (
    ('Mean square error', mse),
    ('Root mean square error', rse),
    ('R2 error', r2),
)
for label, value in report:
    print(f'{label} {value}')

## 4 Gradient descent