In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the training data into a DataFrame
df_housing = pd.read_csv(filepath_or_buffer='../Data/train.csv')

In [3]:
# Select the candidate predictors and the target
X = df_housing[['GrLivArea', 'GarageArea', 'FullBath', 'YearBuilt', 'GarageCars']]
# Add the 'const' intercept column that the VIF check below skips over
X = add_constant(X)
# Bind y to the target column itself rather than to the string 'SalePrice',
# so the name always refers to data and never to a column label.
y = df_housing['SalePrice']

In [4]:
# Put the target on a log scale so it is usable for linear regression;
# the transformed values are stored on the frame as 'LogSalePrice'.
df_housing['LogSalePrice'] = df_housing['SalePrice'].pipe(np.log)
y = df_housing.loc[:, 'LogSalePrice']

In [5]:
# Print the variance inflation factor of every predictor to spot
# multicollinearity; the intercept column is not reported.
vif_matrix = X.values
for position, name in enumerate(X.columns):
    if name != 'const':
        print(f'{name}: {variance_inflation_factor(vif_matrix, position)}')

GrLivArea: 1.9795279454407528
GarageArea: 4.684423217353367
FullBath: 2.1172692446625874
YearBuilt: 1.6680132350708228
GarageCars: 5.080189099198332


In [6]:
# GarageCars has VIF > 5, so remove it from the predictors
X = add_constant(X.drop(columns=['GarageCars']))

In [7]:
# Recompute the VIFs for the reduced predictor set (intercept not reported)
vif_matrix = X.values
for position, name in enumerate(X.columns):
    if name != 'const':
        print(f'{name}: {variance_inflation_factor(vif_matrix, position)}')

# Remove the intercept column again before the predictors go into the model
X = X.drop(columns='const')

GrLivArea: 1.9755437431250817
GarageArea: 1.603503478105227
FullBath: 2.0857130961157835
YearBuilt: 1.5919235384670054


In [8]:
# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [9]:
# Fit ordinary least squares on the training split
lin_reg = LinearRegression().fit(X_train, y_train)

# Print the fitted equation on a single line: intercept first, then one
# "+ coef*(feature)" term per predictor.
equation_terms = ''.join(
    f'+ {coef:.4f}*({feature})' for coef, feature in zip(lin_reg.coef_, X.columns)
)
print(f'log(SalePrice) = {lin_reg.intercept_:.4f} ' + equation_terms, end='')

log(SalePrice) = 1.4927 + 0.0004*(GrLivArea)+ 0.0004*(GarageArea)+ -0.0083*(FullBath)+ 0.0049*(YearBuilt)

In [10]:
# Predict on the hold-out split and report performance metrics.
# y_test holds log(SalePrice), so both metrics are on the log scale.
y_pred = lin_reg.predict(X_test)
holdout_mse = mean_squared_error(y_test, y_pred)
holdout_r2 = r2_score(y_test, y_pred)
print("MSE: %.2f" % holdout_mse)
print("R^2: %.2f" % holdout_r2)

MSE: 0.04
R^2: 0.76


In [11]:
# Score the competition file (test.csv) and build the submission frame
df_housing_test = pd.read_csv('../Data/test.csv')

# Use dedicated names here: rebinding X_test / y_pred would clobber the
# hold-out split and its predictions, so re-running the evaluation cell
# afterwards would compare y_test against predictions for different rows.
X_submission = df_housing_test[X.columns]

# Fill gaps with the training-split medians instead of 0: a zero is not a
# plausible value for a column such as YearBuilt and would distort the
# prediction for any row it is imputed into.
X_submission = X_submission.fillna(X_train.median())

# The model predicts log(SalePrice); exponentiate to get back to the
# SalePrice scale before writing the submission.
log_price_pred = lin_reg.predict(X_submission)
submission = pd.DataFrame({
    'Id': df_housing_test['Id'],
    'SalePrice': np.exp(log_price_pred),
})
submission

Unnamed: 0,Id,SalePrice
0,1461,140768.800728
1,1462,135970.820370
2,1463,197655.763034
3,1464,195714.031213
4,1465,170628.560136
...,...,...
1454,2915,114861.177418
1455,2916,130334.448280
1456,2917,148267.155733
1457,2918,122261.941651


In [12]:
# Write the predictions to submission.csv, leaving out the DataFrame index
submission.to_csv(path_or_buf='submission.csv', index=False)