# Imports

In [1]:
import numpy as np
import os
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

# Making a first model

We saw from basic EDA that the three variables most strongly correlated with `SalePrice` are `Overall Qual`, `Gr Liv Area`, and `Garage Area`. As explanatory variables, they also have a roughly linear relationship with the target. Let's use these to make a simple first model.

In [2]:
# Load the cleaned training set from disk into a dataframe
filepath = 'datasets/train_clean.csv'
df = pd.read_csv(filepath_or_buffer=filepath)

In [3]:
# Predictors used by this first model, and the target column they explain
features = [
    'Overall Qual',
    'Gr Liv Area',
    'Garage Area',
]

X, y = df[features], df['SalePrice']

# Hold out part of the data for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [4]:
# Fit an ordinary least-squares model on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score the fitted model on the data it saw and on the held-out split,
# then average a 5-fold cross-validation score over the training split.
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
cv_scores = cross_val_score(lr, X_train, y_train, cv=5)
mean_cv_score = cv_scores.mean()

# Report the three scores together so they are easy to compare.
print('train:', train_score)
print('test:', test_score)
print('cross val score:', mean_cv_score)

train: 0.7819565824464964
test: 0.7766558862439193
cross val score: 0.7787122981872973


So this appears to be a pretty decent first model. Our training R-squared value of `0.782` means that around 78% of the variance in sale price in the training data is explained by the model (relative to a baseline that always predicts the mean). The test (`0.777`) and cross-validated (`0.779`) scores are very close to the training score, which indicates that the model is not overfitting and generalizes well to new data.

# Bring in `test_clean.csv`

In [5]:
# Load the cleaned test set that predictions will be generated for
test_filepath = 'datasets/test_clean.csv'
test = pd.read_csv(test_filepath)

# Making predictions

In [6]:
# Make the submissions directory if it doesn't already exist.
# `exist_ok=True` makes this cell safe to re-run, while genuine failures
# (e.g. no write permission, or a regular file named 'submissions') still
# raise here instead of being silently swallowed and surfacing later
# when the CSV is written.
os.makedirs('submissions', exist_ok=True)

In [7]:
# Predict sale prices for the test set using the same feature columns
# the model was fitted on.
test_preds = lr.predict(test[features])

# Pair each test-set Id with its predicted price.
test_preds_df = test[['Id']].assign(SalePrice=test_preds)

# Write the submission file without the dataframe index column.
test_preds_df.to_csv(
    'submissions/three_var_linmod.csv',
    index=False,
)