# A simple linear regression

Our multiple linear regression (MLR) with the three most highly correlated variables performed reasonably well, so what if we scale back and use a simple linear regression (SLR) with only the single most highly correlated variable (`Overall Qual`)?

In [1]:
import numpy as np
import os
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Read the cleaned training data from disk into a DataFrame.
filepath = 'datasets/train_clean.csv'
df = pd.read_csv(filepath_or_buffer=filepath)

In [3]:
# Single predictor for the simple linear regression.
features = ['Overall Qual']

# Selecting with a list keeps X two-dimensional (a DataFrame), as scikit-learn
# expects; y is the target column as a Series.
X, y = df[features], df['SalePrice']

# Hold out a test split; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [4]:
# Fit a linear regression on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score the fitted model on the training split and on the held-out split,
# then average the score over 5 cross-validation folds of the training data.
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
cv_score = cross_val_score(lr, X_train, y_train, cv=5).mean()

# Report all three scores, one per line.
for label, value in (
    ('train:', train_score),
    ('test:', test_score),
    ('cross val score:', cv_score),
):
    print(label, value)

train: 0.6538583895447645
test: 0.6191771144580789
cross val score: 0.6501445582644175


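Our R-squared (about 0.65 on the training data and in cross-validation, and about 0.62 on the held-out test split) is not quite as good as for the MLR, but this model is still decent.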

# Bring in `test_clean.csv`

In [5]:
# Read the cleaned test set into a DataFrame.
test = pd.read_csv('datasets/test_clean.csv')

# Making predictions

In [6]:
# Make submissions directory if it doesn't already exist.
# exist_ok=True ignores only the "directory already exists" case; any other
# failure (e.g. a permissions error, or a regular file named 'submissions')
# is raised here instead of being silently swallowed and resurfacing later
# as a confusing error when the CSV is written.
os.makedirs('submissions', exist_ok=True)

In [7]:
# Predict on the test rows using the same feature column(s) the model was fit on.
test_preds = lr.predict(test[features])

# Build the two-column submission frame: the test set's Id column followed by
# the predicted SalePrice.
test_preds_df = test[['Id']].assign(SalePrice=test_preds)

# Write the submission without the DataFrame index column.
test_preds_df.to_csv('submissions/SLR.csv', index=False)