# Imports

In [1]:
import numpy as np
import os
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

# MLR with a log transformation of our target

Since our MLR with the variables `Overall Qual`, `Gr Liv Area`, and `Garage Area` was pretty good, let's try using it in conjunction with a log transformation of `SalePrice`.

In [2]:
# Bring in clean dataframe
filepath = 'datasets/train_clean.csv'
df = pd.read_csv(filepath)

In [3]:
features = ['Overall Qual', 'Gr Liv Area', 'Garage Area']

X = df[features]
y = df['SalePrice']
y_log = np.log(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, random_state=42)

In [4]:
# Instantiate and fit model
log_lr = LinearRegression()
log_lr.fit(X_train, y_train);

# Train score
print('train:', log_lr.score(X_train, y_train))

# Test score
print('test:', log_lr.score(X_test, y_test))

# Cross val score
print('cross val score:', cross_val_score(log_lr, X_train, y_train, cv=5).mean())

train: 0.7989218714383082
test: 0.7858410438659181
cross val score: 0.7965281691082039


Let's compare these scores to our MLR scores without the log transformation...

In [5]:
no_transform = {
    'train': 0.7819565824464964,
    'test': 0.7766558862439193,
    'cross val score': 0.7787122981872973,
}

transform = {
    'train': 0.75484952631023,
    'test': 0.8270003707237565,
    'cross val score': 0.7491498680671022,
}

for each in transform.keys():
    print(f'{each} difference: {transform[each] - no_transform[each]}')

train difference: -0.027107056136266405
test difference: 0.05034448447983719
cross val score difference: -0.029562430120195105


So we can see we've made improvements in the test score, but not in the training or cross-validation scores.

# Bring in `test_clean.csv`

In [6]:
test = pd.read_csv('datasets/test_clean.csv')

# Making predictions

In [7]:
# Make submissions directory if it doesn't already exist
try:
    os.mkdir('submissions')
except:
    pass

In [8]:
test_preds = np.exp(log_lr.predict(test[features]))

test_preds_df = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': test_preds
})

test_preds_df.to_csv('submissions/log_transform.csv', index=False)