## 1. Import libraries and data

In [2]:
# import libraries
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [3]:
# read test and train data, print shapes
test = pd.read_csv('datasets/test.csv')
train = pd.read_csv('datasets/train.csv')
train.shape, test.shape

((2051, 81), (878, 80))

In [4]:
def check_equal_columns(train=train, test=test):
    test_only = [feat for feat in test.columns if feat not in train.columns]
    train_only = [feat for feat in train.columns if feat not in test.columns]
    print('test columns not in train: {}\ntrain columns not in test: {}'.format(test_only, train_only))
check_equal_columns()

test columns not in train: []
train columns not in test: ['SalePrice']


---
### Dummies for Outliers

In [5]:
#train = train.loc[~(train['Garage Yr Blt'] >2020)]
#train = train.loc[~(train['Total Bsmt SF'] >3000)]
#train = train.loc[~(train['1st Flr SF'] >3000)]
#train = train.loc[~(train['Gr Liv Area'] >4000)]
#train = train.loc[~(train['TotRms AbvGrd'] >12)]
#train = train.loc[~(train['Garage Cars'] >4)]

---
### Replace NaNs with zeros

In [6]:
# replace NaNs with zeros
train = train.fillna(0)
test = test.fillna(0)

In [7]:
check_equal_columns()

test columns not in train: []
train columns not in test: ['SalePrice']


### Big Zero Vals

In [8]:
# list columns with >50 zero values
zero_cols = [ col for col in train.columns if len(train[train[col] == 0]) > 50 ]
zce = []
for col in test.columns:
    if col not in zero_cols and (test[col].isna().sum() > 0 or train[col].isna().sum() > 0):
        zce.append(col)
zero_cols = zero_cols + zce

In [9]:

# Zero vals in 'Total Bsmt SF' will give us our 'no basement' col,
# so we can drop other variables containing 'Bsmt':
zero_cols = [ x for x in zero_cols if train[x].dtype == 'float' or train[x].dtype == 'int']

In [9]:
# define function to check for zeros values
def zero_checker(x):
    if x==0 or math.isnan(x):
        return 1
    else:
        return 0

In [10]:
for col in zero_cols:
    train[col+'_is_zero'] = train[col].apply(zero_checker)

In [11]:
for col in zero_cols:
    test[col+'_is_zero'] = test[col].apply(zero_checker)

In [12]:
check_equal_columns()

test columns not in train: []
train columns not in test: ['SalePrice']


Convert ints to floats

In [13]:
for col in test.columns:
    if train[col].dtype != test[col].dtype:
        test[col].astype('float')

### Dealing with nominal categoricals
*pd.get_dummies*

In [14]:
train = pd.get_dummies(train, drop_first=True)
test = pd.get_dummies(test, drop_first=True)

# wow
len(train.columns), len(test.columns) 


(287, 268)

### Dropping columns appearing in only one df

In [15]:
test_uniques = [feat for feat in test.columns if feat not in train.columns]
train_uniques = [feat for feat in train.columns if feat not in test.columns]

# don't remove 'SalePrice' from train:
train_uniques.remove('SalePrice')

In [16]:
test = test.drop(columns=test_uniques)
train = train.drop(columns=train_uniques)
len(train.columns), len(test.columns) 

(259, 258)

# Output

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
model = LinearRegression()

In [19]:
X = train[features]
y = train['SalePrice']
X_test = test[features]

NameError: name 'features' is not defined

In [None]:
model.fit(X, y)

In [None]:
y_pred = model.predict(X_test)

In [None]:
X.shape, y_pred.shape

In [None]:
output = pd.DataFrame( {'Id': test['Id']} )
output.head()

In [None]:
output['SalePrice'] = y_pred

In [None]:
output.to_csv('submissions/submission_8.csv', index=False)