In [24]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [25]:
# Raise pandas' display limits so frames of up to 500 rows / 500 columns
# are printed without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [26]:
# Load the training and test sets.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Keep the test Ids aside: the 'Id' column is dropped from the features
# below, but it is needed again when the submission file is written.
Ids = test['Id'].values

# Remove training rows matching either of two conditions:
#   * OverallQual below 5 with SalePrice above 200000
#   * GrLivArea above 4000 with SalePrice below 300000
# Both conditions are evaluated row-wise, so a single combined mask removes
# exactly the rows that the two successive drops would remove.
outlierRows = (
    ((train['OverallQual'] < 5) & (train['SalePrice'] > 200000))
    | ((train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000))
)
train = train.loc[~outlierRows].reset_index(drop=True)

# Stack the train features (target removed) on top of the test features so
# both go through the same preprocessing. pd.concat is used because
# DataFrame.append no longer exists in pandas >= 2.0. Row order matters:
# the first train.shape[0] rows of `combined` are the training rows.
combined = pd.concat([train.drop(columns='SalePrice'), test]).drop(columns='Id')

In [27]:
# Percentage of missing values in each column.
missingPercent = combined.isnull().sum() * 100 / len(combined)

# Column mask: True where fewer than 15% of the rows are missing.
lessMissing = missingPercent.lt(15)

# Keep only the columns selected by the mask.
combined = combined.loc[:, lessMissing]

In [28]:
# Impute the remaining missing values.
#
# One fill value is computed per column and everything is applied in a single
# DataFrame.fillna call whose result is assigned back to `combined`. Calling
# fillna(inplace=True) on a column selection (combined[col].fillna(...)) is
# chained assignment: it is not guaranteed to write through to `combined`
# and does nothing when pandas Copy-on-Write is active.
#
# Columns that are no longer present in `combined` (e.g. removed by the
# missing-value filter) are skipped instead of raising KeyError.

# Columns filled with their most frequent value.
modeFilled = [
    'GarageCars', 'MSZoning', 'Utilities', 'BsmtHalfBath', 'SaleType',
    'Functional', 'Electrical', 'GarageCond', 'GarageQual', 'GarageFinish',
    'GarageType', 'BsmtFinType2', 'GarageYrBlt', 'MasVnrType', 'Exterior1st',
    'Exterior2nd', 'BsmtCond', 'BsmtExposure',
]

# Columns filled with their mean.
meanFilled = ['BsmtFinSF2', 'TotalBsmtSF', 'BsmtFinSF1', 'BsmtUnfSF', 'MasVnrArea']

# Columns filled with a fixed, hand-picked value.
constantFilled = {
    'KitchenQual': 'TA',
    'BsmtFullBath': 1.0,
    'BsmtFinType1': 'GLQ',
    'BsmtQual': 'Gd',
}

fillValues = {}
for column in modeFilled:
    if column in combined.columns:
        fillValues[column] = combined[column].mode()[0]
for column in meanFilled:
    if column in combined.columns:
        fillValues[column] = combined[column].mean()
if 'GarageArea' in combined.columns:
    # GarageArea uses the mean rounded down to a whole number.
    fillValues['GarageArea'] = np.floor(combined['GarageArea'].mean())
for column, fillValue in constantFilled.items():
    if column in combined.columns:
        fillValues[column] = fillValue

# All statistics above are computed before any value is filled; the columns
# are independent, so this matches filling them one at a time.
combined = combined.fillna(value=fillValues)

In [29]:
# Encode every column of `combined` as numbers:
#   * columns named in `nonCategorical` are cast to float64 and rescaled
#     with MinMaxScaler (default settings), one column at a time;
#   * every other column is cast to 'category' and replaced by the integer
#     codes produced by LabelEncoder.
# Both transformers are re-fitted for each column.
nonCategorical = [
    'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch',
]
minMaxScaler = preprocessing.MinMaxScaler()
le = preprocessing.LabelEncoder()

for column in combined.columns:
    if column not in nonCategorical:
        # Label-encode: cast to category first, then map to integer codes.
        combined[column] = combined[column].astype('category')
        combined[column] = le.fit_transform(combined[column])
        continue
    # Scale: the scaler expects 2-D input, hence the one-column frame.
    combined[column] = combined[column].astype('float64')
    combined[column] = minMaxScaler.fit_transform(combined[[column]])

In [30]:
# Split `combined` back into its train and test parts.
# The first train.shape[0] rows of `combined` are the training rows, in the
# same order as `train`, so the target is attached by position rather than
# through a merge on a synthetic key column.
nTrain = train.shape[0]

# Extract the target as a plain float array so that the assignment below is
# positional and cannot be affected by index alignment.
salePrice = train['SalePrice'].astype('float').to_numpy()

# Processed training features with a fresh 0..nTrain-1 index, plus the target
# as the last column.
train = (
    combined.iloc[:nTrain, :]
    .reset_index(drop=True)
    .assign(SalePrice=salePrice)
)

# Everything after the training rows is the processed test set.
test = combined.iloc[nTrain:, :]

In [34]:
# Hand-picked subset of feature columns.
# NOTE(review): the SVR cell that follows fits on every column and does not
# use this list; confirm whether it is consumed further down the notebook.
significant = [
    'LotArea',
    'OverallQual',
    'YearBuilt',
    'BsmtQual',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    '1stFlrSF',
    '2ndFlrSF',
    'KitchenQual',
    'GarageType',
    'ScreenPorch',
]

In [37]:
# SVR with a degree-2 polynomial kernel and C = 1000, fitted on every column
# of `train` except the target, then used to predict the test set.
svr = SVR(kernel='poly', degree=2, C=1e3)
trainFeatures = train.drop(columns='SalePrice')
svr.fit(trainFeatures, train['SalePrice'])
predictions = svr.predict(test)

In [38]:
# Pair each saved test Id with its predicted price and write the result to
# 'submission.csv' without the DataFrame index.
submission = pd.DataFrame({'Id': Ids}).assign(SalePrice=predictions)
submission.to_csv('submission.csv', index=False)

<br>

## K-NN from scratch