In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sqlalchemy import create_engine
from scipy import stats
import statsmodels.api as sm
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
import time
from sklearn.neural_network import MLPClassifier


import warnings
warnings.filterwarnings('ignore')

In [3]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
df = pd.read_sql_query('select * from houseprices',con=engine)

engine.dispose()


In [4]:
clean_df = df.copy()
clean_df.alley.unique()
clean_df.alley.fillna('None', inplace=True)
clean_df.lotfrontage.fillna(0, inplace=True)
clean_df.masvnrtype.fillna('None', inplace=True)
clean_df.masvnrarea.fillna(0.0, inplace=True)
clean_df.bsmtqual.fillna('NA', inplace=True)
clean_df.bsmtcond.fillna('NA', inplace=True)
clean_df.bsmtexposure.fillna('NA', inplace=True)
clean_df.bsmtfintype1.fillna('NA', inplace=True)
clean_df.bsmtfintype2.fillna('NA', inplace=True)
clean_df.bsmtexposure.fillna('NA', inplace=True)
clean_df.electrical.fillna('SBrkr', inplace=True)
clean_df.fireplacequ.fillna('NA', inplace=True)
clean_df.garagetype.fillna('NA', inplace=True)
clean_df.garageyrblt.fillna('NA', inplace=True)
clean_df.garagefinish.fillna('NA', inplace=True)
clean_df.garagequal.fillna('NA', inplace=True)
clean_df.garagecond.fillna('NA', inplace=True)
clean_df.poolqc.fillna('NA', inplace=True)
clean_df.fence.fillna('NA', inplace=True)
clean_df.miscfeature.fillna('NA', inplace=True)

In [5]:
feature_df = clean_df[['lotarea', 'overallqual']].copy()
bldgtype_list = []
for i in df.bldgtype:
    if i == '2fmCon':
        bldgtype_list.append('Duplex')
    else:
        bldgtype_list.append(i)
clean_df['bldgtype'] = bldgtype_list
x = pd.get_dummies(clean_df.bldgtype, drop_first=True).copy()
for i in range(len(clean_df.bldgtype.unique()) - 1):
    feature_df['bldgtype' + clean_df.bldgtype.unique()[i + 1]] = x.iloc[:,i]

In [6]:
keywords = ['garagequal', 'kitchenqual', 'bsmtcond', 'bsmtqual', 'exterqual' ]
for word in keywords:
    nums = []
    for row in clean_df[word]:
        if row == 'Fa':
            nums.append(1)
        elif row == 'TA':
            nums.append(2)
        elif row == 'Gd':
            nums.append(3)
        elif row == 'Ex':
            nums.append(4)
        else:
            nums.append(0)
    feature_df[word] = nums

In [7]:
feature_df[['garagecars', 'firstflrsf', 'fullbath', 'totrmsabvgrd', 'yearbuilt', 'yearremodadd', 'fireplaces', 'bsmtfinsf1', 'wooddecksf', 'secondflrsf']] = clean_df[['garagecars', 'firstflrsf', 'fullbath', 'totrmsabvgrd', 'yearbuilt', 'yearremodadd', 'fireplaces', 'bsmtfinsf1', 'wooddecksf', 'secondflrsf']]
feature_df['isWDsale'] = x.iloc[:,-1]
feature_df['saleprice'] = clean_df.saleprice

In [15]:
feature_df.head()

Unnamed: 0,lotarea,overallqual,bldgtypeDuplex,bldgtypeTwnhsE,bldgtypeTwnhs,garagequal,kitchenqual,bsmtcond,bsmtqual,exterqual,...,fullbath,totrmsabvgrd,yearbuilt,yearremodadd,fireplaces,bsmtfinsf1,wooddecksf,secondflrsf,isWDsale,saleprice
0,8450,7,0,0,0,2,3,2,3,3,...,2,8,2003,2003,0,706,0,854,0,208500
1,9600,6,0,0,0,2,2,2,3,2,...,2,6,1976,1976,1,978,298,0,0,181500
2,11250,7,0,0,0,2,3,2,3,3,...,2,6,2001,2002,1,486,0,866,0,223500
3,9550,7,0,0,0,2,3,3,2,2,...,1,7,1915,1970,1,216,0,756,0,140000
4,14260,8,0,0,0,2,3,2,3,3,...,2,9,2000,2000,1,655,192,1053,0,250000


In [18]:
med = np.median(feature_df.saleprice)
binary_features = []

for i in feature_df.saleprice:
    if i >= med:
        binary_features.append(1)
    else:
        binary_features.append(0)
        
feature_df['saleprice'] = binary_features

In [24]:
X = feature_df.drop('saleprice', 1)
Y = feature_df.saleprice

start_time = time.time()
rfr = ensemble.RandomForestRegressor(n_estimators=40)
forest_results = cross_val_score(rfr, X, Y, cv=6)
end_time = time.time()
print(round(end_time - start_time, 2), 'Seconds')
print(forest_results)
print(forest_results.sum()/6)

1.15 Seconds
[0.75269993 0.80022428 0.75694831 0.71761128 0.74928686 0.70747599]
0.7473744412598462


In [74]:
start_time = time.time()
mlp = MLPClassifier(hidden_layer_sizes=(200,10), alpha=.001, random_state=1)
results = cross_val_score(mlp, X, Y, cv=6)
end_time = time.time()
print(round(end_time - start_time, 2), 'Seconds')
print(results)
print('Average: ', results.sum()/6)

0.89 Seconds
[0.66803279 0.7295082  0.79835391 0.73251029 0.77777778 0.75720165]
Average:  0.7438974341676223


In [38]:
results = []
for i in range(4):
    for j in range(4):
        mlp = MLPClassifier(hidden_layer_sizes=(i*10+1,j*10+1))
        results.append(cross_val_score(mlp, X, Y, cv=6).sum()/6)

In [75]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=100)

parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [76]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-2, cv=3)
clf.fit(X, Y)

GridSearchCV(cv=3, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)], 'activation': ['tanh', 'relu'], 'solver': ['sgd', 'adam'], 'alpha': [0.0001, 0.05], 'learning_rate': ['constant', 'adaptive']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [77]:
clf.best_params_

{'activation': 'relu',
 'alpha': 0.0001,
 'hidden_layer_sizes': (50, 50, 50),
 'learning_rate': 'constant',
 'solver': 'adam'}

In [86]:
start_time = time.time()
mlp = MLPClassifier(hidden_layer_sizes=(1000,200,50), alpha=.0001)
results = cross_val_score(mlp, X, Y, cv=20)
end_time = time.time()
print(round(end_time - start_time, 2), 'Seconds')
print(results)
print('Average: ', results.sum()/20)

26.04 Seconds
[0.68918919 0.7972973  0.7027027  0.81081081 0.67567568 0.91891892
 0.78378378 0.86486486 0.73972603 0.78082192 0.57534247 0.76712329
 0.73611111 0.76388889 0.73611111 0.72222222 0.625      0.54166667
 0.61111111 0.76388889]
Average:  0.7303128470936691


In [None]:
# Tried the gridsearch results and then played with a bunch of hidden layer sizes but
# couldn't consistentl