In [1]:
# ENVIRONMENT SETUP

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import datetime
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as lm
from sklearn import preprocessing
from sklearn import metrics

import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None



In [2]:
# Load the raw train/test CSVs; the first row of each file is the header.
original_train_data_d = pd.read_csv('data/train.csv', delimiter=',', header=0)
original_test_data_d = pd.read_csv('data/test.csv', delimiter=',', header=0)

# Midpoint row of the training file. Floor division keeps this an int under
# both Python 2 and Python 3; a float here cannot be used as a slice bound.
testdev_split_d = len(original_train_data_d) // 2

# Second half of the rows -> training set; SalePrice is split off as the label.
train_data_d = original_train_data_d.iloc[testdev_split_d:].copy()
train_labels_d = train_data_d["SalePrice"].copy()
train_data_d = train_data_d.drop(["SalePrice"], axis=1)

# First half of the rows -> dev set, with the same label/feature separation.
dev_data_d = original_train_data_d.iloc[:testdev_split_d].copy()
dev_labels_d = dev_data_d["SalePrice"].copy()
dev_data_d = dev_data_d.drop(["SalePrice"], axis=1)

# Independent copy of the test file so later edits do not touch the original.
test_data_d = original_test_data_d.copy()



In [3]:
# Columns inspected in this notebook.
danish_columns = [
    'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
]

# For every column that is not float64, print its missing-value count, dtype
# and value frequencies. Each print takes a single pre-formatted argument so
# the output text is the same under Python 2 and Python 3.
for column_name in danish_columns:
    column = train_data_d[column_name]
    if column.dtype != 'float64':
        print('----  %s -----' % column_name)
        print('Missing values:  %s' % column.isnull().sum())
        print('is of type:  %s' % column.dtype)
        print(column.value_counts())



----  MSSubClass -----
Missing values:  0
is of type:  int64
20     270
60     145
50      74
120     41
80      34
160     33
90      33
30      32
70      29
85      12
190     11
75       8
180      4
45       2
40       2
Name: MSSubClass, dtype: int64
----  MSZoning -----
Missing values:  0
is of type:  object
RL         589
RM          97
FV          30
RH          10
C (all)      4
Name: MSZoning, dtype: int64
----  LotArea -----
Missing values:  0
is of type:  int64
9600     12
7200     11
6000     11
8400      9
10800     8
9000      7
7500      7
9750      5
9100      5
1680      4
8125      4
4435      4
3182      4
5400      3
10140     3
6240      3
9900      3
10000     3
11700     3
8800      3
7875      3
11500     3
8640      3
13125     3
6600      3
9790      2
9120      2
8100      2
7024      2
12665     2
         ..
2117      1
4712      1
2665      1
11883     1
18030     1
6768      1
8820      1
12919     1
9849      1
11900     1
12925     1
9300      1
10880

In [4]:
# Treat a missing masonry veneer area as 0. The filled column is assigned back
# explicitly: calling fillna(inplace=True) on a selected column is not
# guaranteed to update the parent frame.
train_data_d['MasVnrArea'] = train_data_d['MasVnrArea'].fillna(0)
train_data_d['MasVnrArea2'] = train_data_d['MasVnrArea'].astype(int)
#print train_data_d['MasVnrArea2'].value_counts()
#plt.hist(train_data_d['MasVnrArea2'], bins=range(10,1100, 10))# 0, 50,100,150,200,250,300,350,400 and above

# Boolean bin flags for the integer area. Bins are half-open [lower, upper) so
# a value sitting exactly on a boundary (50, 100, ..., 400) lands in exactly
# one bin instead of none.
masonry_area = train_data_d['MasVnrArea2']
train_data_d['MasVnrArea2_0'] = masonry_area == 0
train_data_d['MasVnrArea2_50'] = (masonry_area > 0) & (masonry_area < 50)
for upper in range(100, 450, 50):  # 100, 150, ..., 400
    lower = upper - 50
    train_data_d['MasVnrArea2_%d' % upper] = (masonry_area >= lower) & (masonry_area < upper)
train_data_d['MasVnrArea2_other'] = masonry_area >= 400

In [5]:
def _add_interval_flags(frame, source, intervals):
    """Add one boolean column per half-open interval ``lower <= value < upper``.

    ``intervals`` is a sequence of ``(column_name, lower, upper)`` tuples; a
    bound of ``None`` leaves that side open. Columns are written into ``frame``
    in the order given. Missing values fall in no interval. Half-open bounds
    guarantee that a value exactly on a boundary belongs to exactly one flag.
    """
    values = frame[source]
    for column_name, lower, upper in intervals:
        flag = values.notnull()
        if lower is not None:
            flag = flag & (values >= lower)
        if upper is not None:
            flag = flag & (values < upper)
        frame[column_name] = flag


# MSSubClass: one flag per listed class code plus a pooled flag for the rare
# codes. Code 20 gets no flag of its own here.
# NOTE(review): presumably 20 is the intended reference category - confirm.
for subclass in (60, 50, 120, 80, 160, 90, 30, 70):
    train_data_d['MSSubClass_%d' % subclass] = train_data_d['MSSubClass'] == subclass
train_data_d['MSSubClass_other'] = train_data_d['MSSubClass'].isin([85, 190, 75, 180, 45, 40])

# LotFrontage: this column is not filled anywhere above, so "none" has to
# cover missing values as well as an explicit 0.
lot_frontage = train_data_d['LotFrontage']
train_data_d['LotFrontage_none'] = lot_frontage.isnull() | (lot_frontage == 0)
train_data_d['LotFrontage_50'] = (lot_frontage > 0) & (lot_frontage < 50)
_add_interval_flags(train_data_d, 'LotFrontage', [
    ('LotFrontage_60', 50, 60),
    ('LotFrontage_70', 60, 70),
    ('LotFrontage_80', 70, 80),
    ('LotFrontage_90', 80, 90),
    ('LotFrontage_other', 90, None),
])

# LotArea: 1000-wide bins between 5000 and 14000, open-ended on both sides.
_add_interval_flags(train_data_d, 'LotArea', [
    ('LotArea_5', None, 5000),
    ('LotArea_6', 5000, 6000),
    ('LotArea_7', 6000, 7000),
    ('LotArea_8', 7000, 8000),
    ('LotArea_9', 8000, 9000),
    ('LotArea_10', 9000, 10000),
    ('LotArea_11', 10000, 11000),
    ('LotArea_12', 11000, 12000),
    ('LotArea_13', 12000, 13000),
    ('LotArea_14', 13000, 14000),
    ('LotArea_other', 14000, None),
])

# YearBuilt: decade bins; years from 2005 onward get no flag.
_add_interval_flags(train_data_d, 'YearBuilt', [
    ('YearBuilt_30', None, 1930),
    ('YearBuilt_40', 1930, 1940),
    ('YearBuilt_50', 1940, 1950),
    ('YearBuilt_60', 1950, 1960),
    ('YearBuilt_70', 1960, 1970),
    ('YearBuilt_80', 1970, 1980),
    ('YearBuilt_90', 1980, 1990),
    ('YearBuilt_00', 1990, 2005),
])

# YearRemodAdd: decade bins starting at 1950; earlier years get no flag.
_add_interval_flags(train_data_d, 'YearRemodAdd', [
    ('YearRemodAdd_50', 1950, 1960),
    ('YearRemodAdd_60', 1960, 1970),
    ('YearRemodAdd_70', 1970, 1980),
    ('YearRemodAdd_80', 1980, 1990),
    ('YearRemodAdd_90', 1990, 2000),
    ('YearRemodAdd_00', 2000, None),
])

# MasVnrArea2 (integer masonry veneer area): 0, then 50-wide bins up to 400.
masonry_area = train_data_d['MasVnrArea2']
train_data_d['MasVnrArea2_0'] = masonry_area == 0
train_data_d['MasVnrArea2_50'] = (masonry_area > 0) & (masonry_area < 50)
_add_interval_flags(train_data_d, 'MasVnrArea2', [
    ('MasVnrArea2_100', 50, 100),
    ('MasVnrArea2_150', 100, 150),
    ('MasVnrArea2_200', 150, 200),
    ('MasVnrArea2_250', 200, 250),
    ('MasVnrArea2_300', 250, 300),
    ('MasVnrArea2_350', 300, 350),
    ('MasVnrArea2_400', 350, 400),
    ('MasVnrArea2_other', 400, None),
])

In [6]:
# Append one indicator column per distinct value of each categorical column,
# in the order listed. The indicator columns are named after the raw values
# with no source-column prefix.
# NOTE(review): unprefixed names from different source columns may collide
# (e.g. the OverallCond and OverallQual rating values) - confirm whether
# downstream readers of the exported CSV rely on the current names before
# adding a prefix.
for categorical_column in (
    'MasVnrType',
    'Exterior1st',
    'RoofStyle',
    'OverallCond',
    'OverallQual',
    'LotConfig',
    'BldgType',
    'LotShape',
    'Alley',
    'HouseStyle',
    'MSZoning',
    'Neighborhood',
):
    indicator_columns = pd.get_dummies(train_data_d[categorical_column])
    train_data_d = pd.concat([train_data_d, indicator_columns], axis=1)

In [None]:
# Write the engineered training frame (index included) to a CSV in the
# current working directory.
danish_output_path = 'danish_variables_4_2_17.csv'
train_data_d.to_csv(danish_output_path)