In [89]:
# Loading in libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

In [90]:
# Read the training set from train_set.csv (path is relative to the notebook's
# working directory) into the `train` DataFrame used by every cell below.
# NOTE(review): the preview below lists 'Unnamed: 0', 'Unnamed: 0.1' and
# 'Unnamed: 0.2' columns — these look like index columns written out by earlier
# to_csv calls; confirm and consider dropping them upstream.
train = pd.read_csv('train_set.csv')

In [91]:
# Inspect train_set: display the first five rows to eyeball the columns and
# value formats before engineering features
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,...,patioSF,Prop_Addr,Full_Adr,gcode,lat,long,dist,zip,tract,income
0,1971,1971,535180070,1236,134450,20,RL,72.0,10011,Pave,...,0,210 21ST ST,"210 21ST ST, Ames, IA","210, 21st Street, Broadmoor, Ames, Story Count...",42.042894,-93.612641,3.326988,50010.0,19169000000.0,52375.0
1,1357,1357,908102040,2200,145900,90,RL,67.0,8777,Pave,...,70,234 SOUTH DAKOTA AVE 236,"234 SOUTH DAKOTA AVE 236, Ames, IA","234, South Dakota Avenue, Ames, Story County, ...",42.020564,-93.678868,2.760245,50014.0,19169000000.0,33311.0
2,357,357,906380170,1220,194000,20,RL,64.0,7406,Pave,...,159,5327 DURANT ST,"5327 DURANT ST, Ames, IA","5327, Durant Street, Sunset Ridge, Ames, Story...",42.024622,-93.691352,3.713174,50014.0,19169000000.0,60833.0
3,65,65,909250060,2447,169000,50,RL,50.0,9638,Pave,...,430,2320 KNAPP ST,"2320 KNAPP ST, Ames, IA","2320, Knapp Street, Union Drive, Ames, Story C...",42.019084,-93.646893,0.847709,50014.0,19169000000.0,20878.0
4,1094,1094,534276180,1074,124000,20,RL,74.0,7450,Pave,...,72,2027 FERNDALE AVE,"2027 FERNDALE AVE, Ames, IA","2027, Ferndale Avenue, Melrose Park, Ames, Sto...",42.042544,-93.621391,2.721892,50010.0,19169000000.0,61991.0


In [92]:
# Checking original shape (rows, columns); this is the baseline that the
# column-count checks after each feature-engineering step are compared against
train.shape

(1984, 42)

In [93]:
# Adding dummy variable columns for MSZoning, keeping the MSZoning column as well.
# Only the RH / RL / RM indicators are kept; every other zoning level is the
# implicit baseline (all three indicators are 0 for those rows).
zoning_cols = ['MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM']

# Encode just the MSZoning column instead of the whole frame. drop_first is not
# used because the wanted columns are selected explicitly, and dropping the
# first level would raise a KeyError if one of them happened to sort first.
zoning_dummies = pd.get_dummies(train['MSZoning'], prefix='MSZoning')[zoning_cols]

# Keep `dummies` with the same columns as before (PID + the three indicators)
# in case later cells refer to it.
dummies = pd.concat([train[['PID']], zoning_dummies], axis=1)

# Attach by row index rather than merging on PID: a key-based merge multiplies
# rows whenever a PID appears more than once, whereas the dummies are already
# aligned row-for-row with `train`. Dropping any existing indicator columns
# first makes the cell safe to re-run without creating _x/_y duplicates.
train = train.drop(columns=zoning_cols, errors='ignore').join(zoning_dummies)

In [94]:
# Sanity check after adding the MSZoning dummies: should see 3 more columns
# than the original shape and no change in the number of rows
train.shape

(1984, 45)

In [95]:
# Row-wise helper: does Condition1 or Condition2 show proximity to RR?
def near_rr(df):
    """Return 1 if the row's Condition1 or Condition2 is one of the RR codes, else 0.

    `df` is a single row indexed by column name (for example the Series that
    DataFrame.apply passes with axis=1). Condition1 is checked first, and
    Condition2 is only read when Condition1 is not an RR code.
    """
    rr_codes = ('RRAe', 'RRAn', 'RRNn', 'RRNe')
    for field in ('Condition1', 'Condition2'):
        if df[field] in rr_codes:
            return 1
    return 0

In [96]:
# Build the NearRR indicator column by evaluating near_rr on every row
train['NearRR'] = train.apply(near_rr, axis='columns')

In [97]:
# Sanity check after adding NearRR: should see 1 more column than the previous
# shape check and no change in the number of rows
train.shape

(1984, 46)

In [98]:
# Row-wise helper: does Condition1 or Condition2 show proximity to a Positive Feature?
def near_pos(df):
    """Return 1 if the row's Condition1 or Condition2 is 'PosA' or 'PosN', else 0.

    `df` is a single row indexed by column name (for example the Series that
    DataFrame.apply passes with axis=1). The fields are read lazily, so
    Condition2 is only looked up when Condition1 does not match.
    """
    positive_codes = ('PosA', 'PosN')
    condition_fields = ('Condition1', 'Condition2')
    return int(any(df[field] in positive_codes for field in condition_fields))

In [99]:
# Build the NearPos (near Positive Feature) indicator column by evaluating
# near_pos on every row
train['NearPos'] = train.apply(near_pos, axis='columns')

In [100]:
# Sanity check after adding NearPos: should see 1 more column than the previous
# shape check and no change in the number of rows
train.shape

(1984, 47)

In [101]:
# Row-wise helper: does Condition1 or Condition2 show the house is adjacent to an arterial road?
def near_art(df):
    """Return 1 if the row's Condition1 or Condition2 is 'Artery', else 0.

    `df` is a single row indexed by column name (for example the Series that
    DataFrame.apply passes with axis=1). Condition2 is only read when
    Condition1 is not 'Artery'.
    """
    artery_codes = ('Artery',)
    return 1 if (df['Condition1'] in artery_codes or df['Condition2'] in artery_codes) else 0

In [102]:
# Build the Artery (adjacent to arterial road) indicator column by evaluating
# near_art on every row
train['Artery'] = train.apply(near_art, axis='columns')

In [103]:
# Sanity check after adding Artery: should see 1 more column than the previous
# shape check and no change in the number of rows
train.shape

(1984, 48)

In [105]:
# Function to convert a row's ordinal KitchenQual rating to a number
def qual_to_num_kit(df):
    """Map a row's KitchenQual rating to its ordinal score.

    Parameters
    ----------
    df : a single row indexed by column name (for example the Series that
        DataFrame.apply passes with axis=1). Only df['KitchenQual'] is read.

    Returns
    -------
    1, 2, 3, 4 or 5 for 'Po', 'Fa', 'TA', 'Gd', 'Ex' respectively.
    The value itself when it is already one of those scores, so applying the
    function to an already-converted column leaves it unchanged instead of
    wiping it. None for anything else (unknown label, missing value).
    """
    scores = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    rating = df['KitchenQual']

    if isinstance(rating, str):
        # Unknown labels fall through to None, as before
        return scores.get(rating)

    # Already encoded (e.g. the conversion cell was run a second time):
    # pass the score through rather than turning it into None
    if rating in scores.values():
        return rating

    return None

In [106]:
# Replace the KitchenQual string values with their numerical scores by
# evaluating qual_to_num_kit on every row. The column is overwritten in place:
# after this cell it holds the scores instead of the original strings.
train['KitchenQual'] = train.apply(qual_to_num_kit, axis='columns')

In [111]:
# Sanity check after converting KitchenQual in place: should see no column
# increase and no change in the number of rows
train.shape

(1984, 48)