In [1]:
import numpy as np
import pandas as pd

from scipy import stats
import itertools
from sklearn import linear_model
from numpy import ones,vstack
from numpy.linalg import lstsq

In [2]:
# Load the raw Ames housing export; the first CSV column becomes the row index.
df = pd.read_csv("data/Ames_Housing_Price_Data_raw.csv", index_col = 0)

In [3]:
# Reset the index because the raw file's index restarts at 1 at certain points, so
# row labels are duplicated. Later cells address rows by integer label
# (e.g. df.loc[i, ...] and df.loc[653, ...]), which relies on this fresh 0..n-1 index.
df = df.reset_index(drop = True)

# Type dictionaries

In [4]:
# Lookup from column name to measurement scale, grouped by the team member who cleaned
# the feature. Keys are column names as they exist after the renames performed further
# down in this notebook (including 'drop_' / 'maybe_' prefixes and derived columns).
# NOTE(review): not referenced anywhere else in this notebook -- presumably consumed by
# downstream modelling code; confirm.
# NOTE(review): scale labels are cased inconsistently ('nominal' / 'continuous' in the
# first group, 'Nominal' / 'Ordinal' / 'Continuous' / 'Discrete' afterwards), so any
# consumer comparing labels has to be case-insensitive.
typedict = {'PID' : 'nominal',
            'SalePrice' : 'continuous',
            #Matt
            'LotFrontage' : 'continuous', 
            'LotArea' : 'continuous',
            'maybe_LotShape' : 'nominal',
            'LandSlope' : 'nominal', 
            'LandContour' : 'nominal', 
            'maybe_MSZoning' : 'nominal', 
            'Street_paved' : 'nominal', 
            'Alley' : 'nominal',
            'Neighborhood' : 'nominal', 
            'drop_LotConfig' : 'nominal', 
            'drop_Condition1' : 'nominal', 
            'drop_Condition2' : 'nominal',
            'Foundation' : 'nominal',
            'Utilities' : 'nominal',
            'Heating' : 'nominal',
            'HeatingQC_nom' : 'ordinal',
            'CentralAir' : 'nominal',
            'Electrical' : 'nominal',
            'HeatingQC_ord' : 'ordinal',
            'LotShape_com' : 'nominal',
            'MSZoning_com' : 'nominal',
            # LF_* keys are the location-factor dummy columns built from Condition1/Condition2.
            # NOTE(review): the notebook later renames 'LF_Normal' to 'drop_LF_Normal'.
            'LF_Normal' : 'nominal',
            'LF_Near_NS_RR' : 'nominal',
            'LF_Near_Positive_Feature' : 'nominal',
            'LF_Adjacent_Arterial_St' : 'nominal',
            'LF_Near_EW_RR' : 'nominal',
            'LF_Adjacent_Feeder_St' : 'nominal',
            # NOTE(review): the 'Postive' spelling must stay in sync with the label that
            # Condition_dict assigns to 'PosA'.
            'LF_Near_Postive_Feature' : 'nominal',
            'Heating_com' : 'nominal',
            'Electrical_com' : 'nominal',
            'LotConfig_com' : 'nominal', 
            'LotFrontage_log' : 'continuous',
            'LotArea_log' : 'continuous',
            #Oren 
            'MiscFeature': 'Nominal',
            'Fireplaces': 'Discrete',
            'FireplaceQu': 'Ordinal',
            'PoolQC': 'Ordinal',
            'PoolArea': 'Continuous',
            'PavedDrive': 'Nominal',
            'ExterQual': 'Ordinal',
            'OverallQual': 'Ordinal',
            # NOTE(review): the rename cell in the "Oren" section produces 'maybe_OverallCond',
            # 'maybe_ExterCond' and 'drop_Exterior1st', which do not match the prefixes used
            # for the corresponding keys in this group.
            'drop_OverallCond': 'Ordinal',
            'MiscVal': 'Continuous',
            'YearBuilt': 'Discrete',
            'YearRemodAdd': 'Discrete',
            'KitchenQual': 'Ordinal',
            'Fence': 'Ordinal',
            'RoofStyle': 'Nominal',
            'RoofMatl': 'Nominal',
            'maybe_Exterior1st': 'Nominal',
            'drop_Exterior2nd': 'Nominal',
            'drop_ExterCond': 'Ordinal',
            'maybe_MasVnrType': 'Nominal',
            'MasVnrArea': 'Continuous',
            #Mo
            #Basement
            'BsmtQual_ord': 'Ordinal',
            'BsmtCond_ord': 'Ordinal',
            'BsmtExposure_ord': 'Ordinal',
            'BsmtQual_ord_lin': 'Ordinal',
            'BsmtCond_ord_lin': 'Ordinal',
            'BsmtExposure_ord_lin': 'Ordinal',
            'TotalBsmtSF': 'Continuous',
            'BSMT_GLQ':'Continuous', 
            'BSMT_Rec':'Continuous',
            'maybe_BsmtUnfSF': 'Continuous',
            'maybe_BSMT_ALQ':'Continuous',
            'maybe_BSMT_BLQ':'Continuous', 
            'maybe_BSMT_LwQ':'Continuous', 
            'drop_BsmtQual': 'Nominal',
            'drop_BsmtCond': 'Nominal',
            'drop_BsmtExposure': 'Nominal',
            'drop_BsmtFinType1': 'Nominal',
            'drop_BsmtFinSF1': 'Continuous',
            'drop_BsmtFinType2': 'Nominal',
            'drop_BsmtFinSF2': 'Continuous',
            #Deck
            'WoodDeckSF':'Continuous', 
            'OpenPorchSF':'Continuous', 
            'ScreenPorch':'Continuous',
            'maybe_EnclosedPorch':'Continuous',
            'maybe_3SsnPorch':'Continuous',
            #Garage
            'GarageFinish':'Nominal', 
            'GarageYrBlt':'Continuous',
            'GarageCars':'Ordinal',
            'GarageArea':'Continuous',
            'GarageType_con':'Nominal',
            'maybe_GarageQual':'Nominal', 
            'maybe_GarageCond':'Nominal',
            'drop_GarageType':'Nominal'
}

In [5]:
#Categorization of original variables
# Each list groups raw (pre-rename) column names by theme. The lists are reference
# material only: nothing else in this notebook reads them.
general=['PID','SalePrice']
lot_aspects=['LotFrontage','LotArea','LotShape','LandSlope','LandContour']
building_size=['MSSubClass','BldgType','HouseStyle','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd']
location=['MSZoning','Street','Alley','Neighborhood']
location_aspects=['LotConfig','Condition1','Condition2']
amenities=['MiscFeature','Fireplaces','FireplaceQu','PoolQC','PoolArea','PavedDrive']
garage=['GarageFinish','GarageType','GarageYrBlt','GarageCars','GarageArea','GarageQual','GarageCond']
decks=['WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch']
basement=['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1','BsmtFinType2','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF']
utilities=['Foundation','Utilities','Heating','HeatingQC','CentralAir','Electrical']
quality_ratings=['ExterQual','OverallQual','OverallCond','MiscVal','YearBuilt','YearRemodAdd','KitchenQual','Fence','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','ExterCond','MasVnrType','MasVnrArea']
sales_aspect=['Functional','SaleCondition','SaleType','MoSold','YrSold']

# Matt

In [6]:
# Add a base-10 log of the sale price as a new column (the raw SalePrice column is kept).

df['SalePrice_log']=np.log10(df['SalePrice'])

In [7]:
# ordinalize heating quality ratings
# 'NA' / '0' are the placeholders for a missing rating and score 0, matching the
# other quality scales in this notebook. Without the '0' entry the lookup below raised
# KeyError for every row whose HeatingQC was missing.
HousingQC_dict={
       'Ex':5,
       'Gd':4,
       'TA':3,
       'Fa':2,
       'Po':1,
       'NA':0,
       '0':0,
}

# Missing ratings become the '0' placeholder before the lookup.
df.loc[df['HeatingQC'].isna(),'HeatingQC']='0'
# Unknown labels still raise KeyError on purpose so unexpected codes are noticed.
df['HeatingQC_ord']=df['HeatingQC'].map(lambda x: HousingQC_dict[x])
# Keep the original text labels next to the numeric score under a '_nom' name.
df.rename(columns={'HeatingQC':'HeatingQC_nom'}, inplace=True)

In [8]:
# LotShape: IR2 (moderately irregular) and IR3 (irregular) are pooled into 'Irregular'
# because both have small sample sizes. Values without an entry pass through unchanged.
lot_shape_dict = dict(
    Reg='Regular',
    IR1='Slightly irregular',
    IR2='Irregular',
    IR3='Irregular',
)
df['LotShape_com'] = df['LotShape'].map(lambda shape: lot_shape_dict.get(shape, shape))
# The raw column is kept, flagged as a "maybe" feature.
df = df.rename(columns={'LotShape': 'maybe_LotShape'})

In [9]:
# LandSlope: Mod (moderate) and Sev (severe) are pooled into 'Moderate-severe' because
# both have small sample sizes. Values without an entry pass through unchanged.
land_slope_dict = dict(
    Gtl='Gentle',
    Mod='Moderate-severe',
    Sev='Moderate-severe',
)
df['LandSlope'] = df['LandSlope'].map(lambda slope: land_slope_dict.get(slope, slope))

In [10]:
# Alley: a missing value is relabelled 'No alley access'; the two surface codes get
# readable names. Values without an entry pass through unchanged.
alley_dict = {
    'Pave': 'Paved',
    'Grvl': 'Gravel',
    'No alley access': 'No alley access',
}
df['Alley'] = df['Alley'].fillna('No alley access')
df['Alley'] = df['Alley'].map(lambda surface: alley_dict.get(surface, surface))

In [11]:
# LandContour: plain relabelling for readability, no categories are merged.
# Values without an entry pass through unchanged.
LandContour_dict = dict(
    Lvl='Level',
    Bnk='Banked (rise from street level to building)',
    HLS='Hillside (downward slope on both sides)',
    Low='Depression (upward slope on both sides)',
)

df['LandContour'] = df['LandContour'].map(lambda contour: LandContour_dict.get(contour, contour))

In [12]:
# MSZoning: C (all) (commercial), I (all) (industrial) and A (agr) (agricultural) are
# pooled into 'Nonresidential' -- small sample sizes, and the analysis focuses on
# residential sales. Values without an entry pass through unchanged.
MSZoning_dict = {
    'RL': 'Residential, low-density',
    'RM': 'Residential, medium-density',
    'FV': 'Residential, village',
    'RH': 'Residential, high-density',
    'C (all)': 'Nonresidential',
    'I (all)': 'Nonresidential',
    'A (agr)': 'Nonresidential',
}

df['MSZoning_com'] = df['MSZoning'].map(lambda zone: MSZoning_dict.get(zone, zone))
# The raw column is kept, flagged as a "maybe" feature.
df = df.rename(columns={'MSZoning': 'maybe_MSZoning'})

In [13]:
# combine 'Near (within 200 ft)' and 'Adjacent to' into 'Near' for North-South RR, East-West RR, and positive features (parks, greenways, etc)
# renaming them LF_<factor> for Location Factor instead of condition to avoid confusion, as condition is also used to describe
# state of maintenance of various other features in the dataset
Condition_dict = {
    'Norm' : 'LF_Normal',
    'RRAn' : 'LF_Near_NS_RR',
    'PosN' : 'LF_Near_Positive_Feature',
    'Artery' : 'LF_Adjacent_Arterial_St',
    'RRAe' : 'LF_Near_EW_RR',
    'Feedr' : 'LF_Adjacent_Feeder_St',
    # Was spelled 'LF_Near_Postive_Feature', which kept PosA in its own category (and
    # its own dummy column) instead of merging it with PosN as described above.
    'PosA' : 'LF_Near_Positive_Feature',
    'RRNn' : 'LF_Near_NS_RR',
    'RRNe' : 'LF_Near_EW_RR'
}

# Codes without an entry (including missing values) pass through unchanged.
df['Condition1'] = df['Condition1'].map(lambda x: Condition_dict[x] if x in Condition_dict else x)
df['Condition2'] = df['Condition2'].map(lambda x: Condition_dict[x] if x in Condition_dict else x)

In [14]:
def combine_condition_columns(df, factors):
    '''
    Add one dummy column per location factor, derived from the two condition columns.

    For every value in `factors`, a column named after that value is written to `df`.
    It holds the string '1' where either 'Condition1' or 'Condition2' equals the factor
    and the string '0' otherwise.

    Parameters
    ----------
    df : DataFrame with 'Condition1' and 'Condition2' columns. Modified in place.
    factors : iterable of category labels to turn into dummy columns. Labels that repeat
        simply rewrite the same column.

    Returns
    -------
    The same DataFrame object, for convenient re-assignment.

    The comparison is done column-wise, so the row index does not have to be 0..n-1
    (the earlier row-by-row version looked rows up by integer label).
    '''
    for factor in factors:
        in_either = (df['Condition1'] == factor) | (df['Condition2'] == factor)
        # Flags are stored as the strings '1' / '0', as the rest of the notebook expects.
        df[f'{factor}'] = np.where(in_either, '1', '0')
    return df

In [15]:
# Several raw codes share one label, so Condition_dict.values() repeats labels.
# De-duplicate (keeping first-seen order, hence the same column order) so each dummy
# column is computed only once.
loc_factors = list(dict.fromkeys(Condition_dict.values()))
df = combine_condition_columns(df, loc_factors)

In [16]:
# Recommended drops: the two recoded condition columns and the 'LF_Normal' dummy.
# The 'drop_' prefix is what the final cell filters on.
df.rename(
    columns={
        'Condition1': 'drop_Condition1',
        'Condition2': 'drop_Condition2',
        'LF_Normal': 'drop_LF_Normal',
    },
    inplace=True,
)

In [17]:
# 'Street' becomes 'Street_paved'; only the column name changes.
df = df.rename(columns={'Street': 'Street_paved'})

In [18]:
# Utilities: plain relabelling for readability. Values without an entry pass through unchanged.
Utilities_dict = dict(
    AllPub='EGWS',
    NoSewr='EGW with septic tank',
)

df['Utilities'] = df['Utilities'].map(lambda util: Utilities_dict.get(util, util))

In [19]:
# Heating: 'Gravity furnace', 'Other water/steam heating', 'Floor furnace' and
# 'Wall furnace' are pooled into 'Other' because of small sample sizes.
# Values without an entry pass through unchanged.
Heating_dict = dict(
    GasA='Gas-powered forced-air heating',
    GasW='Gas-powered water/steam heating',
    Grav='Other',
    OthW='Other',
    Floor='Other',
    Wall='Other',
)

df['Heating_com'] = df['Heating'].map(lambda system: Heating_dict.get(system, system))

In [20]:
# Electrical: readable labels; FuseP and FuseF are pooled because of small sample sizes
# (they are also the two least desirable setups according to the data dictionary).
# Values without an entry pass through unchanged.
Electrical_dict = dict(
    SBrkr='Standard circuit breakers, all Romex wiring',
    FuseA='>60 Amp fuse box, all Romex wiring',
    FuseF='60 Amp fuse box, Romex or older wiring',
    FuseP='60 Amp fuse box, Romex or older wiring',
)

df['Electrical_com'] = df['Electrical'].map(lambda setup: Electrical_dict.get(setup, setup))

In [21]:
# LotConfig: FR2 (2 sides frontage) and FR3 (3 sides frontage) are pooled into
# '2+ sides frontage' because of small sample sizes.
# Values without an entry pass through unchanged.
LotConfig_dict = dict(
    Inside='Inside lot (1 side frontage)',
    Corner='Corner lot',
    CulDSac='Cul-de-sac lot',
    FR2='2+ sides frontage',
    FR3='2+ sides frontage',
)

df['LotConfig_com'] = df['LotConfig'].map(lambda config: LotConfig_dict.get(config, config))
# The raw column is marked for removal in the final cell.
df = df.rename(columns={'LotConfig': 'drop_LotConfig'})

In [22]:
# Natural log of lot frontage.
# NOTE(review): this uses np.log (base e) while SalePrice_log uses np.log10 -- confirm
# the mixed bases are intended.
df['LotFrontage_log'] = np.log(df['LotFrontage'])

In [23]:
# Natural log of lot area.
# NOTE(review): this uses np.log (base e) while SalePrice_log uses np.log10 -- confirm
# the mixed bases are intended.
df['LotArea_log'] = np.log(df['LotArea'])

# Oren

In [24]:
# Shared 0-5 scoring for the Ex/Gd/TA/Fa/Po quality ratings.
# 'NA' and '0' both score 0; '0' is the placeholder the next cell writes into
# missing ratings before looking them up here.
Cond_dict={
       'Ex':5,
       'Gd':4,
       'TA':3,
       'Fa':2,
       'Po':1,
       'NA':0,
        '0':0
}

In [25]:
# Convert the four Ex..Po rated columns to their numeric score in place.
# A missing rating is first replaced by the '0' placeholder (score 0); any label that is
# not in Cond_dict raises KeyError.
for quality_col in ('ExterQual', 'ExterCond', 'KitchenQual', 'FireplaceQu'):
    df[quality_col] = df[quality_col].fillna('0')
    df[quality_col] = df[quality_col].map(lambda rating: Cond_dict[rating])

In [26]:
# Driveway surface: readable labels. The literal string 'NA' is passed through; any
# other value that is not a key raises KeyError.
Paved_Drive_Dict = dict(
    Y='Paved',
    P='Partial Pavement',
    N='Dirt Gravel',
)
df['PavedDrive'] = df['PavedDrive'].map(
    lambda code: code if code == 'NA' else Paved_Drive_Dict[code]
)

# Fence: a missing value becomes 'NA', which is labelled 'No Fence'.
Fence_Dict = dict(
    GdPrv='Good Privacy',
    MnPrv='Minimum Privacy',
    GdWo='Good Wood',
    MnWw='Minimum Wood/Wire',
    NA='No Fence',
)
df['Fence'] = df['Fence'].fillna('NA').map(lambda code: Fence_Dict[code])

# Miscellaneous feature: a missing value becomes 'NA', which is labelled 'Nothing'.
Misc_Feature_Dict = dict(
    Elev='Elevator',
    Gar2='2nd Garage',
    Othr='Other',
    Shed='Shed',
    TenC='Tennis Court',
    NA='Nothing',
)
df['MiscFeature'] = df['MiscFeature'].fillna('NA').map(lambda code: Misc_Feature_Dict[code])


# Roof style: readable labels. The literal string 'NA' is passed through; any other
# value that is not a key raises KeyError.
# NOTE(review): 'Gabrel Barn' looks like a typo for 'Gambrel Barn'; left as is because
# the label ends up in the exported data.
Roof_Style_Dict = dict(
    Flat='Flat',
    Gable='Gable',
    Gambrel='Gabrel Barn',
    Hip='Hip',
    Mansard='Mansard',
    Shed='Shed',
)
df['RoofStyle'] = df['RoofStyle'].map(
    lambda code: code if code == 'NA' else Roof_Style_Dict[code]
)

# Roof material: same treatment as roof style.
Roof_Matl_Dict = {
    'ClyTile': 'Clay or Tile',
    'CompShg': 'Standard (Composite) Shingle',
    'Membran': 'Membrane',
    'Metal': 'Metal',
    'Roll': 'Roll',
    'Tar&Grv': 'Gravel & Tar',
    'WdShake': 'Wood Shakes',
    'WdShngl': 'Wood Shingles',
}
df['RoofMatl'] = df['RoofMatl'].map(
    lambda code: code if code == 'NA' else Roof_Matl_Dict[code]
)
    
# Exterior covering: readable labels shared by Exterior1st and Exterior2nd. Several raw
# spellings map to the same label (e.g. 'CemntBd' / 'CmentBd', 'WdShing' / 'Wd Shng').
Exterior_Dict = {
    'AsbShng': 'Asbestos Shingles',
    'AsphShn': 'Asphalt Shingles',
    'BrkComm': 'Brick Common',
    'BrkFace': 'Brick Face',
    'CBlock': 'Cinder Block',
    'CemntBd': 'Cement Board',
    'CmentBd': 'Cement Board',
    'HdBoard': 'Hard Board',
    'ImStucc': 'Imitation Stucco',
    'MetalSd': 'Metal Siding',
    'Other': 'Other',
    'Plywood': 'Plywood',
    'PreCast': 'PreCast',
    'Stone': 'Stone',
    'Stucco': 'Stucco',
    'VinylSd': 'Vinyl Siding',
    'Wd Sdng': 'Wood Siding',
    'WdShing': 'Wood Shingles',
    'Wd Shng': 'Wood Shingles',
    'Brk Cmn': 'Brick Common',
}
# The literal string 'NA' is passed through; any other value that is not a key raises KeyError.
for exterior_col in ('Exterior1st', 'Exterior2nd'):
    df[exterior_col] = df[exterior_col].map(
        lambda code: code if code == 'NA' else Exterior_Dict[code]
    )

# Masonry veneer type: a missing value becomes 'None' before relabelling.
Mas_Vnr_Type_Dict = {
    'BrkCmn': 'Brick Common',
    'BrkFace': 'Brick Face',
    'CBlock': 'Cinder Block',
    'None': 'None',
    'Stone': 'Stone',
}
df['MasVnrType'] = df['MasVnrType'].fillna('None')
df['MasVnrType'] = df['MasVnrType'].map(
    lambda code: code if code == 'NA' else Mas_Vnr_Type_Dict[code]
)

In [27]:
def combine_exterior_columns(df, factors):
    '''
    Add one dummy column per exterior material, derived from the two exterior columns.

    For every value in `factors`, a column named after that value is written to `df`.
    It holds the string '1' where either 'Exterior1st' or 'Exterior2nd' equals the
    factor and the string '0' otherwise.

    Parameters
    ----------
    df : DataFrame with 'Exterior1st' and 'Exterior2nd' columns. Modified in place.
    factors : iterable of material labels (cell values) to turn into dummy columns.
        These must be values found in the two columns, not the column names:
        a factor equal to a source column name overwrites that column.

    Returns
    -------
    The same DataFrame object, for convenient re-assignment.

    The comparison is done column-wise, so the row index does not have to be 0..n-1
    (the earlier row-by-row version looked rows up by integer label).
    '''
    for factor in factors:
        in_either = (df['Exterior1st'] == factor) | (df['Exterior2nd'] == factor)
        # Flags are stored as the strings '1' / '0', matching the location-factor dummies.
        df[f'{factor}'] = np.where(in_either, '1', '0')
    return df

In [28]:
# Build one dummy per exterior material that actually occurs in either column.
# The previous call passed the column names ['Exterior1st', 'Exterior2nd'] as factors:
# no cell ever equals a column name, so no dummy was created and both source columns
# were overwritten with '0'.
exterior_factors = pd.concat([df['Exterior1st'], df['Exterior2nd']]).dropna().unique()
df=combine_exterior_columns(df, exterior_factors)

In [29]:
# Flag columns for the final selection: 'drop_' columns are removed in the last cells,
# 'maybe_' columns are kept but marked as candidates for removal.
quality_renames = {
    'OverallCond': 'maybe_OverallCond',
    'ExterCond': 'maybe_ExterCond',
    'Exterior2nd': 'drop_Exterior2nd',
    'MasVnrType': 'maybe_MasVnrType',
    'Exterior1st': 'drop_Exterior1st',
}
df = df.rename(columns=quality_renames)

# Mo

In [30]:
def variable_selection(x, y=None):
    '''
    Exhaustive best-subset search with ordinary least squares.

    For every subset size from 1 up to and including all columns of `x`, fit a linear
    regression on each combination of that many columns and print the best R^2 for the
    size, followed by the winning column tuple.

    Parameters
    ----------
    x : DataFrame of numeric predictor columns.
    y : optional target aligned with `x`. Defaults to the notebook-level
        df['SalePrice'], which is what the function always used before.

    Returns
    -------
    None. Results are printed, two lines per subset size.

    The number of fits grows as 2**n_columns, so only use this on small column sets.
    The earlier version stopped one size short (it never tried the full column set and
    printed nothing for a single-column frame) and scanned the scores twice per size.
    '''
    target = df['SalePrice'] if y is None else y
    lm = linear_model.LinearRegression()
    predictors = list(x.columns)

    for subset_size in range(1, len(predictors) + 1):
        best_score, best_subset = None, None

        for subset in itertools.combinations(predictors, subset_size):
            features = x[list(subset)]
            lm.fit(features, target)
            score = lm.score(features, target)
            # Strict '>' keeps the first subset when scores tie.
            if best_score is None or score > best_score:
                best_score, best_subset = score, subset

        print(best_score)
        print(best_subset)

In [31]:
#replace nominal with ordinal variables on standard scale with even steps

def linarization_func(var_name, data=None):
    '''
    Rescale an ordinal variable onto [0, 1] according to the mean sale price per level.

    A new column `<var_name>_lin` is added to the frame. The smallest level is pinned to
    0 and the largest level to 1. Every other level is placed at
    (mean price of the level - lowest level mean) / (highest level mean - lowest level mean),
    i.e. its position on the straight line between the lowest and highest level means.

    Parameters
    ----------
    var_name : name of the ordinal column, as a string.
    data : optional DataFrame holding `var_name` and 'SalePrice'. Defaults to the
        notebook-level `df`, which is what the function always used before.

    Returns
    -------
    None. The frame is modified in place.

    Assumes the column has no missing values (they cannot be ordered against the levels).
    If every level has the same mean price the line is flat, so intermediate levels are
    placed at 0.0 rather than dividing by zero.
    '''
    frame = df if data is None else data

    # Mean sale price per level, indexed by level.
    level_means = frame.groupby(var_name)['SalePrice'].mean()
    lowest_mean = level_means.min()
    mean_span = level_means.max() - lowest_mean

    levels = frame[var_name].unique()

    # End points are fixed by level order, not by which level has the extreme mean.
    position_of = {}
    position_of[min(levels)] = 0
    position_of[max(levels)] = 1

    for level in levels:
        if level not in position_of:
            if mean_span == 0:
                position_of[level] = 0.0
            else:
                position_of[level] = (level_means.loc[level] - lowest_mean) / mean_span

    frame[f'{var_name}_lin'] = frame[var_name].map(lambda level: position_of[level])


Basement

In [32]:
# 0-5 scoring for the Ex/Gd/TA/Fa/Po basement ratings (same values as Cond_dict).
# 'NA' and '0' both score 0; '0' is the placeholder written into missing ratings
# by the next cell.
master_dict={
       'Ex':5,
       'Gd':4,
       'TA':3,
       'Fa':2,
       'Po':1,
       'NA':0,
        '0':0
}

# 0-4 scoring for basement exposure (Gd/Av/Mn/No); 'NA' and '0' both score 0.
exp_dict={
       'Gd':4,
       'Av':3,
       'Mn':2,
       'No':1,
       'NA':0,
        '0':0
}

In [33]:
#replace nominal with ordinal variables on standard scale with even steps
# Each rated column gets a numeric '<name>_ord' twin (missing rating -> '0' -> 0);
# the raw column is renamed 'drop_<name>' so the final cell removes it.
for bsmt_col, rating_scale in (('BsmtCond', master_dict),
                               ('BsmtQual', master_dict),
                               ('BsmtExposure', exp_dict)):
    df[f'{bsmt_col}_ord'] = df[bsmt_col].fillna('0').map(lambda rating: rating_scale[rating])
    df.rename(columns={bsmt_col: f'drop_{bsmt_col}'}, inplace=True)

#drop 'unf' and 'NaN' dummies from BsmtFinType1 and BsmtFinType2 (unf covered through separate dummy already)
#need to merge dummies for BsmtFinType1 and BsmtFinType2
# BSMT_<type> = finished square feet of that quality, taken from whichever of the two
# (type, area) pairs carries the type. The two contributions are added, so a basement
# whose two finished areas share a type gets their total; the earlier version let the
# second area overwrite the first.
for fin_type in ('GLQ', 'ALQ', 'BLQ', 'LwQ', 'Rec'):
    area_from_first = df['BsmtFinSF1'].where(df['BsmtFinType1'] == fin_type, 0)
    area_from_second = df['BsmtFinSF2'].where(df['BsmtFinType2'] == fin_type, 0)
    df[f'BSMT_{fin_type}'] = area_from_first + area_from_second

df.rename(columns = {'BsmtFinType1': 'drop_BsmtFinType1','BsmtFinSF1': 'drop_BsmtFinSF1','BsmtFinType2': 'drop_BsmtFinType2','BsmtFinSF2': 'drop_BsmtFinSF2'}, inplace=True)

# Missing basement areas are recorded as 0.
df.loc[df['TotalBsmtSF'].isna(),'TotalBsmtSF']=0
df.loc[df['BsmtUnfSF'].isna(),'BsmtUnfSF']=0

#further columns I recommend we drop, based on them not having any effect by themselves on predicting sales prices
# (the former 'BsmtExposure' entry was removed: that column is already named
# 'drop_BsmtExposure' at this point, so the entry never matched anything)
df.rename(columns = {'BsmtUnfSF': 'maybe_BsmtUnfSF','BSMT_ALQ': 'maybe_BSMT_ALQ','BSMT_BLQ': 'maybe_BSMT_BLQ','BSMT_LwQ': 'maybe_BSMT_LwQ'}, inplace=True)


Porches/ Decks

In [34]:
# Mark the enclosed and three-season porch areas as "maybe" features; the other
# deck/porch columns keep their names.
df.rename(columns = {'EnclosedPorch': 'maybe_EnclosedPorch','3SsnPorch': 'maybe_3SsnPorch'}, inplace=True)


Garage

In [35]:
# Consolidate garage types based on better predictive power and the low impact of the other types:
# 'Basment', '2Types' and 'CarPort' are folded into 'Detchd'; 'Attchd' and 'BuiltIn'
# stay as they are. '0' is the placeholder the next cell writes for a missing garage type.
garagetype={
   'Detchd':'Detchd', 
    'Attchd':'Attchd', 
    'BuiltIn':'BuiltIn', 
    'Basment':'Detchd',  
    '2Types':'Detchd', 
    'CarPort':'Detchd',
    '0':'0'
}

In [36]:
# Consolidated garage type: a missing value becomes '0', then every value is looked up
# in `garagetype` (a label that is not a key raises KeyError).
df['GarageType_con'] = df['GarageType'].fillna('0').map(lambda kind: garagetype[kind])

# The raw type is marked for removal. GarageCond and GarageQual are flagged "maybe":
# basically no value, and almost all observations share one status.
df.rename(
    columns={
        'GarageType': 'drop_GarageType',
        'GarageCond': 'maybe_GarageCond',
        'GarageQual': 'maybe_GarageQual',
    },
    inplace=True,
)

# Year, area, finish and car count stay as is: all have strong predictive power and do
# not seem to allow for easy consolidation.

#keep year, area, Finish, cars as is, all have strong predictive power and do not seem to allow for easy consolidation


## Hao-Wei

There is also an all-in-one helper function called `data_cleaning_part_2`.

In [37]:
# MSSubClass code -> coarse dwelling-type label. Several codes share a label.
MB_dict = {
    # one-story codes
    20: "1-Story", 30: "1-Story", 40: "1-Story", 120: "1-Story",
    # one-and-a-half-story codes
    45: "1.5-Story", 50: "1.5-Story", 150: "1.5-Story",
    # two-story codes
    60: "2-Story", 70: "2-Story", 160: "2-Story",
    75: "2.5-Story",
    80: "SplitMulti", 180: "SplitMulti",
    190: "2FamConv",
    85: "SptFoyer",
    90: "Duplex",
}
# A code that is not in MB_dict raises KeyError.
df["MS_coded"] = df["MSSubClass"].map(lambda subclass: MB_dict[subclass])

In [38]:
def calc_floors(feat1, feat2):
    '''
    Derive a whole number of floors for each dwelling.

    feat1 : iterable of coded dwelling labels (e.g. '1-Story', 'Duplex').
    feat2 : iterable of HouseStyle codes (e.g. '1Story', 'SLvl'), paired with feat1.

    When the coded label states the story count it decides the result (half stories
    count as the lower whole number); otherwise the HouseStyle code is looked up.
    A HouseStyle code that is needed but unknown raises KeyError.
    Returns a list of ints, one per pair.
    '''
    floors_by_label = {
        '1-Story': 1,
        '1.5-Story': 1,
        '2-Story': 2,
        '2.5-Story': 2,
    }
    floors_by_style = {
        '1Story': 1,
        '1.5Fin': 1,
        '1.5Unf': 1,
        '2Story': 2,
        '2.5Fin': 2,
        '2.5Unf': 2,
        'SLvl': 2,
        'SFoyer': 2,
    }
    return [
        floors_by_label[label] if label in floors_by_label else floors_by_style[style]
        for label, style in zip(feat1, feat2)
    ]

In [39]:
# Whole number of floors: taken from the coded MSSubClass label when it states a story
# count, otherwise from HouseStyle.
df['number_floors'] = calc_floors(df['MS_coded'], df['HouseStyle'])

In [40]:
def calc_attic(feat1):
    '''
    Classify each HouseStyle code by its half-story (attic) finish.

    feat1 : iterable of HouseStyle codes.

    Returns a list with 'Finished' for '1.5Fin' / '2.5Fin', 'Unfinished' for
    '1.5Unf' / '2.5Unf' and 'No attic' for every other value.
    '''
    attic_by_style = {
        '1.5Fin': 'Finished',
        '2.5Fin': 'Finished',
        '1.5Unf': 'Unfinished',
        '2.5Unf': 'Unfinished',
    }
    return [attic_by_style.get(style, 'No attic') for style in feat1]

In [41]:
# Attic status ('Finished' / 'Unfinished' / 'No attic') derived from the HouseStyle code.
df['attic'] = calc_attic(df['HouseStyle'])

In [42]:
# 1 when the MSSubClass code is one of the four codes flagged as PUD, else 0.
pud_subclasses = [120, 150, 160, 180]
df['PUD'] = df['MSSubClass'].isin(pud_subclasses).astype(int)

In [43]:
# Manual correction of a single record: the row labelled 653 (after the index reset at
# the top of the notebook) gets BldgType '2fmCon'.
# NOTE(review): the reason for this override is not recorded here, and the hard-coded
# label depends on the row order of the raw file -- confirm it still targets the intended house.
df.loc[653,['BldgType']] = '2fmCon'

In [44]:
def data_cleaning_part_2(housing):
    '''
    All-in-one cleaning step for the building-size / sales columns.

    Parameters
    ----------
    housing : DataFrame containing at least 'Functional', '1stFlrSF', '2ndFlrSF' and
        'GrLivArea'. The docstring of the original version says it is the frame
        imported directly from person2.csv. The input is not modified.

    Returns
    -------
    A new DataFrame in which
    - every missing value is replaced by 0 and the index is reset to 0..n-1;
    - 'Functional_dis' holds the 0-7 functionality score (missing -> 0) and the text
      column 'Functional' is renamed 'Functional_ord';
    - '1stFlrSF_log', '2ndFlrSF_log' and 'GrLivArea_log' hold base-10 logs
      (the second-floor area is shifted by +1 so that 0 does not give -inf);
    - 'MSSubClass' and 'HouseStyle' get a 'drop_' prefix, and 'LowQualFinSF',
      'BsmtHalfBath', 'HalfBath', 'MoSold' and 'YrSold' get a 'maybe_' prefix
      (names that are absent are ignored).

    Fixes compared with the previous version:
    - it concatenated a `temp_dict` whose construction had been commented out, so every
      call ended in NameError; that step (and the disabled bathroom-weight / dwelling-type
      dummy experiments it belonged to) is removed;
    - missing 'Functional' values were meant to be recoded, but the fill to 0 happened
      first, so the recode never matched and the lookup raised KeyError.
    '''
    cleaned = housing.fillna(0).reset_index(drop=True)

    # Ordinal variable handling
    functionality_dict = {
        "Typ": 7,   # Typical Functionality
        "Min1": 6,  # Minor Deductions 1
        "Min2": 5,  # Minor Deductions 2
        "Mod": 4,   # Moderate Deductions
        "Maj1": 3,  # Major Deductions 1
        "Maj2": 2,  # Major Deductions 2
        "Sev": 1,   # Severely Damaged
        "Sal": 0,   # Salvage only
    }
    # After fillna(0) a missing rating is the integer 0; '0' is the placeholder used
    # elsewhere in this notebook. Both score 0. Any other unknown label still raises KeyError.
    functional_codes = {**functionality_dict, 0: 0, '0': 0}
    cleaned["Functional_dis"] = cleaned["Functional"].map(lambda level: functional_codes[level])
    cleaned = cleaned.rename(columns={"Functional": "Functional_ord"})

    # Adding columns with log scales
    log_columns = pd.DataFrame({
        "1stFlrSF_log": np.log10(cleaned["1stFlrSF"]),
        "2ndFlrSF_log": np.log10(cleaned["2ndFlrSF"] + 1),  # +1 to avoid -inf
        "GrLivArea_log": np.log10(cleaned["GrLivArea"]),
    })
    cleaned = pd.concat([cleaned, log_columns], axis=1)

    # Some of my personal selection
    cols_drop = ["MSSubClass", "HouseStyle"]  # No more "BldgType"
    cols_maybe = ["LowQualFinSF", "BsmtHalfBath", "HalfBath", "MoSold", "YrSold"]

    col_dict = {col: "drop_" + col for col in cols_drop}
    col_dict.update({col: "maybe_" + col for col in cols_maybe})

    return cleaned.rename(columns=col_dict)

In [45]:
def make_datetime(df):
    '''
    Build a sale date for every row from the 'YrSold' and 'MoSold' columns.

    Parameters
    ----------
    df : DataFrame with integer-like 'YrSold' (year) and 'MoSold' (month) columns.

    Returns
    -------
    A datetime Series, one entry per row of `df` and sharing its index, with each date
    set to the first day of the sale month.

    The earlier version looped over range(len(df) - 1) and therefore left the last row
    without a date. It also looked rows up by integer label and depended on a
    `datetime` import executed in a later cell.
    '''
    date_parts = pd.DataFrame({
        'year': df['YrSold'],
        'month': df['MoSold'],
        'day': 1,  # the day of sale is not recorded; use the first of the month
    })
    return pd.to_datetime(date_parts)

In [46]:
# NOTE(review): this import belongs in the imports cell at the top of the notebook.
# make_datetime is defined in the previous cell and refers to `datetime`; that only
# works because the name is resolved when the function is called, after this line ran.
from datetime import datetime
# Sale date (first day of the sale month) per row, assigned by index alignment.
df['sold_datetime'] = make_datetime(df)


In [47]:
#create linearized variables for these three ordinal variables (currently disabled)
# linarization_func('BsmtCond_ord')
# linarization_func('BsmtQual_ord')
# linarization_func('BsmtExposure_ord')

In [48]:
# Keep every column whose name does not start with the 'drop_' marker.
cols = [name for name in df.columns if not name.startswith('drop_')]
df = df.loc[:, cols]

In [49]:
# Write the cleaned frame to disk without the row index ('maybe_' columns are still included).
df.to_csv('./data/ames_housing_price_data_v2.csv', index = False)