In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw Ames housing price CSV into `df`, the frame that the later cells transform.
# NOTE(review): absolute path — this only resolves where /data exists; consider a configurable data directory.
df = pd.read_csv("/data/Ames_Housing_Price_Data_raw.csv")

In [None]:
# Replace the index with a fresh 0..n-1 range and discard the old one.
# Original rationale: the data has rows that share the same index, i.e. the numbering
# starts over at 1 at certain points.
# NOTE(review): read_csv without index_col already produces a 0..n-1 index, so the repeated
# numbering may actually live in an ordinary column — confirm this step does what is intended.
df = df.reset_index(drop = True)

# Type dictionaries

In [None]:
# Measurement type ('nominal', 'ordinal' or 'continuous') for each column covered by
# this part of the notebook. Keys use the column names as they stand after the
# renames/derivations performed in the cells below (e.g. 'maybe_LotShape', 'LotShape_com').
typedict_matt = {'PID' : 'nominal',
            'SalePrice' : 'continuous', 
            'LotFrontage' : 'continuous', 
            'LotArea' : 'continuous',
            'maybe_LotShape' : 'nominal',
            'LandSlope' : 'nominal', 
            'LandContour' : 'nominal', 
            'maybe_MSZoning' : 'nominal', 
            'Street_paved' : 'nominal', 
            'Alley' : 'nominal',
            'Neighborhood' : 'nominal', 
            'drop_LotConfig' : 'nominal', 
            'drop_Condition1' : 'nominal', 
            'drop_Condition2' : 'nominal',
            'Foundation' : 'nominal',
            'Utilities' : 'nominal',
            'Heating' : 'nominal',
            # NOTE(review): the '_nom' suffix suggests a nominal column, yet it is typed
            # 'ordinal' — confirm this is intended.
            'HeatingQC_nom' : 'ordinal',
            'CentralAir' : 'nominal',
            'Electrical' : 'nominal',
            'HeatingQC_ord' : 'ordinal',
            'LotShape_com' : 'nominal',
            'MSZoning_com' : 'nominal',
            # LF_* entries: location-factor indicator columns
            'LF_Normal' : 'nominal',
            'LF_Near_NS_RR' : 'nominal',
            'LF_Near_Positive_Feature' : 'nominal',
            'LF_Adjacent_Arterial_St' : 'nominal',
            'LF_Near_EW_RR' : 'nominal',
            'LF_Adjacent_Feeder_St' : 'nominal',
            # NOTE(review): 'Postive' is likely a misspelling of 'Positive' (see the
            # 'LF_Near_Positive_Feature' key above) — confirm whether this key is still needed.
            'LF_Near_Postive_Feature' : 'nominal',
            'Heating_com' : 'nominal',
            'Electrical_com' : 'nominal',
            'LotConfig_com' : 'nominal', 
            'LotFrontage_log' : 'continuous',
            'LotArea_log' : 'continuous'
}

# Matt

In [None]:
# Ordinalize heating quality ratings: 'Ex' (5) down to 'Po' (1).
# Missing ratings are replaced by the placeholder string '0' below, so the lookup
# must contain that placeholder too; without it the map step raises KeyError as
# soon as a single rating is missing.
HousingQC_dict={
       'Ex':5,
       'Gd':4,
       'TA':3,
       'Fa':2,
       'Po':1,
       '0':0,
}

# fill missing ratings with the '0' placeholder (scored 0)
df.loc[df['HeatingQC'].isna(),'HeatingQC']='0'
# numeric version of the rating; an unexpected rating string still fails loudly
df['HeatingQC_ord']=df['HeatingQC'].map(lambda x: HousingQC_dict[x])
# keep the original string ratings under the '_nom' name
df.rename(columns={'HeatingQC':'HeatingQC_nom'}, inplace=True)

In [None]:
# LotShape: IR2 (moderately irregular) and IR3 (irregular) are merged into a single
# 'Irregular' level due to small sample sizes. Values missing from the lookup pass
# through unchanged.
lot_shape_dict = {
    'Reg': 'Regular',
    'IR1': 'Slightly irregular',
    'IR2': 'Irregular',
    'IR3': 'Irregular',
}
df['LotShape_com'] = df['LotShape'].map(lambda code: lot_shape_dict.get(code, code))
# the untouched source column is kept under a 'maybe_' prefix
df.rename(columns={'LotShape': 'maybe_LotShape'}, inplace=True)

In [None]:
# LandSlope: Mod (moderate) and Sev (severe) are merged into 'Moderate-severe' due to
# small sample sizes. The column is relabelled in place; values missing from the
# lookup pass through unchanged.
land_slope_dict = {
    'Gtl': 'Gentle',
    'Mod': 'Moderate-severe',
    'Sev': 'Moderate-severe',
}
df['LandSlope'] = df['LandSlope'].map(lambda code: land_slope_dict.get(code, code))

In [None]:
# Alley: missing values are labelled 'No alley access' and the remaining codes get
# readable names. Values missing from the lookup pass through unchanged.
alley_dict = {
    'Pave': 'Paved',
    'Grvl': 'Gravel',
    'No alley access': 'No alley access',
}
df.loc[df['Alley'].isna(), 'Alley'] = 'No alley access'
df['Alley'] = df['Alley'].map(lambda code: alley_dict.get(code, code))

In [None]:
# LandContour: replace the short codes with descriptive labels (no levels are merged).
# Values missing from the lookup pass through unchanged.
LandContour_dict = {
    'Lvl': 'Level',
    'Bnk': 'Banked (rise from street level to building)',
    'HLS': 'Hillside (downward slope on both sides)',
    'Low': 'Depression (upward slope on both sides)',
}

df['LandContour'] = df['LandContour'].map(lambda code: LandContour_dict.get(code, code))

In [None]:
# MSZoning: the commercial 'C (all)', industrial 'I (all)' and agricultural 'A (agr)'
# zoning types are merged into 'Nonresidential' due to small sample sizes and because
# the focus is on residential sales. Values missing from the lookup pass through unchanged.
MSZoning_dict = {
    'RL': 'Residential, low-density',
    'RM': 'Residential, medium-density',
    'FV': 'Residential, village',
    'RH': 'Residential, high-density',
    'C (all)': 'Nonresidential',
    'I (all)': 'Nonresidential',
    'A (agr)': 'Nonresidential',
}

df['MSZoning_com'] = df['MSZoning'].map(lambda zone: MSZoning_dict.get(zone, zone))
# the untouched source column is kept under a 'maybe_' prefix
df.rename(columns={'MSZoning': 'maybe_MSZoning'}, inplace=True)

In [None]:
# Relabel the Condition1/Condition2 codes as location factors (LF_<factor>).
# 'Near (within 200 ft)' and 'Adjacent to' are combined into a single 'Near' label for
# the North-South railroad, the East-West railroad, and positive features (parks,
# greenways, etc). The LF_ prefix is used instead of "condition" to avoid confusion,
# because "condition" also describes the state of maintenance of other features in
# the dataset.
Condition_dict = {
    'Norm' : 'LF_Normal',
    'RRAn' : 'LF_Near_NS_RR',
    'PosN' : 'LF_Near_Positive_Feature',
    'Artery' : 'LF_Adjacent_Arterial_St',
    'RRAe' : 'LF_Near_EW_RR',
    'Feedr' : 'LF_Adjacent_Feeder_St',
    # Fixed: this label was misspelled 'LF_Near_Postive_Feature', which kept PosA in a
    # category of its own instead of combining it with PosN as described above.
    'PosA' : 'LF_Near_Positive_Feature',
    'RRNn' : 'LF_Near_NS_RR',
    'RRNe' : 'LF_Near_EW_RR'
}

# codes missing from the lookup (including missing values) pass through unchanged
df['Condition1'] = df['Condition1'].map(lambda x: Condition_dict[x] if x in Condition_dict else x)
df['Condition2'] = df['Condition2'].map(lambda x: Condition_dict[x] if x in Condition_dict else x)

In [None]:
def combine_condition_columns(df, factors):
    '''
    Add one '1'/'0' indicator column per location factor.

    For every value in ``factors`` a column with that name is written to ``df``. A row
    gets the string '1' when its "Condition1" or "Condition2" value equals the factor,
    and the string '0' otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Condition1" and "Condition2". Modified in place.
    factors : iterable of str
        Factor labels to create indicator columns for. Repeated labels are harmless:
        the column is simply written again with the same values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added.
    '''
    for factor in factors:
        # Whole-column comparison: works for any index (not only 0..n-1) and avoids
        # setting cells one at a time.
        matches = (df['Condition1'] == factor) | (df['Condition2'] == factor)
        df[f'{factor}'] = np.where(matches, '1', '0')
    return df

In [None]:
# Every location-factor label produced by Condition_dict. This is a dict view, so a
# label that several raw codes map to (e.g. 'LF_Near_NS_RR') appears more than once.
loc_factors = Condition_dict.values()
# Add one indicator column per label to df.
df = combine_condition_columns(df, loc_factors)

In [None]:
# Mark both relabelled condition columns as recommended drops via a 'drop_' prefix
df.rename(
    columns={
        'Condition1': 'drop_Condition1',
        'Condition2': 'drop_Condition2',
    },
    inplace=True,
)

In [None]:
# Rename the 'Street' column to 'Street_paved'; the values themselves are left unchanged.
df.rename(columns={'Street':'Street_paved'}, inplace=True)

In [None]:
# Utilities: replace two of the short codes with clearer labels. Values missing from
# the lookup pass through unchanged.
Utilities_dict = {
    'AllPub': 'EGWS',
    'NoSewr': 'EGW with septic tank',
}

df['Utilities'] = df['Utilities'].map(lambda code: Utilities_dict.get(code, code))

In [None]:
# Heating: 'Gravity furnace', 'Other water/steam heating', 'Floor furnace' and
# 'Wall furnace' are merged into 'Other' due to small sample size. The raw 'Heating'
# column is kept; the combined labels go to 'Heating_com'. Values missing from the
# lookup pass through unchanged.
Heating_dict = {
    'GasA': 'Gas-powered forced-air heating',
    'GasW': 'Gas-powered water/steam heating',
    'Grav': 'Other',
    'OthW': 'Other',
    'Floor': 'Other',
    'Wall': 'Other',
}

df['Heating_com'] = df['Heating'].map(lambda code: Heating_dict.get(code, code))

In [None]:
# Electrical: descriptive labels, with FuseF and FuseP merged into one level due to
# small sample size; the data dictionary also reports them as the 2 most undesirable
# electrical setups. The raw 'Electrical' column is kept; the combined labels go to
# 'Electrical_com'. Values missing from the lookup pass through unchanged.
Electrical_dict = {
    'SBrkr': 'Standard circuit breakers, all Romex wiring',
    'FuseA': '>60 Amp fuse box, all Romex wiring',
    'FuseF': '60 Amp fuse box, Romex or older wiring',
    'FuseP': '60 Amp fuse box, Romex or older wiring',
}

df['Electrical_com'] = df['Electrical'].map(lambda code: Electrical_dict.get(code, code))

In [None]:
# LotConfig: FR2 (2 sides frontage) and FR3 (3 sides frontage) are merged into
# '2+ sides frontage' due to small sample size. Values missing from the lookup pass
# through unchanged.
LotConfig_dict = {
    'Inside': 'Inside lot (1 side frontage)',
    'Corner': 'Corner lot',
    'CulDSac': 'Cul-de-sac lot',
    'FR2': '2+ sides frontage',
    'FR3': '2+ sides frontage',
}

df['LotConfig_com'] = df['LotConfig'].map(lambda code: LotConfig_dict.get(code, code))
# the untouched source column is flagged with a 'drop_' prefix
df.rename(columns={'LotConfig': 'drop_LotConfig'}, inplace=True)

In [None]:
# Natural log of LotFrontage, stored as a new column next to the original.
# NOTE(review): np.log gives NaN for missing values and -inf for zeros — confirm that
# any such rows in LotFrontage are handled downstream.
df['LotFrontage_log'] = np.log(df['LotFrontage'])

In [None]:
# Natural log of LotArea, stored as a new column next to the original.
# NOTE(review): assumes LotArea is strictly positive (np.log gives -inf for zeros) — confirm.
df['LotArea_log'] = np.log(df['LotArea'])

# Oren

# Hao-Wei

# Mo

In [None]:
# Integer scores for quality/condition ratings: 'Ex' scores 5, stepping down by one
# through 'Gd', 'TA', 'Fa' to 'Po' at 1. Both 'NA' and the '0' placeholder score 0.
Cond_dict = {
    grade: score
    for score, grade in zip(range(5, 0, -1), ('Ex', 'Gd', 'TA', 'Fa', 'Po'))
}
Cond_dict.update({'NA': 0, '0': 0})

In [None]:
# Turn the four quality/condition rating columns into the integer scores defined by
# Cond_dict. Missing ratings are first replaced by the '0' placeholder, which Cond_dict
# scores as 0; any rating not present in Cond_dict raises KeyError.
for rating_col in ('ExterQual', 'ExterCond', 'KitchenQual', 'FireplaceQu'):
    df.loc[df[rating_col].isna(), rating_col] = '0'
    df[rating_col] = df[rating_col].map(lambda grade: Cond_dict[grade])