In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm as anova
import itertools
from sklearn import linear_model
from numpy import ones,vstack
from numpy.linalg import lstsq

In [3]:
df=pd.read_csv('../data/ames_housing_price_data_v2.csv', index_col=0)

In [33]:
pd.options.display.max_rows=400

In [12]:
typedict = {'PID' : 'nominal',
            'SalePrice' : 'continuous',
            #Matt
            'LotFrontage' : 'continuous', 
            'LotArea' : 'continuous',
            'maybe_LotShape' : 'nominal',
            'LandSlope' : 'nominal', 
            'LandContour' : 'nominal', 
            'maybe_MSZoning' : 'nominal', 
            'Street_paved' : 'nominal', 
            'Alley' : 'nominal',
            'Neighborhood' : 'nominal', 
            'drop_LotConfig' : 'nominal', 
            'drop_Condition1' : 'nominal', 
            'drop_Condition2' : 'nominal',
            'Foundation' : 'nominal',
            'Utilities' : 'nominal',
            'Heating' : 'nominal',
            'HeatingQC_nom' : 'ordinal',
            'CentralAir' : 'nominal',
            'Electrical' : 'nominal',
            'HeatingQC_ord' : 'ordinal',
            'LotShape_com' : 'nominal',
            'MSZoning_com' : 'nominal',
            'LF_Normal' : 'nominal',
            'LF_Near_NS_RR' : 'nominal',
            'LF_Near_Positive_Feature' : 'nominal',
            'LF_Adjacent_Arterial_St' : 'nominal',
            'LF_Near_EW_RR' : 'nominal',
            'LF_Adjacent_Feeder_St' : 'nominal',
            'LF_Near_Postive_Feature' : 'nominal',
            'Heating_com' : 'nominal',
            'Electrical_com' : 'nominal',
            'LotConfig_com' : 'nominal', 
            'LotFrontage_log' : 'continuous',
            'LotArea_log' : 'continuous',
            #Oren 
            'MiscFeature': 'Nominal',
            'Fireplaces': 'Discrete',
            'FireplaceQu': 'Ordinal',
            'PoolQC': 'Ordinal',
            'PoolArea': 'Continuous',
            'PavedDrive': 'Nominal',
            'ExterQual': 'Ordinal',
            'OverallQual': 'Ordinal',
            'drop_OverallCond': 'Ordinal',
            'MiscVal': 'Continuous',
            'YearBuilt': 'Discrete',
            'YearRemodAdd': 'Discrete',
            'KitchenQual': 'Ordinal',
            'Fence': 'Ordinal',
            'RoofStyle': 'Nominal',
            'RoofMatl': 'Nominal',
            'maybe_Exterior1st': 'Nominal',
            'drop_Exterior2nd': 'Nominal',
            'drop_ExterCond': 'Ordinal',
            'maybe_MasVnrType': 'Nominal',
            'MasVnrArea': 'Continuous',
            #Mo
            #Basement
            'BsmtQual_ord': 'Ordinal',
            'BsmtCond_ord': 'Ordinal',
            'BsmtExposure_ord': 'Ordinal',
            'BsmtQual_ord_lin': 'Ordinal',
            'BsmtCond_ord_lin': 'Ordinal',
            'BsmtExposure_ord_lin': 'Ordinal',
            'TotalBsmtSF': 'Continuous',
            'BSMT_GLQ':'Continuous', 
            'BSMT_Rec':'Continuous',
            'maybe_BsmtUnfSF': 'Continuous',
            'maybe_BSMT_ALQ':'Continuous',
            'maybe_BSMT_BLQ':'Continuous', 
            'maybe_BSMT_LwQ':'Continuous', 
            'drop_BsmtQual': 'Nominal',
            'drop_BsmtCond': 'Nominal',
            'drop_BsmtExposure': 'Nominal',
            'drop_BsmtFinType1': 'Nominal',
            'drop_BsmtFinSF1': 'Continuous',
            'drop_BsmtFinType2': 'Nominal',
            'drop_BsmtFinSF2': 'Continuous',
            #Deck
            'WoodDeckSF':'Continuous', 
            'OpenPorchSF':'Continuous', 
            'ScreenPorch':'Continuous',
            'maybe_EnclosedPorch':'Continuous',
            'maybe_3SsnPorch':'Continuous',
            #Garage
            'GarageFinish':'Nominal', 
            'GarageYrBlt':'Continuous',
            'GarageCars':'Ordinal',
            'GarageArea':'Continuous',
            'GarageType_con':'Nominal',
            'maybe_GarageQual':'Nominal', 
            'maybe_GarageCond':'Nominal',
            'drop_GarageType':'Nominal'
}

In [25]:
def EDA_plots(df, features = df.columns, targets = ['SalePrice'], diction = ['typedict']):
    # can pass features = [list of features] and targets = [list of targets]
    # to get plots and regressions of different variables
    for feature in features:
        for target in targets:
            if feature != target and feature != 'PID':
                print('feature: ',feature)
                if diction[feature] == 'continuous': 
                    scatter = px.scatter(x = df[f'{feature}'], y = df[f'{target}'])
                    scatter.update_layout(
                        title={
                            'text': f'Scatterplot, {feature} vs {target}',
                            'y':0.95,
                            'x':0.5,
                            'xanchor': 'center',
                            'yanchor': 'top'},
                        xaxis_title = f'{feature}',
                        yaxis_title = f'{target}'
                    )
                    scatter.show()
                if diction[feature] == 'ordinal': 
                    hist = px.histogram(x = df[f'{feature}'])
                    hist.update_layout(
                        title={
                            'text': f'Distribution of {feature}',
                            'y':0.95,
                            'x':0.5,
                            'xanchor': 'center',
                            'yanchor': 'top'},
                        xaxis_title = f'{feature}',
                        yaxis_title = 'Frequency'
                    )
                    hist.show()
                if diction[feature] == 'nominal': 
                    box = px.box(x = df[f'{feature}'], y = df[f'{target}'])
                    box.update_layout(
                        title={
                            'text': f'Boxplot, {feature} vs {target}',
                            'y':0.95,
                            'x':0.5,
                            'xanchor': 'center',
                            'yanchor': 'top'},
                        xaxis_title = f'{feature}',
                        yaxis_title = 'Frequency'
                    )
                    box.show()
#                 temp = df[df[f'{feature}'].isna() == False].reset_index(drop = True)
#                 if type(temp.loc[0, f'{feature}']) != str:
#                     price_corr = temp[f'{feature}'].corr(temp[f'{target}'])
#                     print(f'Correlation between {feature} and {target} is {price_corr}')
#                     linreg = stats.linregress(temp[f'{feature}'], temp[f'{target}'] )
#                     print(linreg)
#                     print('r^2 = ',linreg.rvalue**2)
#                 if type(temp.loc[0, f'{feature}']) == str:
#                     # this is to see full multiple regression on each value of categorical variable
#                     # can comment this out
#                     fit = ols(f'{target} ~ C({feature})', data=temp).fit()
#                     print(fit.summary())
#                     # this is to see anova on whether any value of categorical variable is significantly different
#                     #anova_table = anova(fit, typ=2)
#                     #print(anova_table)
            print()

In [27]:
EDA_plots(df, features = ['LotArea'])

feature:  LotArea


TypeError: list indices must be integers or slices, not str

In [21]:
typedict['GrLivArea']

KeyError: 'GrLivArea'

In [34]:
df[(df.index==908154205) | (df.index==902207130)].T

PID,908154205,902207130
GrLivArea,4676,832
SalePrice,184750,12789
maybe_MSZoning,RL,RM
LotFrontage,130.0,68.0
LotArea,40094,9656
Street_paved,Pave,Pave
Alley,No alley access,No alley access
maybe_LotShape,IR1,Reg
LandContour,Banked (rise from street level to building),Level
Utilities,EGWS,EGWS
