# Ames Data Analysis

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway, pearsonr
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence

from pipeline_v1 import ordinal, nominal, continuous, discrete

## Helper Functions

In [3]:
def get_formulas(features):
    formulas = []
    for feature in features:
        formula = "Sale_Price ~ C(" + feature + ")"
        formulas.append(formula)
    return formulas

In [4]:
# Get Data
filename = "../data/external/Ames_data.csv"
ames = pd.read_csv(filename)
X = ames.drop(columns=["PID","Sale_Price"])
y = ames["Sale_Price"].to_numpy()
ames_df = ames.drop(columns="PID")

(2930, 82)


## Sale Condition

In [5]:
print(X[X["Sale_Condition"] != "Normal"].shape)
print(X[X["Sale_Condition"] != "Normal"].shape[0]/X.shape[0])

(517, 82)
0.1764505119453925


## Neighborhoods


In [6]:
print(X["Neighborhood"].unique())

['North_Ames' 'Gilbert' 'Stone_Brook' 'Northwest_Ames' 'Somerset'
 'Briardale' 'Northpark_Villa' 'Northridge_Heights' 'Bloomington_Heights'
 'Northridge' 'Sawyer_West' 'Sawyer' 'Greens' 'Brookside' 'Old_Town'
 'Iowa_DOT_and_Rail_Road' 'Clear_Creek'
 'South_and_West_of_Iowa_State_University' 'Edwards' 'College_Creek'
 'Crawford' 'Blueste' 'Mitchell' 'Timberland' 'Meadow_Village' 'Veenker'
 'Green_Hills' 'Landmark']


## Outlier Detection

In [7]:
def get_cooks(X, PID, features, formulas):
    measures = pd.DataFrame()
    for PID, feature, formula in zip(PID,features,formulas):    
        mu = np.mean(X[feature])
        model = smf.ols(formula,data=X).fit()    
        infl = model.get_influence()
        c, p = infl.cooks_distance    
        norm_cooks = c / mu
        d = {"PID": PID, "Feature": feature, "mu": mu, "Cooks": c,"Normalized": norm_cooks,  "p-value": p}
        df = pd.DataFrame(data=d)
        measures = pd.concat((measures, df), axis=0)
    return measures
features = continuous + discrete
features.remove("Age")
features.remove("Garage_Age")
formulas = get_formulas(features)
cooks = get_cooks(ames_df, PID, features, formulas)
filepath = "../reports/cooks.csv"
cooks.to_csv(filepath, index=False)   

ValueError: list.remove(x): x not in list

In [13]:
def analyze_outliers(X, cooks, cutoff=3):
    outlier_stats = cooks[cooks["Normalized"]>=cutoff].sort_values(by=["PID", "Feature"])
    outliers = outlier_stats["PID"].unique()
    n_outliers = len(outliers)
    pct_outliers = n_outliers / X.shape[0] * 100
    print(f"\nThere are {n_outliers} outliers ({pct_outliers}%) at cutoff={cutoff}.")
    print(outlier_stats)
for i in range(3,7):
    analyze_outliers(X, cooks, i)


There are 21 outliers (0.7167235494880546%) at cutoff=3.
            PID             Feature            mu        Cooks   Normalized  \
2321  526301100        Lot_Frontage     57.647782          inf          inf   
37    526350040            Lot_Area  10147.921843          inf          inf   
76    526350040            Lot_Area  10147.921843          inf          inf   
105   526350040            Lot_Area  10147.921843          inf          inf   
121   526350040            Lot_Area  10147.921843          inf          inf   
136   526350040            Lot_Area  10147.921843          inf          inf   
152   526350040            Lot_Area  10147.921843          inf          inf   
209   526350040            Lot_Area  10147.921843          inf          inf   
210   526350040            Lot_Area  10147.921843          inf          inf   
216   526350040            Lot_Area  10147.921843          inf          inf   
254   526350040            Lot_Area  10147.921843          inf          i

## Continuous Variables
### Correlation  

In [14]:
corr = []
p_values = []
features = continuous + discrete
for feature in continuous:
    r, p = pearsonr(X[feature],y)
    corr.append(r)
    p_values.append(p)
d = {"Feature": continuous, "Correlation": corr, "Importance": np.abs(corr), "p-values":p_values}
df = pd.DataFrame(data=d)
df = df.sort_values(by="Importance", ascending=False)
print(df)


    

               Feature  Correlation  Importance       p-values
10         Gr_Liv_Area     0.706780    0.706780   0.000000e+00
11         Garage_Area     0.640138    0.640138   0.000000e+00
6        Total_Bsmt_SF     0.632529    0.632529   0.000000e+00
7         First_Flr_SF     0.621676    0.621676  5.687256e-313
2         Mas_Vnr_Area     0.502196    0.502196  4.881360e-187
12        Wood_Deck_SF     0.327143    0.327143   4.820100e-74
13       Open_Porch_SF     0.312951    0.312951   1.373555e-67
8        Second_Flr_SF     0.269373    0.269373   6.941583e-50
1             Lot_Area     0.266549    0.266549   7.633843e-49
0         Lot_Frontage     0.201875    0.201875   2.547312e-28
5          Bsmt_Unf_SF     0.183308    0.183308   1.478791e-23
3         BsmtFin_SF_1    -0.134905    0.134905   2.254104e-13
14      Enclosed_Porch    -0.128787    0.128787   2.607945e-12
16        Screen_Porch     0.112151    0.112151   1.148246e-09
17           Pool_Area     0.068403    0.068403   2.110

## 

In [15]:
def get_formulas(features):
    formulas = []
    for feature in features:
        formula = "Sale_Price ~ C(" + feature + ")"
        formulas.append(formula)
    return formulas

def get_f(features, formulas):    
    p = []
    s = []
    for formula in formulas:
        model = smf.ols(formula,data=ames_df).fit()
        aov_table = sm.stats.anova_lm(model, typ=2)
        p.append(aov_table['PR(>F)'][0])
        sig = "Significance" if aov_table['PR(>F)'][0]<0.05 else "Not Signficant"
        s.append(sig)
    d = {"Feature": features, "PR(>F)": p, "Significant": s}
    df = pd.DataFrame(d)
    return df

formulas = get_formulas(ordinal)
results_ord = get_f(ordinal, formulas)
formulas = get_formulas(nominal)
results_nom = get_f(nominal, formulas)
print(results_ord)
print("\n")
print(results_nom)

           Feature         PR(>F)     Significant
0   BsmtFin_Type_1  5.579684e-160    Significance
1   BsmtFin_Type_2   1.143399e-17    Significance
2        Bsmt_Cond   2.061283e-31    Significance
3    Bsmt_Exposure  1.188864e-124    Significance
4        Bsmt_Qual   0.000000e+00    Significance
5       Electrical   2.713510e-36    Significance
6       Exter_Cond   1.611921e-16    Significance
7       Exter_Qual   0.000000e+00    Significance
8            Fence   1.136202e-23    Significance
9     Fireplace_Qu  2.543386e-229    Significance
10      Functional   6.270286e-09    Significance
11     Garage_Cond   2.249954e-52    Significance
12   Garage_Finish  5.458334e-233    Significance
13     Garage_Qual   3.973437e-57    Significance
14      Heating_QC  4.917676e-152    Significance
15    Kitchen_Qual   0.000000e+00    Significance
16      Land_Slope   1.011949e-03    Significance
17       Lot_Shape   1.067863e-60    Significance
18    Overall_Cond   9.118002e-98    Significance
