# In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm as anova

def _apply_centered_layout(fig, title, x_label, y_label):
    """Give *fig* a centred title and the supplied axis labels."""
    fig.update_layout(
        title={
            'text': title,
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        xaxis_title=x_label,
        yaxis_title=y_label
    )
    return fig


def _print_feature_stats(df, feature, target):
    """Print regression statistics (or an OLS summary) of *feature* vs *target*."""
    # Drop rows missing EITHER value: linregress returns NaN for every
    # statistic if a single NaN reaches it, so filtering on the feature
    # alone is not enough.
    temp = df.dropna(subset=[feature, target]).reset_index(drop=True)
    if temp.empty:
        # Nothing left to analyse (e.g. the feature column is entirely NaN);
        # indexing the first row would raise, so report and move on.
        print(f'No complete observations for {feature} and {target}; '
              'skipping statistics')
        return

    # The first remaining value decides how the column is treated:
    # strings are categorical, anything else is handled as numeric.
    if isinstance(temp[feature].iloc[0], str):
        # Full regression on each level of the categorical variable,
        # i.e. shows which levels differ significantly.
        fit = ols(f'{target} ~ C({feature})', data=temp).fit()
        print(fit.summary())
        # For a simpler ANOVA (whether ANY level differs significantly),
        # print anova(fit, typ=2) instead of the summary above.
        return

    price_corr = temp[feature].corr(temp[target])
    print(f'Correlation between {feature} and {target} is {price_corr}')
    linreg = stats.linregress(temp[feature], temp[target])
    print(linreg)
    print('r^2 = ', linreg.rvalue**2)


def EDA_plots(df, features=None, targets=None):
    """Show exploratory plots and print statistics for feature/target pairs.

    For every (feature, target) pair a scatter plot, a histogram of the
    feature and a box plot are displayed, followed by printed statistics:
    correlation and simple linear regression when the feature's first
    non-missing value is not a string, otherwise an OLS summary with the
    feature treated as categorical.

    Pairs where the feature equals the target, and the feature named
    'PID', are skipped. A blank line is printed after every pair,
    skipped or not.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature and target columns.
    features : list of str, str, or None
        Column names to explore. ``None`` (default) uses every column of
        *df*. A single column name may be given as a bare string.
    targets : list of str, str, or None
        Column names to compare each feature against. ``None`` (default)
        means ``['SalePrice']``. A single column name may be given as a
        bare string.

    Returns
    -------
    None
    """
    if features is None:
        features = df.columns
    elif isinstance(features, str):
        # A bare string would otherwise be iterated character by character.
        features = [features]

    # None stands in for the default so no mutable list is shared between calls.
    if targets is None:
        targets = ['SalePrice']
    elif isinstance(targets, str):
        targets = [targets]

    for feature in features:
        for target in targets:
            if feature != target and feature != 'PID':
                print('feature: ', feature)

                scatter = px.scatter(x=df[feature], y=df[target])
                _apply_centered_layout(
                    scatter,
                    f'Scatterplot, {feature} vs {target}',
                    f'{feature}',
                    f'{target}',
                )
                scatter.show()

                hist = px.histogram(x=df[feature])
                _apply_centered_layout(
                    hist,
                    f'Distribution of {feature}',
                    f'{feature}',
                    'Frequency',
                )
                hist.show()

                box = px.box(x=df[feature], y=df[target])
                _apply_centered_layout(
                    box,
                    f'Boxplot, {feature} vs {target}',
                    f'{feature}',
                    f'{target}',
                )
                box.show()

                _print_feature_stats(df, feature, target)
            # Separator line, emitted for every pair (including skipped ones).
            print()