In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm as anova

def _show_with_layout(fig, title, x_label, y_label):
    '''Apply the shared centred title and axis labels to a plotly figure, then display it.'''
    fig.update_layout(
        title={
            'text': title,
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        xaxis_title=x_label,
        yaxis_title=y_label
    )
    fig.show()


def _formula_term(name):
    '''Return a column name in a form that is safe to embed in a patsy formula.'''
    import keyword  # local import: only this helper needs it

    text = f'{name}'
    # Plain identifiers are passed through unchanged so the regression summary keeps its usual labels;
    # anything else (spaces, leading digits, keywords, ...) is wrapped in patsy's Q() quoting helper.
    if text.isidentifier() and not keyword.iskeyword(text):
        return text
    return f'Q({text!r})'


def _print_feature_stats(df, feature, target):
    '''Print the regression statistics of one feature against one target.'''
    # drop rows where either column is missing; the dataset does not have unique indices,
    # so the index is reset as well
    rows = df.dropna(subset=[feature, target]).reset_index(drop=True)
    if rows.empty:
        # a column that is entirely missing has no first value to inspect and nothing to fit
        print(f'No rows with both {feature} and {target} present; skipping regression')
        return

    # crude categorical test: the first non-missing value is a string
    if isinstance(rows[feature].iloc[0], str):  # categorical
        formula = f'{_formula_term(target)} ~ C({_formula_term(feature)})'
        fit = ols(formula, data=rows).fit()
        print(fit.summary())  # comment this out and uncomment the below lines to get simpler anova
        # anova_table = anova(fit, typ=2)
        # print(anova_table)
        return

    # continuous
    if rows[feature].nunique() < 2:
        # a regression on a constant feature is undefined; skip it instead of aborting the whole run
        print(f'{feature} has fewer than two distinct values; skipping regression')
        return
    corr = rows[feature].corr(rows[target])
    print(f'Correlation between {feature} and {target} is {corr}')
    linreg = stats.linregress(rows[feature], rows[target])
    print(linreg)
    print('r^2 = ', linreg.rvalue**2)


def EDA_plots(df, features=None, targets=None):
    '''
    EDA_plots: a function to automate basic preliminary EDA on [features] vs [targets]

    args:
        df: a dataframe
        features: a list of column names (or a single column name) to run the basic EDA functions on.
                  If nothing is passed, all columns of df will be used.
        targets: a list of column names (or a single column name) to use as targets in the basic EDA
                 functions. If nothing is passed, ['SalePrice'] will be used.

    output:
        - Returns nothing; everything is shown or printed
        - The 'PID' column (unique identifier) and any feature equal to the current target are skipped
        - Shows scatterplots and boxplots of [features] vs [targets]
        - Shows a histogram of each feature (once per feature, since it does not depend on the target)
        - Uses crude method of "first non-missing value is a string" to decide whether to treat a feature
          as categorical or continuous
            - If the data in a column is of string type, dummifies that column and runs multiple linear
              regressions on the dummies vs [targets]
            - If you would rather do simple anovas than the dummified multiple regressions, there is
              commented-out code which you can uncomment to do anovas instead
            - If the data in a column is not string type, runs simple linear regressions on that column
              vs [targets] and prints the correlation and R^2 values
        - Rows where the feature or the target is missing are left out of the regressions; a pair with
          no usable rows, or a continuous feature with a single distinct value, is reported and skipped
    '''
    # default features is None, in which case every column of df is used
    if features is None:
        features = df.columns
    elif isinstance(features, str):
        # a bare string would otherwise be iterated character by character
        features = [features]

    if targets is None:
        targets = ['SalePrice']
    elif isinstance(targets, str):
        targets = [targets]

    for feature in features:
        histogram_shown = False
        for target in targets:
            if feature != target and feature != 'PID':  # ignore unique identifier
                print('feature: ', feature)  # print feature name
                # scatterplot
                _show_with_layout(
                    px.scatter(x=df[feature], y=df[target]),
                    f'Scatterplot, {feature} vs {target}',
                    f'{feature}',
                    f'{target}',
                )
                # histogram: independent of the target, so only drawn for the first target
                if not histogram_shown:
                    _show_with_layout(
                        px.histogram(x=df[feature]),
                        f'Distribution of {feature}',
                        f'{feature}',
                        'Frequency',
                    )
                    histogram_shown = True
                # boxplot
                _show_with_layout(
                    px.box(x=df[feature], y=df[target]),
                    f'Boxplot, {feature} vs {target}',
                    f'{feature}',
                    f'{target}',
                )
                _print_feature_stats(df, feature, target)
            print()