In [None]:
def import_housing_data(url) :
    """Load the housing CSV and return a cleaned dataframe.

    Cleaning steps, in order:
      1. drop the 'Id' column,
      2. drop every column that contains at least one missing value,
      3. prefix any column whose name starts with a digit with that digit
         spelled out (e.g. '1stFlrSF' -> 'one_1stFlrSF') so the name does not
         start with a number.

    Parameters
    ----------
    url : str, path or file-like
        Anything `pandas.read_csv` accepts.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If the data has no 'Id' column.
    """
    import pandas as pd

    digit_words = ('zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine')

    df = pd.read_csv(url).drop(columns=['Id']).dropna(axis=1)

    # isdecimal() (not isdigit()) is used on purpose: isdigit() is also True for
    # characters such as superscript two, which int() cannot convert.
    renames = {
        col: digit_words[int(col[0])] + '_' + col
        for col in df.columns
        if isinstance(col, str) and col[:1].isdecimal()
    }
    # One rename call for all affected columns instead of one per column.
    return df.rename(columns=renames)

# For VSCode: uncomment these three lines (and comment out the Colab block below)
# import sys
# sys.path.append('/Textbook Examples/')
# import functions as fun

# For Colab: add the notebook folder to the import path so the `functions` helper module can be imported
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/')
import functions as fun


import pandas as pd
# Display settings: show up to 100 rows and 100 columns, and print floats with 5 decimal places
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.options.display.float_format = '{:.5f}'.format
# Load the housing data with import_housing_data() and preview the first rows
df = import_housing_data('http://www.ishelp.info/data/housing_full.csv')
df.head()

In [None]:
fun.unistats(df)

# Interpreting the output
    # Numeric
        # Skewness above 1 or below -1 is a concern (positive is right-skewed, negative is left-skewed)
    # Categorical
        # A high number of unique values compared to the rest of the dataframe (e.g. Neighborhood)


In [None]:

fun.bivstats(df, 'SalePrice')



# Bivariate tests by variable-type pair
    # Num-Num: Correlation r
    # Num-Cat: one-way ANOVA (3+ groups) or t-test (2 groups)
    # Cat-Cat: Chi-square



# Importance in terms of correlation
# It makes sense that as the correlation gets smaller, the p-value (roughly, how likely a result this strong is due to chance) gets higher. The instructor cares more about effect size than p-value.
# F
    # look at what has the largest effect size relative to this dataframe
# r
    # first look at which values are closest to 1 in absolute value
    # then go look at the skewness above to see if there are any issues


In [None]:
# Scatter plot visualizations

fun.biv_viz(df, 'SalePrice')

# Things to look for
    # Scatter plots (num-num)
        # Are the individual variables normally distributed? Check skew and kurtosis (you can also look at the histograms on the sides of the jointplot)
        # Heteroscedasticity - is the spread of the dots even across all values of X? (in this chart, it is not)
            # White test - a low p-value means YES, there are heteroscedasticity issues
            # BP test - a low p-value means YES, there are heteroscedasticity issues
        # Outliers - do you have a dot or two that is FAR away from most other dots? (so far we haven't learned how to handle this)
    # Bar charts (ANOVA - cat/num)
        # which of the bars are the highest and have the smallest standard deviation (the little tick)

# TODO: around 18:50 in the video "Python Practice: Housing Prices: Bivariate Visualizations", the instructor says we need a way to keep track of all the variables that have issues (low p-value in the White and BP tests, skew and kurtosis over 1), but I have no idea how to do that yet.

In [None]:
def mlr_prepare(df) :
  """Return an all-numeric, min-max scaled copy of `df` for regression.

  Every non-numeric column is replaced by one dummy column per level (no level
  is dropped), only numeric columns are kept, and each remaining column is
  rescaled with sklearn's MinMaxScaler.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw data; it is not modified.

  Returns
  -------
  pandas.DataFrame
      Scaled data with the same column names and a fresh 0..n-1 index.
  """
  import numpy as np
  import pandas as pd
  from sklearn import preprocessing

  # Columns that need dummy encoding
  categorical = [col for col in df if not pd.api.types.is_numeric_dtype(df[col])]

  if categorical:
    # dtype=float matters: newer pandas versions create boolean dummies by
    # default, and select_dtypes(np.number) below would silently discard them.
    df = pd.get_dummies(df, columns=categorical, drop_first=False, dtype=float)

  # Keep numeric columns only
  df = df.select_dtypes(np.number)

  # fit_transform returns a plain array, so wrap it back into a dataframe
  df_minmax = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(df), columns=df.columns)
  return df_minmax


# Reload the housing data and replace df with its dummy-encoded, min-max scaled version
df = mlr_prepare(import_housing_data('http://www.ishelp.info/data/housing_full.csv'))

In [None]:
#run the MLR
def mlr(df, label) :
  """Fit an ordinary least squares regression of `label` on all other columns.

  A constant column named 'const' is appended to the predictors so the model
  has an intercept.

  Parameters
  ----------
  df : pandas.DataFrame
      Numeric data containing the label column and the predictors.
  label : str
      Name of the column to predict.

  Returns
  -------
  The fitted statsmodels OLS results object.
  """
  import statsmodels.api as sm

  y = df[label]
  X = df.drop(columns=[label]).assign(const=1)

  return sm.OLS(y, X).fit()


# Fit the regression on the prepared data and show the model summary
results = mlr(df, 'SalePrice')
results.summary()

# Things to look at
# R squared
# Adj R squared - if it is well below R squared, that suggests we have a lot of variables that aren't doing us a whole lot of good

In [None]:
def mlr_feature_df(results):
  """Summarize each feature of a fitted model, weakest feature first.

  Parameters
  ----------
  results : fitted regression results
      Must expose `params`, `tvalues` and `pvalues` as pandas Series.

  Returns
  -------
  pandas.DataFrame
      One row per feature (the 'const' intercept row is removed) with columns
      'coef', 't' (absolute t-value) and 'p', sorted by 't' then 'p' ascending.
  """
  import pandas as pd

  df_features = pd.DataFrame({'coef':results.params, 't':abs(results.tvalues), 'p':results.pvalues})
  # errors='ignore' so a model fitted without an intercept does not raise
  df_features = df_features.drop(labels=['const'], errors='ignore')
  # sort_values returns a new frame; the sorted result has to be the one returned
  return df_features.sort_values(by=['t','p'])
# Display the per-feature coefficient / t / p table for the fitted model
mlr_feature_df(results)

In [None]:

import pandas as pd
from sklearn.linear_model import LinearRegression

def vif(df):
  """Compute the variance inflation factor and tolerance of every predictor.

  Each column (except 'const', if present) is regressed on all the other
  columns; with R2 from that fit, tolerance = 1 - R2 and VIF = 1 / tolerance.

  Parameters
  ----------
  df : pandas.DataFrame
      Predictor matrix, optionally including a 'const' column.

  Returns
  -------
  pandas.DataFrame
      Indexed by predictor name with columns 'VIF' and 'Tolerance', sorted by
      VIF from highest to lowest.
  """
  from sklearn.linear_model import LinearRegression

  # initialize dictionaries
  vif_dict, tolerance_dict = {}, {}

  # form input data for each exogenous variable
  for col in df.drop(columns=['const'], errors='ignore'):
    y = df[col]
    X = df.drop(columns=[col])

    # extract r-squared from the fit
    r_squared = LinearRegression().fit(X, y).score(X, y)

    # calculate tolerance
    tolerance = 1 - r_squared
    tolerance_dict[col] = tolerance

    # calculate VIF; a perfectly explained column (R2 of 1) is perfectly
    # collinear, so report infinity rather than dividing by zero
    vif_dict[col] = 1 / tolerance if r_squared < 1 else float('inf')

  return pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict}).sort_values(by=['VIF'], ascending=False)

# Build the predictor matrix the same way mlr() does (all columns except the
# label, plus a 'const' column); X only exists inside mlr(), not at notebook level.
X = df.drop(columns=['SalePrice']).assign(const=1)
vif(X)

In [None]:
def mlr_fit(results, actual, roundto=10):
  """Build one row of fit statistics for the model-comparison table.

  Parameters
  ----------
  results : fitted regression results
      Must expose `rsquared`, `rsquared_adj` and `fittedvalues`.
  actual : array-like
      Observed label values, in the same row order as the fitted values.
  roundto : int
      Number of decimals each statistic is rounded to.

  Returns
  -------
  list
      [R2, adjusted R2, R2 - adjusted R2, RMSE, MAE, [array of feature names]]
  """
  import numpy as np

  # Feature names come from the coefficient table of this model
  feature_table = mlr_feature_df(results)

  # Compared as plain arrays so the two inputs are matched by position
  errors = np.array(actual) - np.array(results.fittedvalues)
  rmse = np.sqrt(sum(errors**2)/len(actual))
  mae = np.mean(abs(errors))

  r2, r2_adj = results.rsquared, results.rsquared_adj
  return [
    round(r2, roundto),
    round(r2_adj, roundto),
    round(r2 - r2_adj, roundto),
    round(rmse, roundto),
    round(mae, roundto),
    [feature_table.index.values],
  ]

In [None]:
def mlr_step(df, label, min=2, out_dir='/content/drive/My Drive/Colab Notebooks/'):
  """Backward elimination: refit the regression while removing one feature at a time.

  The data is prepared with mlr_prepare(), a model with every feature is fit,
  and then the feature with the smallest absolute t-value is dropped and the
  model refit, repeatedly. Fit statistics of every model are collected and the
  full table (including each model's feature list) is written to
  `<out_dir>/<label>.csv`.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw data including the label column.
  label : str
      Name of the column to predict.
  min : int
      Keep stepping while the current model has at least this many parameters
      (the intercept counts as one). The name shadows the built-in `min`; it is
      kept so existing keyword calls still work.
  out_dir : str
      Folder the CSV is written to.

  Returns
  -------
  pandas.DataFrame
      One row per model, indexed by its (integer) number of parameters, with
      columns 'R2', 'R2a', 'diff', 'rmse', 'MAE'.
  """
  import os
  import pandas as pd

  #create the empty model results table
  df_models = pd.DataFrame(columns=['R2', 'R2a', 'diff', 'rmse', 'MAE', 'features'])

  #prepare the data by generating dummies and scaling
  df = mlr_prepare(df)

  #run the first model with all features
  results = mlr(df, label)

  # Feature table, explicitly ordered weakest-first so index[0] is the feature to drop
  df_features = mlr_feature_df(results).sort_values(by=['t', 'p'])

  # Row keys are always ints; mixing a str key with int keys gives an index
  # that cannot be sorted or plotted cleanly
  df_models.loc[len(results.params)] = mlr_fit(results, df[label], 10)

  # Step through a series of reduced models; also stop once no feature is left to drop
  while len(results.params) >= min and len(df_features) > 0:
    df = df.drop(columns=[df_features.index[0]])    # Drop the least effective feature
    results = mlr(df, label)                        # Re-run the next MLR
    df_features = mlr_feature_df(results).sort_values(by=['t', 'p'])
    df_models.loc[len(results.params)] = mlr_fit(results, df[label], 10)

  # Save the full models table to a CSV
  df_models.to_csv(os.path.join(out_dir, label + '.csv'))

  # Return a shortened version without the feature list for display
  return df_models.drop(columns=['features'])
  

In [None]:
# Run the stepwise feature removal on freshly loaded data and display the model-comparison table
df_models = mlr_step(import_housing_data('http://www.ishelp.info/data/housing_full.csv'), 'SalePrice')
df_models

In [None]:
def mlr_prepare(df) :
  """Return an all-numeric, min-max scaled copy of `df` for regression.

  Every non-numeric column is replaced by one dummy column per level (no level
  is dropped), only numeric columns are kept, and each remaining column is
  rescaled with sklearn's MinMaxScaler.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw data; it is not modified.

  Returns
  -------
  pandas.DataFrame
      Scaled data with the same column names and a fresh 0..n-1 index.
  """
  import numpy as np
  import pandas as pd
  from sklearn import preprocessing

  # Columns that need dummy encoding
  categorical = [col for col in df if not pd.api.types.is_numeric_dtype(df[col])]

  if categorical:
    # dtype=float matters: newer pandas versions create boolean dummies by
    # default, and select_dtypes(np.number) below would silently discard them.
    df = pd.get_dummies(df, columns=categorical, drop_first=False, dtype=float)

  # Keep numeric columns only
  df = df.select_dtypes(np.number)

  # fit_transform returns a plain array, so wrap it back into a dataframe
  df_minmax = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(df), columns=df.columns)
  return df_minmax



#run the MLR
def mlr(df, label) :
  """Fit an ordinary least squares regression of `label` on the other columns.

  The 'Utilities_AllPub' column, when present, is left out of the predictors
  (presumably to remove a redundant dummy column - confirm), and a constant
  column named 'const' is appended so the model has an intercept.

  Parameters
  ----------
  df : pandas.DataFrame
      Numeric data containing the label column and the predictors.
  label : str
      Name of the column to predict.

  Returns
  -------
  The fitted statsmodels OLS results object.
  """
  import statsmodels.api as sm

  # Use the dataframe that was passed in; reading a notebook-level variable
  # here would make every call fit the same data regardless of the argument.
  y = df[label]
  # errors='ignore' so data without a 'Utilities_AllPub' column still works
  X = df.drop(columns=[label, 'Utilities_AllPub'], errors='ignore').assign(const=1)

  return sm.OLS(y, X).fit()
# Fit the regression using the notebook-level df
results = mlr(df, 'SalePrice')

# Things to look at
# R squared
# Adj R squared - if it is well below R squared, that suggests we have a lot of variables that aren't doing us a whole lot of good


def mlr_feature_df(results):
  """Summarize each feature of a fitted model, weakest feature first.

  Parameters
  ----------
  results : fitted regression results
      Must expose `params`, `tvalues` and `pvalues` as pandas Series.

  Returns
  -------
  pandas.DataFrame
      One row per feature (the 'const' intercept row is removed) with columns
      'coef', 't' (absolute t-value) and 'p', sorted by 't' then 'p' ascending.
  """
  import pandas as pd

  df_features = pd.DataFrame({'coef':results.params, 't':abs(results.tvalues), 'p':results.pvalues})
  # errors='ignore' so a model fitted without an intercept does not raise
  df_features = df_features.drop(labels=['const'], errors='ignore')
  # sort_values returns a new frame; the sorted result has to be the one returned
  return df_features.sort_values(by=['t','p'])
# Build the per-feature coefficient / t / p table for the fitted model
mlr_feature_df(results)



def mlr_fit(results, actual, roundto=10):
  """Build one row of fit statistics for the model-comparison table.

  Parameters
  ----------
  results : fitted regression results
      Must expose `rsquared`, `rsquared_adj` and `fittedvalues`.
  actual : array-like
      Observed label values, in the same row order as the fitted values.
  roundto : int
      Number of decimals each statistic is rounded to.

  Returns
  -------
  list
      [R2, adjusted R2, R2 - adjusted R2, RMSE, MAE, [array of feature names]]
  """
  import numpy as np

  # Feature names come from the coefficient table of this model
  feature_table = mlr_feature_df(results)

  # Compared as plain arrays so the two inputs are matched by position
  errors = np.array(actual) - np.array(results.fittedvalues)
  rmse = np.sqrt(sum(errors**2)/len(actual))
  mae = np.mean(abs(errors))

  r2, r2_adj = results.rsquared, results.rsquared_adj
  return [
    round(r2, roundto),
    round(r2_adj, roundto),
    round(r2 - r2_adj, roundto),
    round(rmse, roundto),
    round(mae, roundto),
    [feature_table.index.values],
  ]


def mlr_step(df, label, min=2, out_dir='/content/drive/My Drive/Colab Notebooks/'):
  """Backward elimination: refit the regression while removing one feature at a time.

  The data is prepared with mlr_prepare(), a model with every feature is fit,
  and then the feature with the smallest absolute t-value is dropped and the
  model refit, repeatedly. Fit statistics of every model are collected and the
  full table (including each model's feature list) is written to
  `<out_dir>/<label>.csv`.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw data including the label column.
  label : str
      Name of the column to predict.
  min : int
      Keep stepping while the current model has at least this many parameters
      (the intercept counts as one). The name shadows the built-in `min`; it is
      kept so existing keyword calls still work.
  out_dir : str
      Folder the CSV is written to.

  Returns
  -------
  pandas.DataFrame
      One row per model, indexed by its (integer) number of parameters, with
      columns 'R2', 'R2a', 'diff', 'rmse', 'MAE'.
  """
  import os
  import pandas as pd

  #create the empty model results table
  df_models = pd.DataFrame(columns=['R2', 'R2a', 'diff', 'rmse', 'MAE', 'features'])

  #prepare the data by generating dummies and scaling
  df = mlr_prepare(df)

  #run the first model with all features
  results = mlr(df, label)

  # Feature table, explicitly ordered weakest-first so index[0] is the feature to drop
  df_features = mlr_feature_df(results).sort_values(by=['t', 'p'])

  # Row keys are always ints; mixing a str key with int keys gives an index
  # that cannot be sorted or plotted cleanly
  df_models.loc[len(results.params)] = mlr_fit(results, df[label], 10)

  # Step through a series of reduced models; also stop once no feature is left to drop
  while len(results.params) >= min and len(df_features) > 0:
    df = df.drop(columns=[df_features.index[0]])    # Drop the least effective feature
    results = mlr(df, label)                        # Re-run the next MLR
    df_features = mlr_feature_df(results).sort_values(by=['t', 'p'])
    df_models.loc[len(results.params)] = mlr_fit(results, df[label], 10)

  # Save the full models table to a CSV
  df_models.to_csv(os.path.join(out_dir, label + '.csv'))

  # Return a shortened version without the feature list for display
  return df_models.drop(columns=['features'])



# Run the stepwise feature removal on freshly loaded data and display the model-comparison table
df_models = mlr_step(import_housing_data('http://www.ishelp.info/data/housing_full.csv'), 'SalePrice')
df_models


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Number of model parameters on the x axis; cast to int in case the index
# holds a mix of string and integer labels.
n_params = df_models.index.astype(int)

fig, ax = plt.subplots(figsize=(15,6))
# x and y are passed by keyword (current seaborn versions do not accept them
# positionally) and both lines are drawn on the same axes.
sns.lineplot(x=n_params, y=df_models.R2, ax=ax, label='R2')
sns.lineplot(x=n_params, y=df_models.R2a, ax=ax, label='R2a')
ax.set(ylim=(.88, .921), xlabel='number of model parameters', ylabel='R squared');

# The right number of variables is where the plot levels off; keep that point and anything to the left of it


I made it up to 7:15 in the 5th video, but my code from two blocks up is taking forever to run. It might be an issue with my code three blocks up, but I don't see what it could be. From Stack Overflow, it looks like it might be an issue with the `dropna` axis.