# **03 - CorrelationStudy**

## Objectives

* Identify the house attributes that correlate with the target variable: `SalePrice`

## Inputs

* outputs/datasets/cleaned/HousePricesCleaned.csv

## Outputs

* Data plots/visualizations for the Streamlit app. (Business Requirement 1)
* docs/plots/box_plot_price_by_KitchenQual.png
* docs/plots/box_plot_price_by_OverallQual.png
* docs/plots/heatmap_corr_pearson.png
* docs/plots/heatmap_corr_spearman.png
* docs/plots/heatmap_pps.png
* docs/plots/hist_plot_SalePrice.png
* docs/plots/line_plot_price_by_YearBuilt.png
* docs/plots/line_plot_price_by_YearRemodAdd.png
* docs/plots/lm_plot_price_by_1stFlrSF.png
* docs/plots/lm_plot_price_by_GarageArea.png
* docs/plots/lm_plot_price_by_GrLivArea.png
* docs/plots/lm_plot_price_by_MasVnrArea.png
* docs/plots/lm_plot_price_by_OpenPorchSF.png
* docs/plots/lm_plot_price_by_TotalBsmtSF.png

## Additional Comments

* This notebook deals with Business Requirement 1: Visualization of Data and Analysis of Correlations.

---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Record the directory the notebook is currently running from.
current_dir = os.getcwd()
# Bare expression: shown as the cell output when run in a notebook.
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() sets the new current directory

In [None]:
# Move one level up: to the parent of the directory stored in `current_dir`.
# NOTE: `current_dir` is re-read in the following cell, so running this cell
# again after that one moves up a further level.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory to confirm the change took effect.
current_dir = os.getcwd()
current_dir

Use the pandas library to load the cleaned dataset into the DataFrame `df` and display its first 10 rows.

In [None]:
import pandas as pd
# The path is relative, so it resolves against the working directory set above.
df = pd.read_csv("outputs/datasets/cleaned/HousePricesCleaned.csv")
# Bare expression: displays the first 10 rows as the cell output.
df.head(10)

Generates an exploratory data analysis (EDA) report of Dataframe df.

In [None]:
from ydata_profiling import ProfileReport
# minimal=True selects the library's reduced report configuration
# (presumably to keep generation fast; see the ydata-profiling docs).
profile = ProfileReport(df=df, minimal=True)
# Render the report inline in the notebook output.
profile.to_notebook_iframe()

Applies one-hot encoding to categorical variables in DataFrame df.

In [None]:
from feature_engine.encoding import OneHotEncoder

# Collect every object-dtype column; these are the ones to be one-hot encoded.
categorical_cols = [
    column for column, dtype in df.dtypes.items() if dtype == 'object'
]

# drop_last=False keeps a dummy column for every category level.
encoder = OneHotEncoder(variables=categorical_cols, drop_last=False)
df_ohe = encoder.fit_transform(df)

print(df_ohe.shape)
df_ohe.head(5)

Create the `docs/plots` directory, where the figures generated below are saved.

In [None]:
import os

# exist_ok=True makes re-running this cell a no-op when the folder already
# exists, while genuine failures (e.g. a permission error) still raise instead
# of being printed and silently ignored.
os.makedirs("docs/plots", exist_ok=True)

Defines several functions to calculate and visualize relationships between features in a dataset using correlations and the Predictive Power Score (PPS).

In [8]:
import numpy as np
import ppscore as pps
import seaborn as sns
sns.set(style="whitegrid")
import matplotlib.pyplot as plt
%matplotlib inline


def heatmap_corr(df, threshold, figsize=(20, 12), font_annot=8):
    """
    Plot and save a lower-triangle heatmap of a correlation matrix.

    Cells on or above the diagonal, and cells whose absolute value is below
    ``threshold``, are hidden. Nothing is drawn when ``df`` has fewer than
    two columns.

    The figure is saved to ``docs/plots/heatmap_corr_spearman.png`` when the
    matrix carries a ``name`` attribute equal to ``"corr_spearman"``, and to
    ``docs/plots/heatmap_corr_pearson.png`` otherwise.

    Parameters:
        df: square correlation matrix (DataFrame).
        threshold: minimum absolute correlation for a cell to be shown.
        figsize: size of the matplotlib figure, in inches.
        font_annot: font size of the value annotations.
    """
    if len(df.columns) <= 1:
        return

    # Hide the upper triangle (incl. diagonal) and all weak correlations.
    mask = np.zeros_like(df, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    mask[(df.abs() < threshold).to_numpy()] = True

    fig, axes = plt.subplots(figsize=figsize)
    sns.heatmap(df, annot=True, xticklabels=True, yticklabels=True,
                mask=mask, cmap='viridis', annot_kws={"size": font_annot},
                ax=axes, linewidth=0.5)
    # The y axis lists the rows of the matrix, so label and size it from the
    # index rather than from the columns.
    axes.set_yticklabels(df.index, rotation=0)
    axes.set_ylim(len(df.index), 0)

    # `name` is an ad-hoc attribute that pandas does not propagate through
    # copies, so it may be missing (or resolve to a column called "name").
    # Fall back to the Pearson file name instead of raising.
    matrix_name = getattr(df, "name", None)
    is_spearman = isinstance(matrix_name, str) and matrix_name == "corr_spearman"
    if is_spearman:
        fig.savefig('docs/plots/heatmap_corr_spearman.png', bbox_inches='tight')
    else:
        fig.savefig('docs/plots/heatmap_corr_pearson.png', bbox_inches='tight')
    plt.show()


def heatmap_pps(df, threshold, figsize=(20, 12), font_annot=8):
    """
    Plot and save a heatmap of a PPS matrix.

    Cells whose absolute value is below ``threshold`` are hidden. The figure
    is saved to docs/plots/heatmap_pps.png and then shown. Nothing is drawn
    when ``df`` has fewer than two columns.
    """
    if len(df.columns) <= 1:
        return

    # True marks the cells that must not be drawn.
    hidden_cells = np.zeros_like(df, dtype=bool)
    hidden_cells[abs(df) < threshold] = True

    fig, ax = plt.subplots(figsize=figsize)
    heatmap_options = {
        "annot": True,
        "xticklabels": True,
        "yticklabels": True,
        "mask": hidden_cells,
        "cmap": 'rocket_r',
        "annot_kws": {"size": font_annot},
        "linewidth": 0.05,
        "linecolor": 'grey',
    }
    ax = sns.heatmap(df, **heatmap_options)

    plt.ylim(len(df.columns), 0)
    plt.savefig('docs/plots/heatmap_pps.png', bbox_inches='tight')
    plt.show()


def CalculateCorrAndPPS(df):
    """
    Compute the Pearson and Spearman correlation matrices and the PPS matrix.

    Each correlation matrix is tagged with a ``name`` attribute
    ("corr_pearson" / "corr_spearman"). Summary statistics of the PPS scores
    below 1 are printed to help choose a heatmap threshold.

    Returns:
        (df_corr_pearson, df_corr_spearman, pps_matrix)
    """
    correlations = {}
    for method in ("spearman", "pearson"):
        matrix = df.corr(method=method)
        matrix.name = f"corr_{method}"
        correlations[method] = matrix

    # ppscore returns one row per (x, y) pair; reshape into a y-by-x grid.
    pps_long = pps.matrix(df)
    pps_scores = pps_long.filter(['x', 'y', 'ppscore'])
    pps_matrix = pps_scores.pivot(columns='x', index='y', values='ppscore')

    scores_below_one = pps_long.query("ppscore < 1").filter(['ppscore'])
    pps_score_stats = scores_below_one.describe().T
    print("PPS threshold - check PPS score IQR to decide threshold for heatmap \n")
    print(pps_score_stats.round(3))

    return correlations["pearson"], correlations["spearman"], pps_matrix


def DisplayCorrAndPPS(df_corr_pearson, df_corr_spearman, pps_matrix, CorrThreshold, PPS_Threshold,
                      figsize=(20, 12), font_annot=8):
    """
    Print a short guide and draw the Spearman, Pearson and PPS heatmaps.

    ``CorrThreshold`` is applied to both correlation heatmaps and
    ``PPS_Threshold`` to the PPS heatmap; ``figsize`` and ``font_annot`` are
    forwarded unchanged to every heatmap.
    """
    print("\n")
    print("* Analyze how the target variable for your ML models are correlated with other variables (features and target)")
    print("* Analyze multi-colinearity, that is, how the features are correlated among themselves")

    # (heading, description, matrix) for each correlation heatmap, in display order.
    correlation_sections = (
        ("*** Heatmap: Spearman Correlation ***",
         "It evaluates monotonic relationship \n",
         df_corr_spearman),
        ("*** Heatmap: Pearson Correlation ***",
         "It evaluates the linear relationship between two continuous variables \n",
         df_corr_pearson),
    )
    for heading, description, matrix in correlation_sections:
        print("\n")
        print(heading)
        print(description)
        heatmap_corr(df=matrix, threshold=CorrThreshold,
                     figsize=figsize, font_annot=font_annot)

    print("\n")
    print("*** Heatmap: Power Predictive Score (PPS) ***")
    print("PPS detects linear or non-linear relationships between two columns.\n"
          "The score ranges from 0 (no predictive power) to 1 (perfect predictive power) \n")
    heatmap_pps(df=pps_matrix, threshold=PPS_Threshold,
                figsize=figsize, font_annot=font_annot)

Suppresses FutureWarnings and calls the CalculateCorrAndPPS function on the DataFrame df_ohe. The function computes the Spearman and Pearson correlation matrices as well as the Predictive Power Score (PPS) matrix. The results are stored in the variables df_corr_pearson, df_corr_spearman, and pps_matrix for further analysis or visualization.

In [None]:
import warnings
# Silence FutureWarning messages from here on (the filter is process-wide).
warnings.simplefilter(action='ignore', category=FutureWarning)
# Compute both correlation matrices and the PPS matrix on the encoded data.
df_corr_pearson, df_corr_spearman, pps_matrix = CalculateCorrAndPPS(df_ohe)

Calls the DisplayCorrAndPPS function to visualize the correlation and Predictive Power Score (PPS) matrices.

In [None]:
# Draw the three heatmaps, hiding correlations with absolute value below 0.4
# and PPS scores below 0.2.
DisplayCorrAndPPS(df_corr_pearson = df_corr_pearson,
                  df_corr_spearman = df_corr_spearman, 
                  pps_matrix = pps_matrix,
                  CorrThreshold = 0.4, PPS_Threshold =0.2,
                  figsize=(12,10), font_annot=10)

Calculates the Spearman correlation coefficients between all columns in the df_ohe DataFrame and the target variable `SalePrice`

In [None]:
# Ten features with the strongest absolute Spearman correlation to SalePrice.
# The target's self-correlation is removed by label instead of by position,
# so the result stays correct even if another column ties with it at 1.0.
corr_spearman = (
    df_ohe.corr(method='spearman')['SalePrice']
    .drop('SalePrice')
    .sort_values(key=abs, ascending=False)
    .head(10)
)
corr_spearman

Calculates the Pearson correlation coefficients between all columns in the df_ohe DataFrame and the target variable `SalePrice`

In [None]:
# Ten features with the strongest absolute Pearson correlation to SalePrice.
# The target's self-correlation is removed by label instead of by position,
# so the result stays correct even if another column ties with it at 1.0.
corr_pearson = (
    df_ohe.corr(method='pearson')['SalePrice']
    .drop('SalePrice')
    .sort_values(key=abs, ascending=False)
    .head(10)
)
corr_pearson

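Creates a set containing the union of the top 10 features by Pearson correlation and the top 10 features by Spearman correlation with `SalePrice` (the set can therefore hold more than 10 features).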

In [None]:
top_n = 10
# Feature names of the strongest correlations under each method.
top_pearson = corr_pearson.index[:top_n]
top_spearman = corr_spearman.index[:top_n]
# Union of both rankings; displayed as the cell output.
set(top_pearson).union(top_spearman)

Defines a list called vars_to_study that contains variables which are considered important for further analysis.

In [None]:
# Features selected for the detailed plots drawn later in this notebook.
vars_to_study = [
    '1stFlrSF',
    'GarageArea',
    'GrLivArea',
    'KitchenQual',
    'MasVnrArea',
    'OpenPorchSF',
    'OverallQual',
    'TotalBsmtSF',
    'YearBuilt',
    'YearRemodAdd'
]
# Bare expression: displays the list as the cell output.
vars_to_study

Creates a new DataFrame called df_eda by filtering the original df DataFrame to include only the columns specified in the vars_to_study and target variable `SalePrice`

In [None]:
# Keep only the studied features plus the target, taken from the original
# (not one-hot encoded) DataFrame.
df_eda = df.filter(vars_to_study + ['SalePrice'])
df_eda.head()

Defines a function to create and save a histogram of the `SalePrice` distribution with a KDE overlay.

In [None]:
# Apply seaborn's 'whitegrid' style to the plots that follow.
sns.set_style('whitegrid')
# Name of the target column used by the plotting code below.
target_var = 'SalePrice'

def plot_target_hist(df, target_var):
    """
    Draw the distribution of the target as a histogram with a KDE curve.

    The figure is written to docs/plots/hist_plot_<target_var>.png
    (for use in the dashboard) and then displayed.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(data=df, x=target_var, kde=True, ax=ax)
    ax.set_title(f"Distribution of {target_var}", fontsize=20)
    fig.savefig(f'docs/plots/hist_plot_{target_var}.png', bbox_inches='tight')
    plt.show()

# Plot and save the target distribution using the full cleaned dataset.
plot_target_hist(df, target_var)

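Defines three functions to visualize the relationship between the target variable `SalePrice` and various features, then plots every variable in `vars_to_study`: a box plot for variables with at most 10 unique values, a line plot for the year variables, and a regression plot for the rest.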

In [None]:
# Year-based variables; the loop at the end of this cell draws these as line plots.
# NOTE(review): the name `time` would shadow the stdlib `time` module if that
# module were imported in this notebook.
time = ['YearBuilt', 'YearRemodAdd']

def plot_lm(df, col, target_var):
    """
    Draw a scatter plot with a fitted regression line of the target against
    a feature with continuous values.

    The figure is saved to docs/plots/lm_plot_price_by_<col>.png and then
    displayed.

    Parameters:
        df: DataFrame holding both columns.
        col: name of the feature column (x axis).
        target_var: name of the target column (y axis).
    """
    # lmplot is a figure-level function: it always builds its own figure, so
    # the size has to be given through height/aspect. Opening a figure with
    # plt.figure(figsize=(12, 5)) beforehand only left an empty extra figure
    # behind and the requested size was ignored.
    sns.lmplot(data=df, x=col, y=target_var, ci=None, height=5, aspect=12 / 5)
    plt.title(f"{col}", fontsize=20)
    plt.savefig(f'docs/plots/lm_plot_price_by_{col}.png', bbox_inches='tight')
    plt.show()

def plot_line(df, col, target_var):
    """
    Draw a line plot of the target against a time variable.

    The figure is saved to docs/plots/line_plot_price_by_<col>.png and then
    displayed.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.lineplot(data=df, x=col, y=target_var, ax=ax)
    ax.set_title(f"{col}", fontsize=20)
    fig.savefig(f'docs/plots/line_plot_price_by_{col}.png', bbox_inches='tight')
    plt.show()

def plot_box(df, col, target_var):
    """
    Draw a box plot of the target for each value of a categorical variable.

    The figure is saved to docs/plots/box_plot_price_by_<col>.png and then
    displayed.

    Parameters:
        df: DataFrame holding both columns.
        col: name of the categorical column (x axis).
        target_var: name of the target column (y axis).
    """
    plt.figure(figsize=(8, 5))
    sns.boxplot(data=df, x=col, y=target_var)
    plt.title(f"{col}", fontsize=20)
    # Spell out the .png extension, as the other plot functions do, rather
    # than relying on matplotlib's default save format to supply it.
    plt.savefig(f'docs/plots/box_plot_price_by_{col}.png', bbox_inches='tight')
    plt.show()


# Pick one plot type per studied variable, then draw it.
for col in vars_to_study:
    distinct_values = len(df_eda[col].unique())
    if distinct_values <= 10:
        # Few distinct values: treat the variable as categorical.
        plotter = plot_box
    elif col in time:
        plotter = plot_line
    else:
        plotter = plot_lm
    plotter(df_eda, col, target_var)
    print("\n\n")

## Conclusions and Next Steps