## Sample 1 of Exploratory Data Analysis
### EDA of a dataset of house prices and various attributes

In [None]:
import numpy as np
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
import seaborn as sns
import os

# The dataset is resolved relative to the current working directory.
# os.path.join builds the path with the platform's separator instead of a
# hand-concatenated '/'.
path = os.getcwd()
df = pd.read_csv(os.path.join(path, 'Data', 'House_Price.csv'))

In [None]:
#   Structural overview of the dataset; the info() summary is used to read off:
#   number of instances
#   number of features
#   number of categorical and numerical features
df.info()

In [None]:
#   summary statistics of the dataset (describe() called with its default arguments)
df.describe()

In [None]:
#   numerical features most strongly correlated with the target "SalePrice" (Pearson)
#   NOTE: a column correlates perfectly with itself, so "SalePrice" is expected to
#   sort first; the six-row slice therefore holds the target plus five features.
saleprice_corr = df.corr(method='pearson', numeric_only=True)['SalePrice']
top_correlated = saleprice_corr.sort_values(ascending=False)
top5 = top_correlated.iloc[0:6]
df[top5.index.tolist()].describe()

In [None]:
#   distribution plots for house prices
from sklearn.preprocessing import KBinsDiscretizer
from scipy.stats import skew, kurtosis

# Column names of the selected entries, in the order they appear in top5.
top5list = list(top5.index)

# One figure per selected column: a histogram of the raw values, captioned
# with the sample skewness and kurtosis (scipy defaults).
for fig_num, feature in enumerate(top5list):
    values = df[feature]
    caption = 'skewness: '+str(skew(values))+'\nkurtosis: '+str(kurtosis(values))
    plt.figure(fig_num)
    plt.title(feature)
    plt.figtext(0,0,caption)
    plt.hist(values)

In [None]:
#   with discretizer
# Number of uniform-width bins hand-picked for each feature. Keying on the
# column name (rather than on the loop position) keeps the histogram, its
# title and its skewness/kurtosis caption tied to the same column even if the
# correlation ranking stored in top5list changes.
discretizer_bins = {
    'SalePrice': 8,
    'OverallQual': 10,
    'GrLivArea': 7,
    'GarageCars': 5,
    'GarageArea': 9,
    'TotalBsmtSF': 7,
}
# Fallback for a feature without a hand-picked bin count
# (5 is also KBinsDiscretizer's own default for n_bins).
default_bins = 5

for i, feature in enumerate(top5list):
    values = df[feature]
    out = 'skewness: '+str(skew(values))+'\nkurtosis: '+str(kurtosis(values))
    plt.figure(i)
    plt.title(feature)
    plt.figtext(0,0,out)

    est = KBinsDiscretizer(
        n_bins=discretizer_bins.get(feature, default_bins),
        encode='ordinal',
        strategy='uniform',
    )
    # The discretizer expects 2-D input; to_numpy() yields a plain ndarray that
    # can always be reshaped into a single column.
    X = values.to_numpy().reshape(-1,1)
    Xt = est.fit_transform(X)
    # Histogram of the ordinal bin indices produced by the discretizer.
    plt.hist(Xt)

In [None]:
#   check for missing values
# Count NaNs per column, keep only the columns that have any, largest first.
missing_values = df.isna().sum()
missing_values = missing_values[missing_values != 0].sort_values(ascending=False)
print('number of features missing values: '+str(len(missing_values)))

# Use the frame's actual row count instead of a hard-coded 1460 so the
# reported fractions stay correct for any dataset size.
n_rows = len(df)
for feature, value in missing_values.items():
    # NOTE: despite the 'pct' label this is a fraction (missing / rows),
    # not a value scaled to 0-100.
    missing_pct = value/n_rows
    print(feature+' missing: '+str(value)+' pct: '+str(missing_pct))

3. EDA using clustering is very useful for understanding the important characteristics of the data.
Provide a further EDA on the dataset using hierarchical clustering on the 5 numerical features found in 1(b)
to answer the question — “Do house prices vary by neighbourhood?”. Report the output dendrogram
and any other plots, and show how they help you answer the question.

In [None]:
#   dendrogram output
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder

#   make df with top5 and neighbourhood
# top5list[0] is skipped; the remaining entries plus 'Neighborhood' form the
# clustering input. .copy() gives an independent frame so the column
# assignment below does not write into a selection of df (this avoids pandas'
# SettingWithCopyWarning).
X = df[top5list[1:]+['Neighborhood']].copy()

#   encode neighbourhood labels into numbers
# OrdinalEncoder expects 2-D input, hence the single-column reshape;
# to_numpy() yields a plain ndarray that can always be reshaped.
# NOTE(review): ordinal codes impose an arbitrary order on a nominal feature,
# which feeds into the distances used for clustering - confirm this is intended.
Y = df['Neighborhood'].to_numpy().reshape(-1,1)
enc = OrdinalEncoder()
Y = enc.fit_transform(Y)
# Y is a single-column 2-D array; flatten it to assign it as one column.
X['Neighborhood'] = Y.ravel()

#   scale all values in df before agg clustering
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a linkage matrix from the model's ``children_``, ``distances_`` and
    ``labels_`` attributes and hands it to ``dendrogram``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``children_`` (one row of child ids per
        merge), ``distances_`` (one value per merge) and ``labels_`` (one
        entry per sample).
    **kwargs
        Forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples under each merge node. Ids below n_leaves are
    # single samples; larger ids point at the earlier merge (id - n_leaves).
    node_sizes = np.zeros(merges.shape[0])
    for row, children in enumerate(merges):
        node_sizes[row] = sum(
            1 if child < n_leaves else node_sizes[child - n_leaves]
            for child in children
        )

    # Columns: the child ids, the merge distance, samples in the new node.
    linkage = np.column_stack([merges, model.distances_, node_sizes]).astype(float)

    dendrogram(linkage, **kwargs)

# distance_threshold=0 together with n_clusters=None is used so that the
# full merge tree is computed.
agg_clusters = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
agg_clusters.fit(X)

plt.title("Hierarchical Agglomerative Clustering Dendrogram")
# draw the dendrogram, truncated with truncate_mode="level" and p=2
plot_dendrogram(agg_clusters, p=2, truncate_mode="level")
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()