# Perform exploratory data analysis

This is a regression problem with around 80 features: a mix of numerical features and categorical features stored as strings. The target variable is `SalePrice`.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Lift the column display limit so wide frames are shown without truncation
pd.options.display.max_columns = None


In [None]:
# Load the raw training set from disk; every later cell derives from this frame
train = pd.read_csv(filepath_or_buffer='../data/raw/train.csv')

In [None]:
# Work on a copy of the training data without the Id column,
# then report its dimensions and the per-column dtype / non-null summary
df = train.drop(columns=['Id'])
print(df.shape)
df.info()

In [None]:
# Summary statistics for the numeric columns (describe() skips non-numeric
# columns by default when the frame has mixed dtypes)
df.describe()

In [None]:
# Summary of the object-dtype (string) columns: count, unique, top, freq
df.describe(include=['object'])

In [None]:
# Number of missing entries in every column
missing = df.isna().sum()
missing

In [None]:
# Split the column labels by dtype: 64-bit ints/floats versus object (string) columns
numerical = df.select_dtypes(include=[np.int64, np.float64]).columns
categorical = df.select_dtypes(include='object').columns

In [None]:
# Separate frames holding only the numerical and only the categorical columns
n_df = df.loc[:, numerical]
c_df = df.loc[:, categorical]

In [None]:
# Draw one histogram per numerical column on a 16x20-inch figure
n_df.hist(figsize=(16, 20))

In [None]:
# Scatter plots of SalePrice against each numerical feature, five features
# per row. With distinct x_vars / y_vars, pairplot draws one scatter plot per
# (x, y) pair. The target is left out of the x axis: SalePrice plotted
# against itself is just a diagonal line and carries no information.
numerical_features = [col for col in numerical if col != 'SalePrice']
for i in range(0, len(numerical_features), 5):
    sns.pairplot(data=df,
                 x_vars=numerical_features[i:i+5],
                 y_vars=['SalePrice'])
    

In [None]:
# Plot SalePrice against the categorical features, five columns per grid row
for start in range(0, len(categorical), 5):
    chunk = categorical[start:start + 5]
    sns.pairplot(data=df, x_vars=chunk, y_vars=['SalePrice'])

In [None]:
# Heatmap of the pairwise correlations between the numerical columns
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(n_df.corr(), ax=ax)

In [None]:
# For every numerical column, record its correlation with SalePrice and the
# fraction (0..1) of its rows that are missing.
correlation_matrix = n_df.corr()
total = df.shape[0]

result_dict = {
    column: {
        'correlation': correlation_matrix['SalePrice'].get(column),
        'missing': df[column].isnull().sum() / total,
    }
    for column in n_df.columns
}


In [None]:

# Grouped bar chart: for each numerical feature, the fraction of missing
# values next to the absolute correlation with SalePrice (both in 0..1).

# Extract feature names, absolute correlations, and missing fractions
features = list(result_dict.keys())
correlations = [abs(result_dict[feature]['correlation']) for feature in features]
missing_values = [result_dict[feature]['missing'] for feature in features]

# Set up the figure and axis
fig, ax = plt.subplots(figsize=(10, 8))
index = np.arange(len(features))
bar_width = 0.35
opacity = 0.8

rects1 = ax.bar(index, missing_values, bar_width, alpha=opacity, color='b', label='Missing')
rects2 = ax.bar(index + bar_width, correlations, bar_width, alpha=opacity, color='g', label='Correlation')

ax.set_xlabel('Feature')
ax.set_ylabel('Scores')
ax.set_title('Scores by feature')
# Each pair of bars spans [index - w/2, index + 3w/2]; its centre is
# index + w/2, so the tick label goes there rather than under the second bar.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(features, rotation=90)
ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Peek at the first few rows that have no LotFrontage value
df.loc[df['LotFrontage'].isna()].head()

In [None]:
def correlation_ratio(categories, measurements):
    """Return the correlation ratio (eta) between a categorical and a numeric variable.

    Eta is the square root of the between-group sum of squares divided by the
    total sum of squares. It is 0 when all category means are equal and 1 when
    the measurements are constant within every category.

    Args:
        categories: 1-D sequence of category labels (e.g. a pandas Series).
            Missing labels (None / NaN) are excluded from the computation.
        measurements: 1-D sequence of numeric values of the same length,
            aligned with ``categories`` by position. Assumed to contain no
            NaN values; a NaN here propagates into the result.

    Returns:
        Eta as a float in [0, 1]. Returns 0.0 when there are no rows with a
        known category or when the measurements have no variance.
    """
    # factorize encodes missing labels as -1; wrapping in a Series lets plain
    # lists be passed as well.
    codes, _ = pd.factorize(pd.Series(categories))
    # Work on a plain array so selection is positional, whatever the index of
    # the incoming Series is.
    values = np.asarray(measurements, dtype=float)

    # Drop rows with an unknown category from BOTH the group statistics and
    # the total variance; otherwise the denominator covers rows the numerator
    # never saw and eta is biased low.
    known = codes >= 0
    codes = codes[known]
    values = values[known]
    if codes.size == 0:
        return 0.0

    group_sizes = np.bincount(codes)
    group_means = np.bincount(codes, weights=values) / group_sizes
    overall_mean = values.mean()

    numerator = np.sum(group_sizes * (group_means - overall_mean) ** 2)
    denominator = np.sum((values - overall_mean) ** 2)
    if numerator == 0 or denominator == 0:
        return 0.0
    return float(np.sqrt(numerator / denominator))

In [None]:
# Correlation ratio (eta) between every categorical column and SalePrice
correlation_ratios = {
    category: correlation_ratio(df[category], df['SalePrice'])
    for category in categorical
}

In [None]:
# Fraction (0..1) of missing entries for each categorical feature
total = df.shape[0]
missing = {
    category: df[category].isnull().sum() / total
    for category in categorical
}

# Pair each feature's missing fraction with its correlation ratio
combined = {
    category: (missing[category], correlation_ratios[category])
    for category in categorical
}

In [None]:
# Grouped bar chart: for each categorical feature, the fraction of missing
# values next to its correlation ratio with SalePrice (both in 0..1).
fig, ax = plt.subplots(figsize=(10, 8))
index = np.arange(len(categorical))
bar_width = 0.35
opacity = 0.8

# combined maps feature -> (missing fraction, correlation ratio)
missing_values = [val[0] for val in combined.values()]
correlation_values = [val[1] for val in combined.values()]

rects1 = ax.bar(index, missing_values, bar_width,
                alpha=opacity, color='b', label='Missing')

rects2 = ax.bar(index + bar_width, correlation_values, bar_width,
                alpha=opacity, color='g', label='Correlation')

ax.set_xlabel('Feature')
ax.set_ylabel('Scores')
ax.set_title('Scores by feature')
# Each pair of bars spans [index - w/2, index + 3w/2]; its centre is
# index + w/2, so the tick label goes there rather than under the second bar.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(categorical, rotation=90)
ax.legend()

fig.tight_layout()
plt.show()