# Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.notebook.pandas_utils import display_dfs, get_dfs_info, read_file

## Summary Statistics

In [None]:
# Load the dataset ('PATH' looks like a placeholder for the real file location).
file_path = 'PATH'
df = read_file(file_path)

# Gather the overview tables first, then hand them to the display helper in one call.
overview_tables = {
    'Summary': get_dfs_info(df),
    'Df Describe': df.describe(),
}
display_dfs(overview_tables)

## Univariate Analysis

In [None]:
# Univariate view 1: histogram with a KDE overlay for a single numeric column.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
numeric_values = df['numerical_variable']
sns.histplot(numeric_values, kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Numerical Variable')
hist_ax.set_xlabel('Numerical Variable')
hist_ax.set_ylabel('Frequency')
plt.show()

# Univariate view 2: one bar per category showing how many rows fall in it.
count_fig, count_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='categorical_variable', data=df, ax=count_ax)
count_ax.set_title('Distribution of Categorical Variable')
count_ax.set_xlabel('Categorical Variable')
count_ax.set_ylabel('Count')
# Tilt the category labels; count_ax is still the current axes at this point.
plt.xticks(rotation=45)
plt.show()

## Bivariate/Multivariate Analysis

In [None]:
# Correlation heatmap across the numeric columns of df.
#
# numeric_only=True keeps non-numeric columns (this notebook also uses a
# 'categorical_variable' column) out of the computation. Since pandas 2.0,
# DataFrame.corr() no longer drops such columns on its own and raises when it
# meets values it cannot convert to float. The keyword needs pandas >= 1.5.
plt.figure(figsize=(10, 8))
correlation_matrix = df.corr(numeric_only=True)
# annot + fmt print each coefficient in its cell, rounded to two decimals.
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Bivariate view: one numeric column plotted against another.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='numerical_variable_1',
    y='numerical_variable_2',
    ax=scatter_ax,
)
scatter_ax.set_title('Relationship between Two Numerical Variables')
scatter_ax.set_xlabel('Numerical Variable 1')
scatter_ax.set_ylabel('Numerical Variable 2')
plt.show()

In [None]:
# Restrict the pair plot to a handful of columns so the grid stays readable;
# the categorical column is used to colour the points.
pairplot_columns = [
    'numerical_variable_1',
    'numerical_variable_2',
    'categorical_variable',
]
subset_df = df[pairplot_columns]
sns.pairplot(subset_df, hue='categorical_variable')
plt.show()

## Detecting and Handling Outliers

In [None]:
"""Visualizing Outliers"""
# Box plot of a single column, used here to spot outliers visually.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df['column_name'], ax=box_ax)
box_ax.set_title('Box Plot for Detecting Outliers in Numerical Variable')
plt.show()

# Scatter of the same column against a second column 'y'
# (only applicable when the data actually has such a column).
x_values = df['column_name']
y_values = df['y']
plt.scatter(x_values, y_values)
plt.show()

In [None]:
"""Detecting Outliers with Statistical Methods (Interquartile Range)"""
# Column under inspection. Every step below reads this one name so that the
# bounds, the reported count, and the handling all refer to the same data.
outlier_column = 'column_name'
column_values = df[outlier_column]

# IQR fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1 = column_values.quantile(0.25)
Q3 = column_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# One mask drives both the report and the removal. NaN compares False on both
# sides, so rows with a missing value are not counted as outliers and are kept.
is_outlier = (column_values < lower_bound) | (column_values > upper_bound)
outliers = df[is_outlier]
print(f"Number of outliers in '{outlier_column}': {outliers.shape[0]}")


"""Handling Outliers"""
# Removing outliers: a filtered frame; df itself is untouched by this step.
df_no_outliers = df[~is_outlier]

# Capping outliers: pull values outside the fences back onto the nearest fence.
# NOTE: this overwrites the column in df, so re-running the cell recomputes the
# quartiles from already-capped data.
df[outlier_column] = column_values.clip(lower=lower_bound, upper=upper_bound)

## Identifying Patterns

## Preliminary Hypothesis Testing

## Summary of Insights