# In [None]:
# notebooks/exploratory_data_analysis/eda.ipynb

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display configuration: never truncate the column list when printing a DataFrame,
# and apply seaborn's "whitegrid" style to all subsequent plots.
pd.options.display.max_columns = None
sns.set(style="whitegrid")

# Load the dataset
data_path = 'data/processed/processed_training_data.csv'  # Adjust the path as needed
df = pd.read_csv(data_path)

# A bare expression is only rendered when it is the last statement of a notebook
# cell (and never when run as a script), so the previews are printed explicitly.

# First few rows of the dataset
print(df.head())

# Column dtypes, non-null counts and memory usage (info() prints on its own)
df.info()

# Summary statistics of the numerical columns
print(df.describe())

# Missing data: count the null entries per column, then express each count as a
# percentage of the total number of rows.
row_count = len(df)
missing_values = df.isna().sum()
missing_percentage = missing_values.div(row_count).mul(100)
print("Missing values percentage:\n", missing_percentage)

# One histogram (30 bins, KDE overlay) per numeric column.
# `numerical_features` is reused by later sections of this script.
numerical_features = df.select_dtypes(include=[np.number]).columns
for column in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(df[column], kde=True, bins=30, color='skyblue', ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()

# Correlation heatmap to understand relationships between numerical features.
# Only numeric columns are correlated: since pandas 2.0 DataFrame.corr() defaults to
# numeric_only=False and raises when the frame contains string columns.
plt.figure(figsize=(12, 8))
corr_matrix = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of Features')
plt.show()

# Visualize relationships between target and features (if applicable)
if 'target' in df.columns:
    target_column = 'target'

    # Scatter plot for each numerical feature vs target.
    for feature in numerical_features:
        # A numeric target is itself part of numerical_features; plotting it
        # against itself only produces an uninformative diagonal, so skip it.
        if feature == target_column:
            continue
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=df[feature], y=df[target_column], color='green')
        plt.title(f'Relationship between {feature} and {target_column}')
        plt.xlabel(feature)
        plt.ylabel(target_column)
        plt.show()

# One count plot per object-dtype column, showing how often each category occurs.
categorical_features = df.select_dtypes(include=[object]).columns
for column in categorical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(x=df[column], palette='Set2', ax=ax)
    ax.set_title(f'Count of categories in {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    plt.show()

# Pairplot for initial feature relationships, drawn on a sample of at most 100 rows
# to keep rendering time reasonable on large datasets.
# The sample size is capped at the number of rows because DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement). A fixed seed
# keeps the plot reproducible between runs.
pairplot_rows = min(100, len(df))
sns.pairplot(df[numerical_features].sample(n=pairplot_rows, random_state=42))
plt.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()

# Feature Importance (if applicable, for example, using a RandomForest)
from sklearn.ensemble import RandomForestRegressor

if 'target' in df.columns:
    # Set here as well so this section does not depend on the scatter-plot
    # section having defined it.
    target_column = 'target'

    # Rows without a target value cannot be used for fitting.
    labelled = df.dropna(subset=[target_column])
    y = labelled[target_column]

    # The forest needs purely numeric input: keep the numeric predictors only
    # (string columns would need encoding first and are left out of the ranking),
    # drop columns with no values at all, and fill remaining gaps with the
    # column median.
    X = labelled.drop(columns=[target_column]).select_dtypes(include=[np.number])
    X = X.dropna(axis=1, how='all')
    X = X.fillna(X.median())

    if X.empty:
        print("Skipping feature importance: no usable numeric features or rows.")
    else:
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X, y)

        # Impurity-based importance of each feature, most important first
        importance = model.feature_importances_
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': importance
        }).sort_values(by='importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='importance', y='feature', data=feature_importance)
        plt.title('Feature Importance')
        plt.show()

# Write the DataFrame to CSV (without the index column) for further analysis and
# model training, then report where it was written.
# NOTE(review): nothing in the visible code modifies `df` after loading, so the file
# appears to be an unchanged copy of the input - confirm that "cleaned" is intended.
df_cleaned_path = 'data/processed/cleaned_data.csv'  # Adjust the path as needed
df.to_csv(path_or_buf=df_cleaned_path, index=False)
print(f"Cleaned data saved to {df_cleaned_path}")
