# Exploratory Data Analysis

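This notebook performs exploratory data analysis (EDA) on the CSV datasets in the MedHack AI Hospital archive. It loads every CSV file in the data folder, summarizes key statistics for each one, and plots missing values, correlations, and feature distributions.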

In [None]:
%pip install seaborn

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Define the path to CSV files.
# The MEDHACK_CSV_DIR environment variable overrides the default so the notebook
# is not tied to a single machine; the original local path stays as the fallback.
DEFAULT_CSV_DIR = r"C:\RnD\MedHack\archive\MedHack AI Hospital\csv"
csv_path = Path(os.environ.get("MEDHACK_CSV_DIR", DEFAULT_CSV_DIR))


def list_csv_files(directory):
    """Return the ``*.csv`` files directly inside ``directory``, sorted by path.

    Args:
        directory (str | Path): Folder to search (not recursive).

    Returns:
        list[Path]: Sorted matches; an empty list when the folder does not
        exist or contains no CSV files.
    """
    directory = Path(directory)
    if not directory.is_dir():
        return []
    # glob() yields entries in filesystem order; sort so every run lists and
    # loads the datasets in the same order.
    return sorted(directory.glob('*.csv'))


def load_csv_files(files):
    """Read each CSV file into a DataFrame keyed by its file name without extension.

    A file that cannot be parsed is reported with a printed message and
    skipped, so one bad file does not stop the remaining files from loading.

    Args:
        files (iterable of Path): CSV files to read.

    Returns:
        dict[str, pandas.DataFrame]: Successfully loaded datasets, in input order.
    """
    loaded = {}
    for file in files:
        try:
            loaded[file.stem] = pd.read_csv(file)
            print(f"Successfully loaded: {file.name}")
        except Exception as e:
            print(f"Error loading {file.name}: {str(e)}")
    return loaded


# List all CSV files in the directory
csv_files = list_csv_files(csv_path)
if csv_files:
    print("Available CSV files:")
    for file in csv_files:
        print(f"- {file.name}")
else:
    # Make the most likely setup problem visible instead of silently loading nothing.
    print(f"No CSV files found in {csv_path} - set MEDHACK_CSV_DIR to the data folder.")

# Load all CSV files into a dictionary
data_dict = load_csv_files(csv_files)

# Display the first few rows of each dataset
for name, df in data_dict.items():
    print(f"\nFirst few rows of {name}:")
    display(df.head())

In [None]:
# Basic data profiling for each dataset
# Cap on bars in the categorical plot so ID-like text columns stay readable.
MAX_BAR_CATEGORIES = 20

for name, df in data_dict.items():
    print(f"\n=== Analysis for {name} dataset ===")
    print("\nDataset Shape:", df.shape)
    print("\nColumn Info:")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would add a stray "None" line).
    df.info()

    if df.empty:
        # A frame with zero rows or zero columns has nothing to summarize, and
        # the heatmaps below cannot be drawn from zero-size data.
        print("\nDataset is empty - skipping statistics and plots.")
        continue

    print("\nSummary Statistics:")
    display(df.describe())
    print("\nMissing Values:")
    print(df.isnull().sum())

    # 'number' selects every numeric dtype, not only int64/float64.
    numerical_cols = df.select_dtypes(include='number').columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    # Visualizations: one 2x2 figure per dataset, using explicit axes so each
    # plot lands in a known panel.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    ax_missing, ax_corr, ax_dist, ax_cat = axes.flat

    # Missing values heatmap
    sns.heatmap(df.isnull(), yticklabels=False, cmap='viridis', cbar=False, ax=ax_missing)
    ax_missing.set_title(f'Missing Values in {name}')

    if len(numerical_cols) > 0:
        # Correlation matrix for numerical columns
        sns.heatmap(df[numerical_cols].corr(), annot=True, cmap='coolwarm', ax=ax_corr)
        ax_corr.set_title(f'Correlation Matrix for {name}')

        # Distribution plots for numerical columns
        for col in numerical_cols[:3]:  # Limit to first 3 columns
            sns.kdeplot(data=df[col], label=col, ax=ax_dist)
        ax_dist.set_title('Distribution of Numerical Features')
        ax_dist.legend()
    else:
        # No numeric data: hide the two panels rather than leave empty frames.
        ax_corr.set_visible(False)
        ax_dist.set_visible(False)

    # Bar plot for the first categorical column
    if len(categorical_cols) > 0:
        col = categorical_cols[0]
        counts = df[col].value_counts()
        shown = counts.head(MAX_BAR_CATEGORIES)  # value_counts() is sorted, so these are the most frequent
        shown.plot(kind='bar', ax=ax_cat)
        title = f'Distribution of {col}'
        if len(counts) > len(shown):
            title += f' (top {len(shown)} of {len(counts)})'
        ax_cat.set_title(title)
        ax_cat.tick_params(axis='x', rotation=45)
    else:
        ax_cat.set_visible(False)

    fig.tight_layout()
    plt.show()
    # Release the figure; otherwise every dataset in the loop keeps one alive.
    plt.close(fig)

    # Display unique values in categorical columns
    if len(categorical_cols) > 0:
        print("\nUnique values in categorical columns:")
        for col in categorical_cols:
            print(f"\n{col}:")
            print(df[col].value_counts())

In [None]:
# Visualizing missing values for all datasets
def plot_missing_values(data_dictionary):
    """Draw one missing-value heatmap per dataset on a grid at most two columns wide.

    Each panel title shows the dataset name and the percentage of cells that
    are null. Datasets with no cells get a placeholder panel instead of a
    heatmap.

    Args:
        data_dictionary (dict): Maps dataset name -> pandas DataFrame.

    Returns:
        None. The figure is rendered with ``plt.show()`` and then closed.
    """
    n_datasets = len(data_dictionary)
    if n_datasets == 0:
        # Zero datasets would produce a 0x0 figure size, which matplotlib rejects.
        print("No datasets to plot.")
        return

    # Calculate the subplot layout
    n_cols = min(2, n_datasets)
    n_rows = (n_datasets + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(12*n_cols, 6*n_rows),
                             squeeze=False)
    flat_axes = axes.ravel()

    # Plot missing values for each dataset
    for ax, (name, df) in zip(flat_axes, data_dictionary.items()):
        null_mask = df.isnull()

        if null_mask.size == 0:
            # A zero-size frame cannot be drawn as a heatmap and would make the
            # percentage below a division by zero.
            ax.text(0.5, 0.5, 'No data', ha='center', va='center',
                    transform=ax.transAxes)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_title(f'Missing Values in {name}\n(empty dataset)')
            continue

        sns.heatmap(null_mask,
                    cbar=False,
                    cmap='viridis',
                    yticklabels=False,
                    ax=ax)

        # Share of all cells (rows x columns) that are null
        missing_percent = (null_mask.sum().sum() / null_mask.size) * 100

        ax.set_title(f'Missing Values in {name}\n{missing_percent:.1f}% missing')
        ax.set_xlabel('Features')

    # Hide the spare panel left over when the dataset count is odd.
    for ax in flat_axes[n_datasets:]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()
    plt.close(fig)

# Render the missing-value overview for every dataset loaded above
plot_missing_values(data_dictionary=data_dict)

In [None]:
def plot_value_counts(data_dictionary, dataset_name, column_name, top_n=None):
    """
    Create a count plot for a specific column in a dataset.

    Problems (unknown dataset or column, plotting failures) are reported with
    a printed message instead of being raised, so a notebook run is not
    interrupted.

    Args:
        data_dictionary (dict): Dictionary containing DataFrames
        dataset_name (str): Name of the dataset to plot
        column_name (str): Name of the column to plot
        top_n (int, optional): When given, plot only the ``top_n`` most
            frequent values, ordered by frequency. Defaults to None, which
            plots every value.

    Returns:
        None
    """
    fig = None
    try:
        # Validate inputs. The message is printed directly rather than raised
        # as KeyError and formatted with str(), because str(KeyError) wraps
        # the text in an extra pair of repr quotes.
        if dataset_name not in data_dictionary:
            print(f"Error creating count plot: Dataset '{dataset_name}' not found in data dictionary")
            return

        df = data_dictionary[dataset_name]
        if column_name not in df.columns:
            print(f"Error creating count plot: Column '{column_name}' not found in dataset '{dataset_name}'")
            return

        # Optionally restrict the plot to the most frequent values.
        order = None
        if top_n is not None:
            order = df[column_name].value_counts().index[:top_n].tolist()

        # Create plot
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.countplot(x=column_name, data=df, order=order, ax=ax)

        # Customize plot
        ax.set_title(f'Distribution of {column_name} in {dataset_name}')
        ax.set_xlabel(column_name)
        ax.set_ylabel('Count')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

        fig.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Error creating count plot: {str(e)}")
    finally:
        # Close the figure on success and on failure so it is not leaked.
        if fig is not None:
            plt.close(fig)

# Example usage:
# plot_value_counts(data_dict, 'dataset_name', 'column_name')