In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
def load_and_explore_data(file_path):
    """
    Read a CSV file into a DataFrame and print a first-look summary.

    The printed summary covers the frame's shape, its deep memory footprint
    in MB, a tally of column dtypes, and up to ten columns ranked by how
    many values they are missing.

    Args:
        file_path (str): Path to the CSV file to read

    Returns:
        pandas.DataFrame: The loaded data, or None when ``file_path`` does
        not exist. Only FileNotFoundError is handled here; any other read
        error propagates to the caller.
    """
    print("üìä Loading and Exploring Hospital Readmission Data")
    print("=" * 50)

    # Read the file; a missing path is reported and signalled with None
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"‚ùå File not found: {file_path}")
        return None
    else:
        print(f"‚úÖ Data loaded successfully: {data.shape[0]} rows, {data.shape[1]} columns")

    # Size of the frame
    print("\nüìà Dataset Overview:")
    print(f"Shape: {data.shape}")
    print(f"Memory usage: {data.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # How many columns there are of each dtype
    print("\nüîß Data Types:")
    print(data.dtypes.value_counts())

    # Per-column null counts and percentages, worst offenders first
    print("\n‚ùì Missing Values Analysis:")
    null_counts = data.isnull().sum()
    missing_info = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing Percentage': (null_counts / len(data)) * 100,
    }).sort_values('Missing Count', ascending=False)

    # Keep only the columns that actually have gaps
    missing_columns = missing_info.loc[missing_info['Missing Count'] > 0]
    if missing_columns.empty:
        print("‚úÖ No missing values found!")
    else:
        print(f"Columns with missing values: {len(missing_columns)}")
        print(missing_columns.head(10))

    return data

In [None]:
def analyze_target_variable(data, target_column='readmission_risk'):
    """
    Summarise and plot the class distribution of the target column.

    Prints the value counts, draws a bar chart and a pie chart side by side,
    and reports the ratio between the rarest and the most common class
    count, warning when that ratio falls below 0.5.

    Args:
        data (pd.DataFrame): Input data
        target_column (str): Name of the target column

    Returns:
        None. The function returns early, after printing a message, when
        the column is absent from ``data`` or holds no non-null values.
    """
    if target_column not in data.columns:
        print(f"‚ùå Target column '{target_column}' not found in data")
        return

    print(f"\nüéØ Target Variable Analysis: {target_column}")
    print("=" * 40)

    # value_counts() drops NaN by default, so an all-null column produces an
    # empty Series; there is nothing to plot and min()/max() would be NaN.
    target_distribution = data[target_column].value_counts()
    if target_distribution.empty:
        print(f"‚ùå Target column '{target_column}' has no non-null values")
        return

    print("Distribution:")
    print(target_distribution)

    # Label derived from the column name so the plots stay correct for any
    # target; 'readmission_risk' becomes 'Readmission Risk'.
    target_label = str(target_column).replace('_', ' ').title()

    # Visualization: bar chart on the left, pie chart on the right
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(10, 5))

    target_distribution.plot(kind='bar', color=['skyblue', 'salmon'], ax=bar_ax)
    bar_ax.set_title(f'{target_label} Distribution')
    bar_ax.set_xlabel(target_label)
    bar_ax.set_ylabel('Count')
    bar_ax.tick_params(axis='x', rotation=45)

    pie_ax.pie(target_distribution.values, labels=target_distribution.index,
               autopct='%1.1f%%', colors=['lightgreen', 'lightcoral'])
    pie_ax.set_title(f'{target_label} Proportion')

    fig.tight_layout()
    plt.show()

    # Class imbalance check: smallest class count over largest class count
    imbalance_ratio = target_distribution.min() / target_distribution.max()
    print(f"\n‚öñÔ∏è Class Imbalance Ratio: {imbalance_ratio:.3f}")
    if imbalance_ratio < 0.5:
        print("‚ö†Ô∏è Significant class imbalance detected - consider resampling techniques")


In [None]:
def analyze_numerical_features(data, numerical_columns):
    """
    Analyze numerical features with statistics and visualizations

    Prints ``describe()`` for the requested columns, draws one histogram per
    column on a grid at most four plots wide, and, when more than one column
    is analysed, shows an annotated correlation heatmap.

    Args:
        data (pd.DataFrame): Input data
        numerical_columns (list): List of numerical column names. Names that
            are not present in ``data`` are reported and skipped.

    Returns:
        None. Returns early, after printing a message, when no columns are
        given or none of the requested columns exist in ``data``.
    """
    print(f"\nüî¢ Numerical Features Analysis")
    print("=" * 40)

    # Materialise first so that any iterable (not only a list) can be tested
    # for emptiness without an ambiguous truth-value error.
    requested = list(numerical_columns) if numerical_columns is not None else []
    if not requested:
        print("‚ùå No numerical columns provided")
        return

    # Unknown names are skipped rather than raising KeyError, matching how
    # analyze_categorical_features treats columns missing from the frame.
    columns = [col for col in requested if col in data.columns]
    skipped = [col for col in requested if col not in data.columns]
    if skipped:
        print(f"Skipping columns not found in data: {skipped}")
    if not columns:
        print("‚ùå None of the requested numerical columns exist in data")
        return

    # Basic statistics
    print("Descriptive Statistics:")
    print(data[columns].describe())

    # Grid layout: up to four histograms per row, as many rows as needed
    n_cols = min(4, len(columns))
    n_rows = (len(columns) + n_cols - 1) // n_cols

    # squeeze=False makes subplots() always return a 2-D array of Axes, so
    # one ravel() covers the 1x1, 1xN and MxN layouts alike.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, columns):
        data[col].hist(bins=30, ax=ax, alpha=0.7, color='skyblue')
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the unused cells in the last row of the grid
    for ax in axes[len(columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    # Correlation analysis only makes sense with at least two columns
    if len(columns) > 1:
        print("\nüìä Correlation Matrix:")
        plt.figure(figsize=(10, 8))
        correlation_matrix = data[columns].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                   square=True, fmt='.2f')
        plt.title('Correlation Matrix of Numerical Features')
        plt.tight_layout()
        plt.show()


In [None]:
def analyze_categorical_features(data, categorical_columns, target_column=None):
    """
    Print value counts and draw summary plots for categorical columns.

    For every listed column that exists in ``data`` this prints the number
    of unique values and the ten most frequent ones, then shows a figure
    with a bar chart of those ten values. When ``target_column`` is given
    and present in ``data``, a second panel shows the row-normalised
    cross-tabulation of the column against the target as stacked bars.

    Args:
        data (pd.DataFrame): Input data
        categorical_columns (list): List of categorical column names;
            names missing from ``data`` are silently skipped
        target_column (str): Optional target column for relationship analysis
    """
    print(f"\nüìã Categorical Features Analysis")
    print("=" * 45)

    if not categorical_columns:
        print("‚ùå No categorical columns provided")
        return

    # Only columns that actually exist in the frame are analysed
    for col in (name for name in categorical_columns if name in data.columns):
        print(f"\nüìà Analysis of '{col}':")
        counts = data[col].value_counts()
        top_values = counts.head(10)
        print(f"Unique values: {data[col].nunique()}")
        print("Top 10 values:")
        print(top_values)

        plt.figure(figsize=(12, 5))

        # Left panel: frequency of the ten most common values
        plt.subplot(1, 2, 1)
        top_values.plot(kind='bar', color='lightseagreen')
        plt.title(f'Top 10 Values in {col}')
        plt.xticks(rotation=45)

        # Right panel: share of each target value within every category,
        # drawn only when a usable target column was supplied
        show_target = bool(target_column) and target_column in data.columns
        if show_target:
            plt.subplot(1, 2, 2)
            cross_tab = pd.crosstab(data[col], data[target_column], normalize='index')
            cross_tab.plot(kind='bar', stacked=True, ax=plt.gca(),
                          color=['lightcoral', 'lightgreen'])
            plt.title(f'{col} vs {target_column}')
            plt.xticks(rotation=45)
            plt.legend(title=target_column)

        plt.tight_layout()
        plt.show()


In [None]:
def generate_data_quality_report(data):
    """
    Build a per-column data quality table and print a short summary.

    The report has one row per column of ``data`` with its dtype, non-null
    and null counts, null percentage and number of unique values. The
    'Duplicate Rows' column carries the dataset-wide count of duplicated
    rows, repeated on every row. Numeric columns additionally receive
    Mean, Median, Std Dev, Min and Max; those cells stay NaN for
    non-numeric columns, and the five columns are not created at all when
    ``data`` has no numeric columns.

    Args:
        data (pd.DataFrame): Input data

    Returns:
        pd.DataFrame: Data quality report indexed by the column names of
        ``data``
    """
    print("üìã Generating Comprehensive Data Quality Report")
    print("=" * 50)

    # Basic metrics, added in a fixed order (assign keeps keyword order)
    null_counts = data.isnull().sum()
    quality_report = pd.DataFrame(index=data.columns).assign(**{
        'Data Type': data.dtypes,
        'Non-Null Count': data.count(),
        'Null Count': null_counts,
        'Null Percentage': (null_counts / len(data)) * 100,
        'Unique Values': data.nunique(),
        # scalar for the whole dataset, broadcast to every row
        'Duplicate Rows': data.duplicated().sum(),
    })

    # Summary statistics, filled in cell by cell for numeric columns only
    stat_methods = (
        ('Mean', 'mean'),
        ('Median', 'median'),
        ('Std Dev', 'std'),
        ('Min', 'min'),
        ('Max', 'max'),
    )
    for col in data.select_dtypes(include=[np.number]).columns:
        series = data[col]
        for label, method_name in stat_methods:
            quality_report.loc[col, label] = getattr(series, method_name)()

    print(f"‚úÖ Data Quality Report Generated")
    print(f"üìä Dataset Shape: {data.shape}")
    print(f"‚ùì Total Missing Values: {quality_report['Null Count'].sum()}")
    print(f"üîÑ Duplicate Rows: {quality_report['Duplicate Rows'].max()}")

    return quality_report

# Example usage in the notebook
if __name__ == "__main__":
    # One name for the target column, used by every step below, so the
    # target analysis and the categorical analysis cannot drift apart.
    target_column = 'readmission_risk'

    # Load data; load_and_explore_data returns None when the file is missing
    data = load_and_explore_data('../data/sample_data.csv')

    if data is not None:
        # Generate quality report
        quality_report = generate_data_quality_report(data)

        # Identify numerical and categorical columns by dtype
        numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

        print(f"\nüî¢ Numerical columns: {len(numerical_cols)}")
        print(f"üìã Categorical columns: {len(categorical_cols)}")

        # Analyze target variable
        analyze_target_variable(data, target_column)

        # Analyze numerical features
        if numerical_cols:
            analyze_numerical_features(data, numerical_cols)

        # Analyze categorical features. The target itself is left out of the
        # feature list: it already has its own analysis above, and
        # cross-tabulating it against itself carries no information.
        categorical_features = [col for col in categorical_cols if col != target_column]
        if categorical_features:
            analyze_categorical_features(data, categorical_features, target_column)