In [None]:
# Notebook shell escape (not Python syntax): upgrades the ipykernel package through uv's pip interface
! uv pip install --upgrade ipykernel

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import os
from datetime import datetime
import warnings
# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: base matplotlib style, seaborn palette, default figure and font sizes
plt.style.use('seaborn-v0_8-white')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': [12, 8],
    'font.size': 12,
})

# Each run writes into its own timestamped folder, created up front
output_dir = "eda_plots_" + datetime.now().strftime('%Y%m%d_%H%M%S')
os.makedirs(output_dir, exist_ok=True)

def save_plot(name):
    """Save the current matplotlib figure as ``<output_dir>/<name>.png``, then close it.

    The image is written at 300 dpi with a tight bounding box.
    """
    target = f"{output_dir}/{name}.png"
    plt.savefig(target, bbox_inches='tight', dpi=300)
    plt.close()

def load_data():
    """Read the three input CSV files from the current working directory.

    Returns:
        tuple: ``(df1, df2_1, df2_2)`` read from ``test_1.csv``,
        ``test_2_1.csv`` and ``test_2_2.csv`` respectively. No further
        processing is applied to the frames.
    """
    csv_names = ('test_1.csv', 'test_2_1.csv', 'test_2_2.csv')
    df1, df2_1, df2_2 = [pd.read_csv(csv_name) for csv_name in csv_names]
    return df1, df2_1, df2_2

def plot_missing_values_percentage(df, title="Missing Values Percentage"):
    """Save a bar chart of the percentage of missing values in each column.

    Columns are placed on the y-axis, sorted by ascending missing share, and
    every bar is annotated with its percentage to one decimal place.

    Args:
        df: Dataframe whose columns are checked for nulls.
        title: Plot title; lower-cased with spaces replaced by underscores
            it also forms the suffix of the output file name.
    """
    null_counts = df.isnull().sum()
    pct_missing = (null_counts / len(df) * 100).sort_values(ascending=True)

    plt.figure(figsize=(12, 6))
    axis = sns.barplot(x=pct_missing.values, y=pct_missing.index)

    # Put the numeric label just past the end of each bar
    for row, pct in enumerate(pct_missing.values):
        axis.text(pct + 0.5, row, f'{pct:.1f}%', va='center')

    plt.title(title)
    plt.xlabel('Percentage of Missing Values')
    plt.ylabel('Columns')

    # tight_layout keeps long column names from being clipped
    plt.tight_layout()
    slug = title.lower().replace(' ', '_')
    save_plot(f"missing_values_percentage_{slug}")


def generate_basic_stats(df, title="Dataset Statistics"):
    """Write a plain-text summary of a dataframe into the output directory.

    The report lists the shape, the column names, the per-column count of
    missing values (only when at least one value is missing), the dtypes and
    the output of ``DataFrame.describe()``. It is saved as
    ``<output_dir>/<title>.txt`` where the title is lower-cased and spaces
    are replaced by underscores.

    Args:
        df: Dataframe to summarise.
        title: Report heading; also determines the output file name.
    """
    report = [
        f"\n{title}",
        "-" * 50,
        f"\nDataset Shape: {df.shape}",
        f"\nColumns: {df.columns.tolist()}",
    ]

    # Only columns that actually contain nulls are listed
    missing = df.isnull().sum()
    if missing.any():
        report += ["\nMissing Values:", str(missing[missing > 0])]

    report += [
        "\nData Types:",
        str(df.dtypes),
        "\nNumerical Statistics:",
        str(df.describe()),
    ]

    # Explicit UTF-8: column names or values with non-ASCII characters would
    # otherwise fail to encode on platforms with a narrower default encoding.
    report_path = f"{output_dir}/{title.lower().replace(' ', '_')}.txt"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

def plot_missing_values(df, title="Missing Values Heatmap"):
    """Save a heatmap of the dataframe's null mask (one cell per value).

    Args:
        df: Dataframe to inspect.
        title: Plot title; lower-cased with spaces replaced by underscores
            it also forms the suffix of the output file name.
    """
    plt.figure(figsize=(15, 8))
    null_mask = df.isnull()
    # Row tick labels are turned off; only the column axis is labelled
    sns.heatmap(null_mask, yticklabels=False, cbar=True, cmap='viridis')
    plt.title(title)
    plt.tight_layout()

    file_stem = "missing_values_" + title.lower().replace(' ', '_')
    save_plot(file_stem)

def plot_correlations(df, title="Correlation Matrix"):
    """Save an annotated correlation heatmap of the numeric columns.

    Pairwise correlations are computed with ``DataFrame.corr()`` over the
    numeric columns only and drawn on a diverging colour scale fixed to
    [-1, 1] and centred on 0. If the dataframe has no numeric columns the
    plot is skipped and a message is printed instead.

    Args:
        df: Dataframe to analyse.
        title: Plot title; lower-cased with spaces replaced by underscores
            it also forms the suffix of the output file name.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] == 0:
        # An empty correlation matrix has nothing to draw, so skip the plot
        # rather than emit a blank (or failing) heatmap.
        print(f"Skipping correlation plot for {title}: no numeric columns")
        return

    corr_matrix = numeric_df.corr()

    plt.figure(figsize=(12, 10))
    # `fmt` only applies to cell annotations, so `annot` must be enabled for
    # the two-decimal coefficients to appear on the heatmap.
    sns.heatmap(corr_matrix,
                annot=True,
                cmap='RdBu',
                vmin=-1,
                vmax=1,
                center=0,
                fmt='.2f')
    plt.title(title)
    plt.tight_layout()
    save_plot(f"correlation_{title.lower().replace(' ', '_')}")

def plot_distribution(df, column, title=None):
    """Save a two-panel distribution figure for one column.

    The top panel is a histogram with a KDE overlay, the bottom panel a box
    plot of the same column. The figure is saved as
    ``distribution_<column>.png`` with the column label lower-cased and
    spaces replaced by underscores.

    Args:
        df: Dataframe containing the column.
        column: Label of the column to plot.
        title: Optional overall figure title. When ``None`` (the default)
            only the per-panel titles are shown.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

    # Histogram with KDE
    sns.histplot(data=df, x=column, kde=True, ax=ax1)
    ax1.set_title('Distribution')

    # Box plot
    sns.boxplot(data=df, x=column, ax=ax2)
    ax2.set_title('Box Plot')

    # The parameter was previously accepted but never used
    if title is not None:
        fig.suptitle(title)

    plt.tight_layout()
    # str() so non-string column labels (e.g. integers) still yield a file name
    save_plot(f"distribution_{str(column).lower().replace(' ', '_')}")

def plot_categorical_analysis(df, column, title=None):
    """Save a bar chart and a pie chart of a column's value counts.

    The left panel shows the count per distinct value, the right panel the
    same counts as percentage slices. The figure is saved as
    ``categorical_<column>.png`` with the column label lower-cased and
    spaces replaced by underscores.

    Args:
        df: Dataframe containing the column.
        column: Label of the column to analyse.
        title: Optional overall figure title. When ``None`` (the default)
            only the per-panel titles are shown.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Occurrences of each distinct value (percentages are derived by the pie chart)
    value_counts = df[column].value_counts()

    # Bar plot
    sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax1)
    ax1.set_title('Value Counts')
    # Rotate the category labels so long names do not overlap. The result is
    # not stored: the original assigned it to a made-up Axes attribute.
    plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')

    # Pie chart
    ax2.pie(value_counts.values,
            labels=value_counts.index,
            autopct='%1.1f%%',
            startangle=90)
    ax2.set_title('Percentage Distribution')

    # The parameter was previously accepted but never used
    if title is not None:
        fig.suptitle(title)

    plt.tight_layout()
    # str() so non-string column labels (e.g. integers) still yield a file name
    save_plot(f"categorical_{str(column).lower().replace(' ', '_')}")

def plot_time_series(df, date_column, value_column, title=None):
    """Save a line plot of ``value_column`` against ``date_column``.

    The date column is parsed with ``pd.to_datetime`` into a separate series,
    so the caller's dataframe is left unmodified. The figure is saved as
    ``timeseries_<value_column>.png`` with the column label lower-cased and
    spaces replaced by underscores.

    Args:
        df: Dataframe containing both columns.
        date_column: Label of the column holding dates (or date-like values).
        value_column: Label of the column to plot on the y-axis.
        title: Optional plot title. When ``None`` (the default) the title is
            ``"Time Series Analysis: <value_column>"``.
    """
    # Parse into a new series instead of writing back into `df`: the original
    # overwrote the caller's column as a hidden side effect.
    dates = pd.to_datetime(df[date_column])

    plt.figure(figsize=(15, 6))
    # Both series keep their names, so the axis labels match the column labels
    sns.lineplot(x=dates, y=df[value_column])
    if title is None:
        title = f'Time Series Analysis: {value_column}'
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    # str() so non-string column labels (e.g. integers) still yield a file name
    save_plot(f"timeseries_{str(value_column).lower().replace(' ', '_')}")

def identify_outliers(df, column):
    """Find IQR outliers in one column and write a short report to file.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``. The report (count, percentage of rows, and
    ``describe()`` of the outlying values) is saved as
    ``<output_dir>/outliers_<column>.txt`` with the column label lower-cased
    and spaces replaced by underscores.

    Args:
        df: Dataframe containing the column.
        column: Label of the column to examine.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    outliers = values[(values < lower_fence) | (values > upper_fence)]

    # Guard the percentage: an empty column previously raised ZeroDivisionError
    total = len(values)
    outlier_pct = len(outliers) / total * 100 if total else 0.0

    output = [
        f"\nOutliers in {column}:",
        f"Number of outliers: {len(outliers)}",
        f"Percentage of outliers: {outlier_pct:.2f}%",
        "\nOutlier values:",
        str(outliers.describe()),
    ]

    # str() so non-string column labels still yield a file name; explicit
    # UTF-8 so non-ASCII labels do not fail on narrow default encodings.
    report_path = f"{output_dir}/outliers_{str(column).lower().replace(' ', '_')}.txt"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output))

def main():
    """Load the three dataframes and write the overview outputs for each.

    For every dataframe this writes the statistics report, the missing-value
    heatmap, the missing-value percentage chart and the correlation matrix.
    The per-column analyses are currently disabled (kept below as comments).
    """
    frames = load_data()

    for idx, df in enumerate(frames, start=1):
        label = f"DataFrame_{idx}"
        print(f"\nAnalyzing DataFrame {idx}...")

        # Text summary
        generate_basic_stats(df, f"DataFrame_{idx}_Statistics")

        # Missing-value views
        plot_missing_values(df, label)
        plot_missing_values_percentage(df, label)

        # Pairwise correlations of numeric columns
        plot_correlations(df, label)

        # Analyze numerical columns
        # numerical_cols = df.select_dtypes(include=[np.number]).columns
        # for col in numerical_cols:
        #     plot_distribution(df, col)
        #     identify_outliers(df, col)

        # Analyze categorical columns
        # categorical_cols = df.select_dtypes(include=['object']).columns
        # for col in categorical_cols:
        #     plot_categorical_analysis(df, col)

        # # Time series analysis (if applicable)
        # date_cols = df.select_dtypes(include=['datetime64']).columns
        # if len(date_cols) > 0:
        #     for date_col in date_cols:
        #         for num_col in numerical_cols:
        #             plot_time_series(df, date_col, num_col)

    print(f"\nAnalysis complete! All plots and statistics have been saved to: {output_dir}")

# Entry point: run the full analysis when this code executes as the top-level module
if __name__ == "__main__":
    main()

In [None]:
# Show the matplotlib style names that can be passed to plt.style.use
print(plt.style.available)
