# In [None]:
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Make the sibling ``script`` directory importable. The location is resolved
# against the current working directory, so this assumes the code is run from
# a directory that sits next to ``script``.
script_path = os.path.abspath(os.path.join(os.getcwd(), '../script'))
# Notebook cells are routinely re-executed; only add the entry when it is
# missing so sys.path does not accumulate duplicates of the same directory.
if script_path not in sys.path:
    sys.path.append(script_path)

# Load the dataset that the analysis calls at the bottom of this file operate
# on. The path is relative, so it is resolved against the current working
# directory at the time this statement runs.
# NOTE(review): the file name suggests this CSV is the output of an earlier
# cleaning/filtering step — confirm against the pipeline that produces it.
df_cleaned = pd.read_csv('../output/data_cleaning/valid_filtered_covid19_data.csv')

def summary_statistics(df):
    """Print a header line followed by ``df.describe()`` to standard output.

    Args:
        df: Object exposing a ``describe()`` method (a pandas DataFrame in
            this file).

    Returns:
        None. The statistics are printed, not returned.
    """
    header = "Summary Statistics:"
    print(header)
    # Computed after the header is emitted so the output order is unchanged
    # even if describe() raises.
    described = df.describe()
    print(described)

def plot_histograms(df, columns, output_dir='../output/eda'):
    """Save one histogram with a KDE overlay for each requested column.

    Each figure is written to ``<output_dir>/<column>_histogram.png``.

    Args:
        df: DataFrame holding the data to plot.
        columns: Iterable of column names in ``df``; one figure is produced
            per name.
        output_dir: Directory that receives the PNG files. It is created if
            it does not exist. Defaults to ``'../output/eda'``, a relative
            path resolved against the current working directory.

    Returns:
        None. The plots are written to disk, not returned or shown.

    Errors from the column lookup, plotting, or saving propagate to the
    caller after the affected figure has been closed.
    """
    # The target directory does not depend on the column, so create it once
    # instead of on every iteration.
    os.makedirs(output_dir, exist_ok=True)

    for column in columns:
        fig = plt.figure(figsize=(10, 6))
        try:
            sns.histplot(df[column], kde=True)
            plt.title(f'Distribution of {column}')
            fig.savefig(os.path.join(output_dir, f'{column}_histogram.png'))
        finally:
            # Close even when plotting or saving fails, so a bad column does
            # not leave an open figure behind.
            plt.close(fig)

def plot_boxplots(df, columns, output_dir='../output/eda'):
    """Save one horizontal boxplot for each requested column.

    Each figure is written to ``<output_dir>/<column>_boxplot.png``.

    Args:
        df: DataFrame holding the data to plot.
        columns: Iterable of column names in ``df``; one figure is produced
            per name.
        output_dir: Directory that receives the PNG files. It is created if
            it does not exist. Defaults to ``'../output/eda'``, a relative
            path resolved against the current working directory.

    Returns:
        None. The plots are written to disk, not returned or shown.

    Errors from the column lookup, plotting, or saving propagate to the
    caller after the affected figure has been closed.
    """
    # The target directory does not depend on the column, so create it once
    # instead of on every iteration.
    os.makedirs(output_dir, exist_ok=True)

    for column in columns:
        fig = plt.figure(figsize=(10, 6))
        try:
            # Passing the series as ``x`` lays the box out along the x-axis.
            sns.boxplot(x=df[column])
            plt.title(f'Boxplot of {column}')
            fig.savefig(os.path.join(output_dir, f'{column}_boxplot.png'))
        finally:
            # Close even when plotting or saving fails, so a bad column does
            # not leave an open figure behind.
            plt.close(fig)

def plot_correlation_heatmap(df, output_dir='../output/eda'):
    """Save an annotated correlation heatmap of the numeric columns of ``df``.

    Only columns selected by ``select_dtypes(include=[np.number])`` take part
    in the correlation. The figure is written to
    ``<output_dir>/correlation_heatmap.png``.

    Args:
        df: DataFrame to analyse.
        output_dir: Directory that receives the PNG file. It is created if it
            does not exist. Defaults to ``'../output/eda'``, a relative path
            resolved against the current working directory.

    Returns:
        None. The plot is written to disk, not returned or shown.

    Errors from computing the correlation, plotting, or saving propagate to
    the caller; no figure is left open when they do.
    """
    # Compute the matrix before opening a figure so that a failure here
    # cannot leave an empty figure behind.
    corr_matrix = df.select_dtypes(include=[np.number]).corr()

    os.makedirs(output_dir, exist_ok=True)

    fig = plt.figure(figsize=(10, 6))
    try:
        # annot/fmt print each coefficient in its cell with two decimals.
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Correlation Heatmap')
        fig.savefig(os.path.join(output_dir, 'correlation_heatmap.png'))
    finally:
        # Close even when plotting or saving fails.
        plt.close(fig)


# Run the exploratory analysis on the loaded dataset: print the descriptive
# statistics first, then write the histogram, boxplot and correlation-heatmap
# images.
summary_statistics(df_cleaned)
plot_histograms(
    df_cleaned,
    columns=['confirmed_cases', 'deaths', 'recoveries'],
)
plot_boxplots(
    df_cleaned,
    columns=['deaths', 'recoveries'],
)
plot_correlation_heatmap(df_cleaned)
