In [1]:
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Make the ``script`` directory (resolved relative to the current working
# directory) importable for the rest of the notebook.
_script_dir_relative = '../script'
script_path = os.path.abspath(os.path.join(os.getcwd(), _script_dir_relative))
sys.path.append(script_path)

# Load the CSV produced by the data-cleaning step.
_cleaned_csv_path = '../output/data_cleaning/valid_filtered_covid19_data.csv'
df_cleaned = pd.read_csv(_cleaned_csv_path)

def summary_statistics(df):
    """Print a header line followed by the output of ``df.describe()``.

    Args:
        df: Object exposing ``describe()`` (a pandas DataFrame in this file).

    Returns:
        None. The statistics are written to standard output only.
    """
    stats_table = df.describe()
    # A single print with a newline separator emits the same two-part output
    # as printing the header and the table separately.
    print("Summary Statistics:", stats_table, sep="\n")

def plot_histograms(df, columns, output_dir='../output/eda'):
    """Save one histogram with a KDE overlay per column.

    Each figure is written to ``<output_dir>/<column>_histogram.png``.

    Args:
        df: DataFrame (or mapping) from which ``df[column]`` is plotted.
        columns: Iterable of column names to plot.
        output_dir: Directory receiving the image files; created if missing.
            Defaults to the location this function originally hard-coded.

    Raises:
        Whatever ``df[column]``, seaborn or matplotlib raise (for example a
        ``KeyError`` for an unknown column). The figure is closed first.
    """
    # Creating the directory is loop-invariant, so do it once up front.
    os.makedirs(output_dir, exist_ok=True)

    for column in columns:
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            sns.histplot(df[column], kde=True, ax=ax)
            ax.set_title(f'Distribution of {column}')
            fig.savefig(os.path.join(output_dir, f'{column}_histogram.png'))
        finally:
            # Always release the figure, even when plotting or saving fails,
            # so failed iterations do not accumulate open figures.
            plt.close(fig)

def plot_boxplots(df, columns, output_dir='../output/eda'):
    """Save one horizontal boxplot per column.

    Each figure is written to ``<output_dir>/<column>_boxplot.png``.

    Args:
        df: DataFrame (or mapping) from which ``df[column]`` is plotted.
        columns: Iterable of column names to plot.
        output_dir: Directory receiving the image files; created if missing.
            Defaults to the location this function originally hard-coded.

    Raises:
        Whatever ``df[column]``, seaborn or matplotlib raise (for example a
        ``KeyError`` for an unknown column). The figure is closed first.
    """
    # Creating the directory is loop-invariant, so do it once up front.
    os.makedirs(output_dir, exist_ok=True)

    for column in columns:
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            sns.boxplot(x=df[column], ax=ax)
            ax.set_title(f'Boxplot of {column}')
            fig.savefig(os.path.join(output_dir, f'{column}_boxplot.png'))
        finally:
            # Always release the figure, even when plotting or saving fails,
            # so failed iterations do not accumulate open figures.
            plt.close(fig)

def plot_correlation_heatmap(df, output_dir='../output/eda'):
    """Save an annotated correlation heatmap of the numeric columns of ``df``.

    Non-numeric columns are dropped via ``select_dtypes`` before computing
    the correlation matrix. The figure is written to
    ``<output_dir>/correlation_heatmap.png``.

    Args:
        df: pandas DataFrame to analyse.
        output_dir: Directory receiving the image file; created if missing.
            Defaults to the location this function originally hard-coded.

    Raises:
        Whatever pandas, seaborn or matplotlib raise while computing or
        drawing the matrix. The figure is closed first.
    """
    df_numeric = df.select_dtypes(include=[np.number])
    corr_matrix = df_numeric.corr()

    os.makedirs(output_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
        ax.set_title('Correlation Heatmap')
        fig.savefig(os.path.join(output_dir, 'correlation_heatmap.png'))
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)


# Columns to visualise; the boxplots cover a subset of the histogram columns.
_histogram_columns = ['confirmed_cases', 'deaths', 'recoveries']
_boxplot_columns = ['deaths', 'recoveries']

# Run the EDA steps in order: printed summary, then the saved figures.
summary_statistics(df_cleaned)
plot_histograms(df_cleaned, _histogram_columns)
plot_boxplots(df_cleaned, _boxplot_columns)
plot_correlation_heatmap(df_cleaned)


Summary Statistics:
       confirmed_cases       deaths   recoveries  total_confirmed  \
count      1922.000000  1922.000000  1922.000000      1922.000000   
mean        511.744017   256.876171   257.959417      5467.572320   
std         281.149122   219.846374   221.077053      2887.798545   
min           2.000000     0.000000     0.000000        22.000000   
25%         272.000000    73.000000    77.000000      3032.000000   
50%         507.500000   197.500000   199.000000      5454.500000   
75%         754.000000   389.750000   388.000000      7935.750000   
max        1000.000000   948.000000   991.000000     10906.000000   

       total_deaths  total_recoveries  
count   1922.000000       1922.000000  
mean    2781.543704       2768.957336  
std     1462.178395       1435.950822  
min       15.000000         23.000000  
25%     1516.500000       1539.500000  
50%     2796.000000       2767.000000  
75%     4012.750000       3978.750000  
max     5751.000000       5827.000000 