In [6]:
import pandas as pd
import numpy as np
import datetime

class CovidDataAnalyzer:
    """Load, clean, filter and summarise a COVID-19 case table stored as CSV.

    Typical flow: construct with the CSV path, call ``load_data()``, then use
    the describe / clean / filter methods, and ``save_filtered_data()`` to
    persist the most recent filter result.

    The methods index these columns by name, so the table must contain them:
    'Confirmed', 'Deaths', 'Recovered', 'Country/Region', 'WHO Region', 'Date'.

    Attributes:
        file_path: Path of the CSV file given to the constructor.
        data: The loaded DataFrame (``None`` until ``load_data()`` is called
            or a frame is assigned).
        filtered_data: Result of the most recent filter call (``None`` until
            a filter has run).
    """

    def __init__(self, file_path):
        """Remember the CSV location; nothing is read until ``load_data()``."""
        # Keep the path separate from the frame so ``data`` never changes type
        # and ``load_data()`` can be called more than once.
        self.file_path = file_path
        self.data = None
        self.filtered_data = None

    def load_data(self):
        """Read the CSV at ``file_path``, store it on ``self.data`` and return it.

        Returning the frame keeps ``analyzer.data = analyzer.load_data()``
        working for existing callers.
        """
        self.data = pd.read_csv(self.file_path)
        return self.data

    def describe_data(self):
        """Print shape, columns, summary statistics and two headline figures."""
        frame = self.data
        print(f'Shape of the data: {frame.shape}\n')
        print(f'The columns of the data are: \n{frame.columns}\n')
        print(f'Basic statistics of the data: \n{frame.describe()}')

        # NOTE: this is the mean over all rows of 'Recovered'; no grouping by
        # region is performed despite the wording of the message.
        mean_recovered = round(frame['Recovered'].mean())
        print(f'Mean number of recovered by region is {mean_recovered}')

        max_deaths = frame['Deaths'].max()
        # Every row that attains the maximum contributes its region.
        regions = frame.loc[frame['Deaths'] == max_deaths, 'WHO Region'].to_string(index=False)
        print(f'The highest number of deaths is {max_deaths} and recorded in {regions}.\n')

    def handle_missing_values(self):
        """Fill missing values in place: 0 for numeric columns, 'Unknown' for text.

        Numeric columns are filled with a number (not a string) so they keep a
        numeric dtype and stay usable in comparisons and sums.
        """
        numeric_cols = self.data.select_dtypes(include=['number']).columns
        self.data[numeric_cols] = self.data[numeric_cols].fillna(0)
        text_cols = self.data.select_dtypes(include=['object']).columns
        self.data[text_cols] = self.data[text_cols].fillna('Unknown')

    def filter_high_cases(self):
        """Store rows with > 100000 confirmed, > 5000 deaths and a known country.

        The result is written to ``self.filtered_data``.
        """
        many_confirmed = self.data['Confirmed'] > 100000
        many_deaths = self.data['Deaths'] > 5000
        known_country = self.data['Country/Region'] != 'Unknown'
        self.filtered_data = self.data[many_confirmed & many_deaths & known_country]

    def filter_by_date_range(self, start_date, end_date):
        """Store rows whose 'Date' falls in the half-open range [start_date, end_date).

        Args:
            start_date: Inclusive lower bound as a 'YYYY-MM-DD' string.
            end_date: Exclusive upper bound as a 'YYYY-MM-DD' string.

        If either bound does not parse, a message is printed and
        ``self.filtered_data`` is left unchanged.
        """
        # Only the parsing is guarded, so an unrelated ValueError raised while
        # filtering is not misreported as a bad date format.
        try:
            start = datetime.datetime.strptime(start_date, '%Y-%m-%d')
            end = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        except ValueError:
            print("Incorrect data format, should be YYYY-MM-DD")
            return

        # Compare real timestamps rather than raw strings: string comparison is
        # only correct for zero-padded ISO text. Unparseable cells (for example
        # the 'Unknown' placeholder) become NaT and are excluded.
        dates = pd.to_datetime(self.data['Date'], errors='coerce')
        self.filtered_data = self.data.loc[(dates >= start) & (dates < end)]

    def calculate_global_statistics(self):
        """Print the column totals of 'Confirmed', 'Deaths' and 'Recovered'."""
        # Sum each column on its own so every total keeps its column's dtype.
        totals = {col: self.data[col].sum() for col in ('Confirmed', 'Deaths', 'Recovered')}
        print(f"Total confirmed cases of covid-19: {totals['Confirmed']}")
        print(f"Total deaths from covid-19: {totals['Deaths']}")
        print(f"Total recovered from covid: {totals['Recovered']}")

    def save_filtered_data(self, filename):
        """Write the most recent filter result to ``filename`` as CSV (index included)."""
        self.filtered_data.to_csv(filename)

# Driver: load the dataset, summarise it, clean it, then export two filtered
# views and print the overall totals.
analyzer = CovidDataAnalyzer('Datasets/covid_19_data.csv')
analyzer.data = analyzer.load_data()

# The summary is printed before missing values are filled.
analyzer.describe_data()
analyzer.handle_missing_values()

# Each save writes whatever the preceding filter call produced.
analyzer.filter_high_cases()
analyzer.save_filtered_data('Datasets/covid_19_high_cases.csv')

analyzer.filter_by_date_range('2020-03-01', '2020-06-30')
analyzer.save_filtered_data('Datasets/covid_19_filter_by_date.csv')

analyzer.calculate_global_statistics()

Shape of the data: (49068, 10)

The columns of the data are: 
Index(['Province/State', 'Country/Region', 'Lat', 'Long', 'Date', 'Confirmed',
       'Deaths', 'Recovered', 'Active', 'WHO Region'],
      dtype='object')

Basic statistics of the data: 
                Lat          Long     Confirmed         Deaths     Recovered  \
count  49068.000000  49068.000000  4.906800e+04   49068.000000  4.906800e+04   
mean      21.433730     23.528236  1.688490e+04     884.179160  7.915713e+03   
std       24.950320     70.442740  1.273002e+05    6313.584411  5.480092e+04   
min      -51.796300   -135.000000  0.000000e+00       0.000000  0.000000e+00   
25%        7.873054    -15.310100  4.000000e+00       0.000000  0.000000e+00   
50%       23.634500     21.745300  1.680000e+02       2.000000  2.900000e+01   
75%       41.204380     80.771797  1.518250e+03      30.000000  6.660000e+02   
max       71.706900    178.065000  4.290259e+06  148011.000000  1.846641e+06   

             Active  
count  