In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Importing and viewing the dataseet

In [20]:
# open up dataset using panda and view it 
df = pd.read_csv('covid_data.csv')
df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active,WHO Region
0,,Afghanistan,33.93911,67.709953,2020-01-22,0,0,0,0,Eastern Mediterranean
1,,Albania,41.1533,20.1683,2020-01-22,0,0,0,0,Europe
2,,Algeria,28.0339,1.6596,2020-01-22,0,0,0,0,Africa
3,,Andorra,42.5063,1.5218,2020-01-22,0,0,0,0,Europe
4,,Angola,-11.2027,17.8739,2020-01-22,0,0,0,0,Africa


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49068 entries, 0 to 49067
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  14664 non-null  object 
 1   Country/Region  49068 non-null  object 
 2   Lat             49068 non-null  float64
 3   Long            49068 non-null  float64
 4   Date            49068 non-null  object 
 5   Confirmed       49068 non-null  int64  
 6   Deaths          49068 non-null  int64  
 7   Recovered       49068 non-null  int64  
 8   Active          49068 non-null  int64  
 9   WHO Region      49068 non-null  object 
dtypes: float64(2), int64(4), object(4)
memory usage: 3.7+ MB


### Cleaning the dataset

#### Handling duplicate values

In [23]:
boolean = False  # Tracks if any duplicates exist across columns
for col in df.columns:
    is_duplicate = df.duplicated(subset=[col]).any()  # Check for duplicates in the current column
    print(f"{col} : {is_duplicate}")  # Print the result for the current column
    if is_duplicate:
        boolean = True  # Update the boolean if any duplicates are found

if not boolean:  # If boolean is still False, no duplicates were found
    print("There are no duplicate entries in the dataset.")
else:
    print("Duplicates were found in the dataset.")


Province/State : True
Country/Region : True
Lat : True
Long : True
Date : True
Confirmed : True
Deaths : True
Recovered : True
Active : True
WHO Region : True
Duplicates were found in the dataset.


In [27]:
df['Country/Region'].nunique()

187

In [29]:
df['WHO Region'].unique()

array(['Eastern Mediterranean', 'Europe', 'Africa', 'Americas',
       'Western Pacific', 'South-East Asia'], dtype=object)

#### Handling missing data

In [31]:
# df = data.copy()
missing_values = df.isnull().sum()
total_missing_values = df.drop('Province/State', axis=1).isnull().sum().sum()
print(f'Missing values per column:\n{missing_values}')
print('Total missing data:', total_missing_values)
missing_val_percent=(total_missing_values/df.shape[0])*100
print(f'total percentage of missing value is {missing_val_percent:.2f}%')

Missing values per column:
Province/State    34404
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed             0
Deaths                0
Recovered             0
Active                0
WHO Region            0
dtype: int64
Total missing data: 0
total percentage of missing value is 0.00%


In [7]:
# Categorical columns
cat_col = [col for col in df.columns if df[col].dtype == 'object']
# Numerical columns
num_col = [col for col in df.columns if df[col].dtype != 'object']

print('Categorical columns :',cat_col)
print('Numerical columns :',num_col)

Categorical columns : ['Province/State', 'Country/Region', 'Date', 'WHO Region']
Numerical columns : ['Lat', 'Long', 'Confirmed', 'Deaths', 'Recovered', 'Active']


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# OneHotEncode multi-category columns without changing column names
multi_cat_cols = [col for col in cat_col if df[col].nunique() > 2]
if multi_cat_cols:
    onehot = OneHotEncoder(sparse_output=False)
    onehot_encoded = onehot.fit_transform(df[multi_cat_cols])
    
    # Retrieve new column names for encoded categories
    onehot_columns = onehot.get_feature_names_out(multi_cat_cols)
    # Combine the rest of the data with encoded categories
    df = pd.concat(
        [pd.DataFrame(onehot_encoded, columns=onehot_columns, index=df.index), 
         df.drop(columns=multi_cat_cols)], axis=1
    )
    

In [15]:
df.columns

Index(['Province/State_Alberta', 'Province/State_Anguilla',
       'Province/State_Anhui', 'Province/State_Aruba',
       'Province/State_Australian Capital Territory', 'Province/State_Beijing',
       'Province/State_Bermuda', 'Province/State_British Columbia',
       'Province/State_British Virgin Islands',
       'Province/State_Cayman Islands',
       ...
       'WHO Region_Eastern Mediterranean', 'WHO Region_Europe',
       'WHO Region_South-East Asia', 'WHO Region_Western Pacific', 'Lat',
       'Long', 'Confirmed', 'Deaths', 'Recovered', 'Active'],
      dtype='object', length=466)