# COVID-19 Analysis
## Research Questions-1
1. No. of Cases in Each Country 
2. No. of New Cases Every Day 
3. No. of New Cases Outside China Every Day 
4. No. of New Cases Outside China 
5. No. of Places to Which COVID-19 Spread
6. Time Evolution 
7. China vs. Other Countries
8. Recovery and Mortality Rate Over Time 
9. Proportion of Cases 
10. Confirmed / Recovered / Deaths Cases in Each Country
11. New Cases in Each Country 
12. Map of the Spread of COVID-19 Around the World


## Research Questions-2
1. Growth: the doubling time of COVID-19 cases
2. Symptoms and disease progression of COVID-19
3. How long is the incubation period of COVID-19?
4. Measuring and interpreting the case fatality rate
5. Global case fatality rate of COVID-19
6. Case fatality rate of COVID-19 by age
7. Case fatality rate of COVID-19 by preexisting health conditions
8. Case fatality rate of COVID-19 compared to other diseases


In [1]:
# Data storing and analysis 
import numpy as np 
import pandas as pd 

# Visualizations 
import matplotlib.pyplot as plt 
import matplotlib.dates as mdates
import seaborn as sns 

# Grammar of graphics 
from plotnine import * 
import calmap 

# Interactive Visualizations 
import plotly.express as px 
import folium 

# Date Time 
from datetime import datetime, timedelta

# Color palette: one hex RGB color per case type
cnf = '#393e46' # confirmed - dark grey
dth = '#ff2e63' # deaths - red
rec = '#21bf73' # recovered - green
act = '#fe9801' # active cases - orange

## Data Import 

In [2]:
# Load the cleaned dataset (path is relative to the notebook's working directory)
df = pd.read_csv("data/cleaned_data/covid-19_cleaned_data.csv")

## Data Exploring 

In [3]:
# Preview the first rows to sanity-check the load
df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,Thailand,Thailand,15.0,101.0,2020-01-22,2,0,0
1,Japan,Japan,36.0,138.0,2020-01-22,2,0,0
2,Singapore,Singapore,1.2833,103.8333,2020-01-22,0,0,0
3,Nepal,Nepal,28.1667,84.25,2020-01-22,0,0,0
4,Malaysia,Malaysia,2.5,112.5,2020-01-22,0,0,0


In [4]:
# Size of the frame as (rows, columns)
df.shape 

(29707, 8)

In [5]:
# List the column names
df.columns

Index(['Province/State', 'Country/Region', 'Lat', 'Long', 'Date', 'Confirmed',
       'Deaths', 'Recovered'],
      dtype='object')

In [6]:
# Per-column dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29707 entries, 0 to 29706
Data columns (total 8 columns):
Province/State    29707 non-null object
Country/Region    29707 non-null object
Lat               29707 non-null float64
Long              29707 non-null float64
Date              29707 non-null object
Confirmed         29707 non-null int64
Deaths            29707 non-null int64
Recovered         29707 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 1.8+ MB


In [7]:
# Summary statistics of the numeric columns, cast to int for a compact table
df.describe().astype(int)

Unnamed: 0,Lat,Long,Confirmed,Deaths,Recovered
count,29707,29707,29707,29707,29707
mean,30,-33,174,6,63
std,19,80,2587,118,1400
min,-41,-157,0,0,0
25%,24,-91,0,0,0
50%,37,-73,0,0,0
75%,42,22,1,0,0
max,71,178,67800,5476,59433


## Preprocessing

In [8]:
# Number of missing values in each column
df.isnull().sum()

Province/State    0
Country/Region    0
Lat               0
Long              0
Date              0
Confirmed         0
Deaths            0
Recovered         0
dtype: int64

In [9]:
# Names of the four case-count columns ("Active" is derived just below)
cases = ["Confirmed", "Recovered", "Deaths", "Active"]

# Active = Confirmed - Recovered - Deaths, computed row by row
df["Active"] = df["Confirmed"].sub(df["Recovered"]).sub(df["Deaths"])

# Normalize the country label: "Mainland China" becomes "China"
df["Country/Region"] = df["Country/Region"].replace({"Mainland China": "China"})

In [10]:
# Preview the frame again to confirm the new Active column
df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
0,Thailand,Thailand,15.0,101.0,2020-01-22,2,0,0,2
1,Japan,Japan,36.0,138.0,2020-01-22,2,0,0,2
2,Singapore,Singapore,1.2833,103.8333,2020-01-22,0,0,0,0
3,Nepal,Nepal,28.1667,84.25,2020-01-22,0,0,0,0
4,Malaysia,Malaysia,2.5,112.5,2020-01-22,0,0,0,0


## Derived Datasets 

In [11]:
# World wide cases (every row and column of the dataset)
world = df.iloc[:, :]
# Cases on cruise ships: rows whose province mentions "Grand Princess" or
# whose country mentions "Cruise Ship". Both patterns are matched as plain
# substrings (regex=False); na=False treats a missing name as "no match".
ship = df[
    df["Province/State"].str.contains("Grand Princess", regex=False, na=False)
    | df["Country/Region"].str.contains("Cruise Ship", regex=False, na=False)
]
# Cases in China
china = df[df["Country/Region"] == "China"]
# Every country except China
ex_china = df[df["Country/Region"] != "China"]
# Italy
italy = df[df["Country/Region"] == "Italy"]
# Bangladesh
bd = df[df["Country/Region"] == "Bangladesh"]

In [12]:
# Snapshot of the most recent reporting date.
# reset_index() keeps the original row labels in an "index" column.
df_latest = df[df["Date"] == df["Date"].max()].reset_index()
world_latest = df_latest.iloc[:, :]
china_latest = df_latest[df_latest["Country/Region"] == "China"]
# Everything outside China, mirroring the `ex_china` filter on the full frame
ex_china_latest = df_latest[df_latest["Country/Region"] != "China"]
italy_latest = df_latest[df_latest["Country/Region"] == "Italy"]
bd_latest = df_latest[df_latest["Country/Region"] == "Bangladesh"]

In [25]:
# First and last dates present in the dataset
start, end = df["Date"].min(), df["Date"].max()

print(f"Starting Date of Outbreak: {start}")
print(f"Current Date of Outbreak: {end}")

Starting Date of Outbreak: 2020-01-22
Current Date of Outbreak: 2020-03-22


## Grouping

In [64]:
# Sum the case counts of the latest snapshot per country / per province.
# The columns are selected with a list: indexing a GroupBy with several bare
# keys (a tuple) is rejected by current pandas releases.
case_cols = ["Confirmed", "Recovered", "Deaths", "Active"]

df_latest_grouped = df_latest.groupby("Country/Region")[case_cols].sum().reset_index()
china_latest_grouped = china_latest.groupby("Province/State")[case_cols].sum().reset_index()
ex_china_latest_grouped = ex_china_latest.groupby("Province/State")[case_cols].sum().reset_index()
italy_latest_grouped = italy_latest.groupby("Province/State")[case_cols].sum().reset_index()
bd_latest_grouped = bd_latest.groupby("Province/State")[case_cols].sum().reset_index()

## Country Wise Data 

In [128]:
# Country-wise confirmed report, ordered by ascending confirmed count.
# reset_index is chained onto the sorted frame so the ordering is kept.
confirmed_report = (
    df_latest_grouped
    .sort_values(by="Confirmed", ascending=True)
    .reset_index(drop=True)
)

In [129]:
# Country-wise recovered report, ordered by ascending recovered count.
# reset_index is chained onto the sorted frame so the ordering is kept.
recovered_report = (
    df_latest_grouped
    .sort_values(by="Recovered", ascending=True)
    .reset_index(drop=True)
)

In [130]:
# Country-wise deaths report, ordered by ascending death count.
# reset_index is chained onto the sorted frame so the ordering is kept.
deaths_report = (
    df_latest_grouped
    .sort_values(by="Deaths", ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Country-wise deaths report, ordered by ascending death count.
# NOTE(review): this cell was labelled "no case recovered report" but it
# rebuilds deaths_report; confirm whether a different report was intended.
deaths_report = (
    df_latest_grouped
    .sort_values(by="Deaths", ascending=True)
    .reset_index(drop=True)
)

In [132]:
# Horizontal bar chart of the 25 countries with the most confirmed cases.
# The 25 largest rows are picked first, then re-sorted ascending for plotting.
top_confirmed = (
    confirmed_report
    .sort_values('Confirmed', ascending=False)
    .head(25)
    .sort_values('Confirmed', ascending=True)
)
# Upper x-axis limit: headroom past the largest bar (labels sit outside the bars)
confirmed_x_max = max(confirmed_report['Confirmed']) + 10000

fig = px.bar(
    top_confirmed,
    x="Confirmed",
    y="Country/Region",
    title='Confirmed Cases',
    text='Confirmed',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, confirmed_x_max],
)
fig.update_traces(marker_color='#084177', opacity=0.8, textposition='outside')
fig.show()

In [138]:
# Horizontal bar chart of the 25 countries with the most deaths.
# The 25 largest rows are picked first, then re-sorted ascending for plotting.
top_deaths = (
    deaths_report
    .sort_values('Deaths', ascending=False)
    .head(25)
    .sort_values('Deaths', ascending=True)
)
# Upper x-axis limit scaled to the death counts themselves, leaving ~15%
# headroom for the text labels drawn outside the bars.
deaths_x_max = deaths_report['Deaths'].max() * 1.15

# Bar length and label both show the Deaths column, matching the title.
fig = px.bar(
    top_deaths,
    x="Deaths",
    y="Country/Region",
    title='Deaths Cases',
    text='Deaths',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, deaths_x_max],
)
fig.update_traces(marker_color='#084177', opacity=0.8, textposition='outside')
fig.show()

## Cases Over Time 

In [107]:
# Daily world totals of recovered, deaths and active cases.
# The columns are selected with a list: indexing a GroupBy with several bare
# keys (a tuple) is rejected by current pandas releases.
stacked_cols = ["Recovered", "Deaths", "Active"]
temp = df.groupby("Date")[stacked_cols].sum().reset_index()
# Unpivoting data: one row per (Date, case type)
temp = temp.melt(id_vars="Date", value_vars=stacked_cols, var_name="Cases", value_name="Count")
# Create Graph; the color order follows stacked_cols (recovered, deaths, active)
fig = px.area(temp, x="Date", y="Count", color="Cases",
              title="Cases Over Time", color_discrete_sequence=[rec, dth, act])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

## Recovery and Mortality Rate Over Time 

In [103]:
# Daily world totals; only the columns needed for the ratios are summed.
temp = df.groupby("Date")[["Confirmed", "Recovered", "Deaths"]].sum().reset_index()

recovered_ratio = "No. of Recovered to 100 Confirmed Cases"
deaths_ratio = "No. of Deaths to 100 Confirmed Cases"

# Ratios per 100 confirmed cases, rounded after scaling so the plotted values
# carry one clean decimal instead of floating-point residue.
temp[recovered_ratio] = (temp["Recovered"] / temp["Confirmed"] * 100).round(1)
temp[deaths_ratio] = (temp["Deaths"] / temp["Confirmed"] * 100).round(1)

# Unpivoting: one row per (Date, ratio); the recovered ratio comes first
temp = temp.melt(id_vars="Date", value_vars=[recovered_ratio, deaths_ratio],
                 value_name="Value", var_name="Ratio")

# Create Graph; the color order follows value_vars (recovered, then deaths)
fig = px.line(temp, x="Date", y="Value", color='Ratio',
              title='Recovery and Mortality Rate Over The Time', color_discrete_sequence=[rec, dth])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

In [114]:
# Daily world totals of active and recovered cases (only these two columns are summed)
line_cols = ["Active", "Recovered"]
temp = df.groupby("Date")[line_cols].sum().reset_index()
# Unpivoting: one row per (Date, case type); Active comes first
temp = temp.melt(id_vars="Date", value_vars=line_cols,
                 value_name="Value", var_name="Cases")

# Create Graph; the color order follows line_cols, using the palette's
# active and recovered colors
fig = px.line(temp, x="Date", y="Value", color='Cases',
              title='Active and Recovered Cases', color_discrete_sequence=[act, rec])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

## Top 25 Countries 

In [120]:
# Top 25 countries by confirmed cases on the latest date.
# The bars are built from the per-country totals (df_latest_grouped) rather
# than the raw frame, whose rows are individual province/date records.
top25_countries = (
    df_latest_grouped
    .sort_values('Confirmed', ascending=False)
    .head(25)
    .sort_values('Confirmed', ascending=True)
)
# Upper x-axis limit: headroom past the largest bar (labels sit outside the bars)
top25_x_max = df_latest_grouped['Confirmed'].max() + 10000

fig = px.bar(
    top25_countries,
    x="Confirmed",
    y="Country/Region",
    title='Confirmed Cases',
    text='Confirmed',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, top25_x_max],
)
fig.update_traces(marker_color='#084177', opacity=0.8, textposition='outside')
fig.show()

In [38]:
%%HTML
<style type="text/css">
/* Solid 1px black border and black text on every cell of tables with class "dataframe". */
table.dataframe td,
table.dataframe th {
    border: 1px solid black !important;
    color: black !important;
}
</style>