# Capstone Project Covid-19

## Problem Statement:
Given data about COVID-19 patients, write code to visualize the impact of the
pandemic, analyze the trends in the rates of infection and recovery, and
predict the number of cases expected one week into the future based on the
current trends.

## Dataset:
CSV and Excel files containing the numbers of confirmed COVID-19 cases,
deaths, and recovered patients, both around the world and in India. Download Link

## Guidelines:

● Use pandas to accumulate data from multiple data files.

● Use plotly (visualization library) to create interactive visualizations.

● Use Facebook prophet library to make time series models.

● Visualize the prediction by combining these technologies.

In [None]:
# Time series forecasting can be done with various models,
# such as AR, MA, ARIMA, SARIMAX, and others.
# In this notebook, I will focus on the Facebook Prophet model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from datetime import timedelta
from prophet import Prophet

import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv(r"C:\Users\ABHI RAI\Desktop\Project_30_Dec\Capstone Project (Covid 19)\covid_19_clean_complete.csv")
df.head()

In [None]:
df.shape

In [None]:
#Renaming the columns
df.rename(columns={"Date":"date",
                   "Province/State":"state",
                   "Country/Region":"country",
                   "Lat":"lat","Long":"long",
                   "Confirmed":"confirmed",
                   "Deaths":"deaths",
                   "Recovered":"recovered",
                   "WHO Region":"who",
                   "Active":"active"
                   },inplace = True)

In [None]:
df.head()

In [None]:
df["date"].max()

In [None]:
#Records from the last date
df[df["date"] == df["date"].max()]

In [None]:
top = df[df["date"] == df["date"].max()]

In [None]:
# Per-country totals of each case metric on the latest date.
case_columns = ["confirmed", "active", "deaths", "recovered"]
world = top.groupby("country")[case_columns].sum()
world = world.reset_index()
world.head()

In [None]:
#Total confirmed cases in all countries together 
total_cases = df.groupby("date")["confirmed"].sum().reset_index()
total_cases.head()

In [None]:
total_cases["date"] = pd.to_datetime(total_cases["date"])

In [None]:
# Trend of COVID-19: total confirmed cases per date.

plt.figure(figsize=(40, 10))
ax = sns.pointplot(
    x=total_cases["date"].dt.date,
    y=total_cases["confirmed"],
    color="r",
)

# Rotate and size the tick labels so the dates stay readable.
plt.xticks(rotation=90, fontsize=10)
plt.yticks(rotation=90, fontsize=15)

# Axis titles (set on the axes returned by the plot call).
ax.set_xlabel("Dates", fontsize=30)
ax.set_ylabel("Total Cases", fontsize=30)

In [None]:
# Top 20 countries with the highest number of active cases on the latest date (27th July 2020)

# Sum active cases per country, then keep the 20 largest totals.
active_by_country = top.groupby(by="country")["active"].sum()
top_actives = active_by_country.sort_values(ascending=False).head(20).reset_index()
top_actives

In [None]:
# Horizontal bar chart of the 20 countries in `top_actives`.
# The statement order is kept as in the original cell: fonts and title are
# configured on the fresh figure before the bars are drawn.
plt.figure(figsize=(15, 10))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Total Cases", fontsize=30)
plt.ylabel("Country", fontsize=30)
plt.title("Top 20 Countries having most Active Cases", fontsize=30)

ax = sns.barplot(x=top_actives["active"], y=top_actives["country"])
# Re-apply the label text on the axes the bar plot returned.
ax.set_xlabel("Total Cases")
ax.set_ylabel("Country")

In [None]:
# Top 20 countries with the highest number of deaths on the latest date (27th July 2020)

# Sum deaths per country, then keep the 20 largest totals.
deaths_by_country = top.groupby(by="country")["deaths"].sum()
top_deaths = deaths_by_country.sort_values(ascending=False).head(20).reset_index()
top_deaths

In [None]:
# Horizontal bar chart of the 20 countries in `top_deaths`.
# The statement order is kept as in the original cell: fonts and title are
# configured on the fresh figure before the bars are drawn.
plt.figure(figsize=(15, 10))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Total Cases", fontsize=30)
plt.ylabel("Country", fontsize=30)
plt.title("Top 20 Countries having most Death Cases", fontsize=30)
ax = sns.barplot(x=top_deaths["deaths"], y=top_deaths["country"])
# Re-apply the label text on the axes the bar plot returned.
ax.set_xlabel("Total Cases")
ax.set_ylabel("Country")

In [None]:
# Same ranking for recovered cases:
# sum recoveries per country on the latest date and keep the 20 largest totals.
recovered_by_country = top.groupby(by="country")["recovered"].sum()
recovery = recovered_by_country.sort_values(ascending=False).head(20).reset_index()
recovery

In [None]:
# Horizontal bar chart of the 20 countries in `recovery`.
# The statement order is kept as in the original cell: fonts and title are
# configured on the fresh figure before the bars are drawn.
plt.figure(figsize=(15, 10))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Total Cases", fontsize=30)
plt.ylabel("Country", fontsize=30)
plt.title("Top 20 Countries having most Recovered Cases", fontsize=30)
ax = sns.barplot(x=recovery["recovered"], y=recovery["country"])
# Re-apply the label text on the axes the bar plot returned.
ax.set_xlabel("Total Cases")
ax.set_ylabel("Country")

In [None]:
# US data: per-date totals of each metric over all rows whose country is "US".
us_rows = df.loc[df["country"] == "US"]
us = us_rows.groupby(by="date")[["recovered", "deaths", "confirmed", "active"]].sum()
us = us.reset_index()
us

In [None]:
# # US data
# us = df[df.country == "US"]
# us = us.groupby(by ="date")[["recovered","deaths","confirmed","active"]].sum().reset_index()
# us = us.iloc[33:].reset_index().drop("index",axis=1)
# us

In [None]:
# Brazil: per-date totals of each metric.
brazil_rows = df.loc[df["country"] == "Brazil"]
brazil = brazil_rows.groupby(by="date")[["recovered", "deaths", "confirmed", "active"]].sum()
brazil = brazil.reset_index()
brazil.head(50)

In [None]:
# United Kingdom: per-date totals of each metric.
uk_rows = df.loc[df["country"] == "United Kingdom"]
uk = uk_rows.groupby(by="date")[["recovered", "deaths", "confirmed", "active"]].sum()
uk = uk.reset_index()
uk

In [None]:
# India

india = df[df.country == "India"]
india= india.groupby(by ="date")[["recovered","deaths","confirmed","active"]].sum().reset_index()
india= india.iloc[9:].reset_index().drop("index",axis=1)
india

In [None]:
df["country"].unique()

In [None]:
# China: per-date totals of each metric.

China = df[df.country == "China"]
# Aggregate the China rows selected above. The original grouped the `india`
# frame here by mistake, so `China` ended up holding India's numbers.
China = China.groupby(by="date")[["recovered", "deaths", "confirmed", "active"]].sum().reset_index()
China

In [None]:
# Russia: per-date totals of each metric.

russia_rows = df.loc[df["country"] == 'Russia']
Russia = russia_rows.groupby(by='date')[['recovered', 'deaths', 'confirmed', 'active']].sum().reset_index()
# Skip the first 10 dates and renumber the remaining rows from 0.
# NOTE(review): presumably those dates precede Russia's first case - confirm.
Russia = Russia.iloc[10:].reset_index(drop=True)
Russia

In [None]:
# Confirmed cases trend in top 5 countries
plt.figure(figsize=(40, 10))
# One line per country; x is the row position within that country's frame.
for country_frame, line_colour in (
    (brazil, "Blue"),
    (us, "Pink"),
    (Russia, "Green"),
    (uk, "black"),
    (india, "Red"),
):
    plt.plot(country_frame.index, country_frame["confirmed"], color=line_colour)
plt.xlabel('No.of days', fontsize=20)
plt.ylabel('Confirmed cases', fontsize=20)
plt.title('Confirmed cases over time (Top 5 Countries)', fontsize=30)
plt.show()

# The US has the highest number of confirmed cases, and it's experiencing the most rapid increase in new cases.

In [None]:
# Death cases trend in top 5 countries
plt.figure(figsize=(40, 10))
# Plot each country's `deaths` column. The original cell plotted `confirmed`
# here, so this chart was a copy of the confirmed-cases chart under a
# "Deaths" title.
plt.plot(brazil.index, brazil.deaths, color="Blue")
plt.plot(us.index, us.deaths, color="Pink")
plt.plot(Russia.index, Russia.deaths, color="Green")
plt.plot(uk.index, uk.deaths, color="black")
plt.plot(india.index, india.deaths, color="Red")
plt.xlabel('No.of days', fontsize=20)
plt.ylabel('Death cases', fontsize=20)
plt.title('Deaths cases over time (Top 5 Countries)', fontsize=30)
plt.show()

In [None]:
# Active cases trend in top 5 countries
plt.figure(figsize=(40, 10))
# Plot each country's `active` column. The original cell plotted `confirmed`
# here, so this chart was a copy of the confirmed-cases chart under an
# "Active" title.
plt.plot(brazil.index, brazil.active, color="Blue")
plt.plot(us.index, us.active, color="Pink")
plt.plot(Russia.index, Russia.active, color="Green")
plt.plot(uk.index, uk.active, color="black")
plt.plot(india.index, india.active, color="Red")
plt.xlabel('No.of days', fontsize=20)
plt.ylabel('Active cases', fontsize=20)
plt.title('Active cases over time (Top 5 Countries)', fontsize=30)
plt.show()

In [None]:
# Recovered cases trend in top 5 countries
plt.figure(figsize=(40, 10))
# Plot each country's `recovered` column. The original cell plotted
# `confirmed` here, so this chart was a copy of the confirmed-cases chart
# under a "Recovered" title.
plt.plot(brazil.index, brazil.recovered, color="Blue")
plt.plot(us.index, us.recovered, color="Pink")
plt.plot(Russia.index, Russia.recovered, color="Green")
plt.plot(uk.index, uk.recovered, color="black")
plt.plot(india.index, india.recovered, color="Red")
plt.xlabel('No.of days', fontsize=20)
plt.ylabel('Recovered cases', fontsize=20)
plt.title('Recovered cases over time (Top 5 Countries)', fontsize=30)
plt.show()

In [None]:
# Fresh copy of the raw CSV (original column names) for the trend analysis.
raw_csv_path = r"C:\Users\ABHI RAI\Desktop\Project_30_Dec\Capstone Project (Covid 19)\covid_19_clean_complete.csv"
data = pd.read_csv(raw_csv_path)
data.head()

In [None]:
# Convert 'date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'])

In [None]:
# Sort data by date
data = data.sort_values(by='Date')

In [None]:
# Calculate daily new infections and recoveries.
# diff() is the change from the previous row; the first row has no
# predecessor, so its NaN is replaced with 0.
# NOTE(review): this is a true daily change only if `data` holds one row per
# date in chronological order - confirm.
data['new_infections'] = data['Confirmed'].diff().fillna(0)
# Recoveries must be derived from 'Recovered'; the original diffed
# 'Confirmed' again, making this column a duplicate of new_infections.
data['new_recoveries'] = data['Recovered'].diff().fillna(0)

In [None]:
# Line chart of the cumulative 'Confirmed' and 'Recovered' columns over time.
cumulative_columns = ['Confirmed', 'Recovered']
cumulative_labels = {'value': 'Number of Cases', 'date': 'Date'}
fig1 = px.line(
    data,
    x='Date',
    y=cumulative_columns,
    labels=cumulative_labels,
    title='Cumulative COVID-19 Cases',
)
fig1.update_layout(legend_title_text='Metric')
fig1.show()

In [None]:
# Plot daily cases: grouped bars of the per-row deltas computed earlier.
# The original plotted the cumulative 'Confirmed'/'Recovered' columns under
# a "Daily New ..." title and never used the new_* columns.
fig2 = px.bar(
    data,
    x='Date',
    y=['new_infections', 'new_recoveries'],
    # The key must match the x column name ('Date'); the original used 'date',
    # which matched no column.
    labels={'value': 'Number of Cases', 'Date': 'Date'},
    title='Daily New Cases and Recoveries',
)
fig2.update_layout(legend_title_text='Metric', barmode='group')
fig2.show()

In [None]:
# Trend analysis and prediction: prepare the inputs for linear regression.
# Feature: whole days elapsed since the earliest date in the data.
first_date = data['Date'].min()
data['days_since_start'] = (data['Date'] - first_date).dt.days
X = data[['days_since_start']]  # 2-D (DataFrame) feature matrix
# Targets: one series per quantity being modelled.
y_confirmed, y_recovered = data['Confirmed'], data['Recovered']

In [None]:
# Fit one straight-line model per target against days_since_start.
model_confirmed = LinearRegression().fit(X, y_confirmed)
model_recovered = LinearRegression().fit(X, y_recovered)
model_recovered

In [None]:
# Predict the number of cases for the 7 days after the last date in the data.
last_date = data['Date'].max()
future_dates = pd.date_range(start=last_date + timedelta(days=1), periods=7)
# Express the future dates in the same unit as the training feature.
future_days_since_start = (future_dates - data['Date'].min()).days

# predict() takes a 2-D array: one row per future day, one feature column.
future_features = future_days_since_start.values.reshape(-1, 1)
future_confirmed = model_confirmed.predict(future_features)
future_recovered = model_recovered.predict(future_features)

In [None]:
# Collect the future dates and both sets of predicted values in one table.
predictions = pd.DataFrame({'date': future_dates}).assign(
    predicted_confirmed=future_confirmed,
    predicted_recovered=future_recovered,
)

In [None]:
print(data.columns)

In [None]:
data.columns = data.columns.str.strip()

In [None]:
# Inspect column names and the first few rows of the dataset
print("Column Names:", data.columns)
print(data.head())

In [None]:
data['date'] = pd.to_datetime(data['Date'])

In [None]:
# Actual vs predicted: solid lines for the data, dotted lines for predictions.
fig3 = go.Figure()
trace_specs = [
    (data, 'Confirmed', 'Actual Confirmed Cases', {}),
    (predictions, 'predicted_confirmed', 'Predicted Confirmed Cases', {'line': dict(dash='dot')}),
    (data, 'Recovered', 'Actual Recovered Cases', {}),
    (predictions, 'predicted_recovered', 'Predicted Recovered Cases', {'line': dict(dash='dot')}),
]
for source_frame, value_column, trace_name, extra_style in trace_specs:
    fig3.add_trace(
        go.Scatter(
            x=source_frame['date'],
            y=source_frame[value_column],
            mode='lines',
            name=trace_name,
            **extra_style,
        )
    )

fig3.update_layout(
    title='COVID-19 Cases: Actual vs Predicted',
    xaxis_title='Date',
    yaxis_title='Number of Cases',
)
fig3.show()

In [None]:
fbp = pd.read_csv(r"C:\Users\ABHI RAI\Desktop\Project_30_Dec\Capstone Project (Covid 19)\covid_19_clean_complete.csv")
fbp.head()

In [None]:
# To check how many total active cases are there

fbp["Active"].sum()

In [None]:
fbp['Confirmed'].sum()

In [None]:
confirmed = fbp.groupby("Date").sum()["Confirmed"].reset_index()
confirmed

In [None]:
confirmed.sum()

In [None]:
# Total deaths per date.
# Select the column before aggregating: the original summed every column of
# the frame (text columns included) and then threw all but one away.
deaths = fbp.groupby("Date")["Deaths"].sum().reset_index()
deaths

In [None]:
# Total recovered cases per date.
# Select the column before aggregating: the original summed every column of
# the frame (text columns included) and then threw all but one away.
recovered = fbp.groupby("Date")["Recovered"].sum().reset_index()
recovered

In [None]:
# For building a forecasting model using FB prophet library
# there should be only 2 columns passed
# the column names should always be ---> 'ds','y'

confirmed.columns = ["ds", "y"]  #ds ---> date stamp, y ----> target
confirmed["ds"] = pd.to_datetime(confirmed['ds'])
confirmed.tail()

In [None]:
deaths.columns=['ds','y']
deaths['ds']=pd.to_datetime(deaths['ds'])
deaths.tail()

In [None]:
recovered.columns=['ds','y']
recovered['ds']=pd.to_datetime(recovered['ds'])
recovered.tail()

## Forecasting confirmed cases:

In [None]:
# Same create-then-fit pattern as scikit-learn, e.g.:
#   lr = LinearRegression()
#   lr.fit(...)

# Create the Prophet model and train it on the confirmed-cases series.
m = Prophet().fit(confirmed)
m

In [None]:
future = m.make_future_dataframe(periods = 30) #this will forecast the data for next 30 days
future

In [None]:
future.tail(10)

In [None]:
forecast = m.predict(future)
forecast[["ds","yhat","yhat_lower","yhat_upper"]].tail()

In [None]:
# Plot the forecast produced by m.predict and keep the returned object.
confirmed_forecasting_plot=m.plot(forecast)

In [None]:
# Plot the forecast's individual components and keep the returned object.
confirmed_forecasting_plot1=m.plot_components(forecast)

# Thank You