# Libraries

In [1]:
# import libraries
# ================

# for date and time opeations
from datetime import datetime
# for file and folder operations
import os
# for regular expression opeations
import re
# for listing files in a folder
import glob
# for getting web contents
import requests 
# storing and analysing data
import pandas as pd
# for scraping web contents
from bs4 import BeautifulSoup
import re

# Web Scrapping

In [2]:
# get data
# ========

# link at which web data recides
link = 'https://www.mohfw.gov.in/'
# get web data
req = requests.get(link)
# parse web data
soup = BeautifulSoup(req.content, "html.parser")

In [3]:
# find the table
# ==============
# our target table is the last table in the page

# get the table head
# table head may contain the column names, titles, subtitles
thead = soup.find_all('thead')[-1]
# print(thead)

# get all the rows in table head
# it usually have only one row, which has the column names
head = thead.find_all('tr')
# print(head)

# get the table tbody
# it contains the contents
tbody = soup.find_all('tbody')[-1]
# print(tbody)

# get all the rows in table body
# each row is each state's entry
body = tbody.find_all('tr')
# print(body)

In [4]:
# get the table contents
# ======================

# container for header rows / column title
head_rows = []
# container for table body / contents
body_rows = []

# loop through the head and append each row to head
for tr in head:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    head_rows.append(row)
# print(head_rows)

# loop through the body and append each row to body
for tr in body:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    body_rows.append(row)
# print(head_rows)

In [5]:
# save contents in a dataframe
# ============================
    
# skip last 3 rows, it contains unwanted info
# head_rows contains column title
df_bs = pd.DataFrame(body_rows[:len(body_rows)-6], 
                     columns=head_rows[0])         

# Drop 'S. No.' column
df_bs.drop('S. No.', axis=1, inplace=True)

# there are 36 states+UT in India
df_bs.head(36)

Unnamed: 0,Name of State / UT,Active Cases*,Cured/Discharged/Migrated*,Deaths**,Total Confirmed cases*
0,Andaman and Nicobar Islands,0,33,0,33
1,Andhra Pradesh,1951,2682,75,4708
2,Arunachal Pradesh,50,1,0,51
3,Assam,1946,615,4,2565
4,Bihar,2578,2480,30,5088
5,Chandigarh,36,273,5,314
6,Chhattisgarh,786,283,4,1073
7,Dadar Nagar Haveli,18,2,0,20
8,Delhi,17125,10999,812,28936
9,Goa,235,65,0,300


# Data Cleaning

In [6]:
# date-time information
# =====================

# today's date
now  = datetime.now()
# format date to month-day-year
df_bs['Date'] = now.strftime("%m/%d/%Y") 

# add 'Date' column to dataframe
df_bs['Date'] = pd.to_datetime(df_bs['Date'], format='%m/%d/%Y')

# df_bs.head(36)

In [7]:
# remove extra characters from 'Name of State/UT' column
df_bs['Name of State / UT'] = df_bs['Name of State / UT'].str.replace('#', '')

In [8]:
# latitude and longitude information
# ==================================

# latitude of the states
lat = {'Delhi':28.7041, 'Haryana':29.0588, 'Kerala':10.8505, 'Rajasthan':27.0238,
       'Telengana':18.1124, 'Uttar Pradesh':26.8467, 'Ladakh':34.2996, 'Tamil Nadu':11.1271,
       'Jammu and Kashmir':33.7782, 'Punjab':31.1471, 'Karnataka':15.3173, 'Maharashtra':19.7515,
       'Andhra Pradesh':15.9129, 'Odisha':20.9517, 'Uttarakhand':30.0668, 'West Bengal':22.9868, 
       'Puducherry': 11.9416, 'Chandigarh': 30.7333, 'Chhattisgarh':21.2787, 'Gujarat': 22.2587, 
       'Himachal Pradesh': 31.1048, 'Madhya Pradesh': 22.9734, 'Bihar': 25.0961, 'Manipur':24.6637, 
       'Mizoram':23.1645, 'Goa': 15.2993, 'Andaman and Nicobar Islands': 11.7401, 'Assam' : 26.2006, 
       'Jharkhand': 23.6102, 'Arunachal Pradesh': 28.2180, 'Tripura': 23.9408, 'Nagaland': 26.1584, 
       'Meghalaya' : 25.4670, 'Dadar Nagar Haveli' : 20.1809, 'Sikkim':27.5330}

# longitude of the states
long = {'Delhi':77.1025, 'Haryana':76.0856, 'Kerala':76.2711, 'Rajasthan':74.2179,
        'Telengana':79.0193, 'Uttar Pradesh':80.9462, 'Ladakh':78.2932, 'Tamil Nadu':78.6569,
        'Jammu and Kashmir':76.5762, 'Punjab':75.3412, 'Karnataka':75.7139, 'Maharashtra':75.7139,
        'Andhra Pradesh':79.7400, 'Odisha':85.0985, 'Uttarakhand':79.0193, 'West Bengal':87.8550, 
        'Puducherry': 79.8083, 'Chandigarh': 76.7794, 'Chhattisgarh':81.8661, 'Gujarat': 71.1924, 
        'Himachal Pradesh': 77.1734, 'Madhya Pradesh': 78.6569, 'Bihar': 85.3131, 'Manipur':93.9063, 
        'Mizoram':92.9376, 'Goa': 74.1240, 'Andaman and Nicobar Islands': 92.6586, 'Assam' : 92.9376, 
        'Jharkhand': 85.2799, 'Arunachal Pradesh': 94.7278, 'Tripura': 91.9882, 'Nagaland': 94.5624,
        'Meghalaya' : 91.3662, 'Dadar Nagar Haveli' : 73.0169, 'Sikkim':88.5122}

# add latitude column based on 'Name of State / UT' column
df_bs['Latitude'] = df_bs['Name of State / UT'].map(lat)

# add longitude column based on 'Name of State / UT' column
df_bs['Longitude'] = df_bs['Name of State / UT'].map(long)

# df_bs.head(36)

In [9]:
# unique state names
df_bs['Name of State / UT'].unique()

array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadar Nagar Haveli', 'Delhi', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand',
       'Karnataka', 'Kerala', 'Ladakh', 'Madhya Pradesh', 'Maharashtra',
       'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha',
       'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu',
       'Telengana', 'Tripura', 'Uttarakhand', 'Uttar Pradesh',
       'West Bengal'], dtype=object)

In [10]:
# number of missing values 
df_bs.isna().sum()

Name of State / UT            0
Active Cases*                 0
Cured/Discharged/Migrated*    0
Deaths**                      0
Total Confirmed cases*        0
Date                          0
Latitude                      0
Longitude                     0
dtype: int64

In [11]:
# number of unique values 
df_bs.nunique()

Name of State / UT            35
Active Cases*                 35
Cured/Discharged/Migrated*    34
Deaths**                      23
Total Confirmed cases*        35
Date                           1
Latitude                      35
Longitude                     31
dtype: int64

# Saving data

In [12]:
# saving data
# ===========

# file names as year-month-day.csv format
file_name = now.strftime("%Y_%m_%d")+'.csv'

# location for saving the file
file_loc = 'C:\\Users\\imdevskp\\Documents\\github\\covid_india\\.day_by_day_data\\'

# save file as a scv file
df_bs.to_csv(file_loc + file_name, index=False)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Active Cases*,Cured/Discharged/Migrated*,Deaths**,Total Confirmed cases*,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,0,33,0,33,2020-06-08,11.7401,92.6586
1,Andhra Pradesh,1951,2682,75,4708,2020-06-08,15.9129,79.74
2,Arunachal Pradesh,50,1,0,51,2020-06-08,28.218,94.7278
3,Assam,1946,615,4,2565,2020-06-08,26.2006,92.9376
4,Bihar,2578,2480,30,5088,2020-06-08,25.0961,85.3131
5,Chandigarh,36,273,5,314,2020-06-08,30.7333,76.7794
6,Chhattisgarh,786,283,4,1073,2020-06-08,21.2787,81.8661
7,Dadar Nagar Haveli,18,2,0,20,2020-06-08,20.1809,73.0169
8,Delhi,17125,10999,812,28936,2020-06-08,28.7041,77.1025
9,Goa,235,65,0,300,2020-06-08,15.2993,74.124


In [13]:
# column names 
# df_bs.columns

# Combining data

In [14]:
# list of all files available
# ! ls C:\Users\imdevskp\Documents\github\covid_india\.day_by_day_data

In [15]:
# location of the file
loc = "C:\\Users\\imdevskp\\Documents\\github\\covid_india\\.day_by_day_data\\"

# list of all files
files = glob.glob(loc+'2020*.csv')
   
# container for each day's data's dataframe
dfs = []

# loop through the files and append to the dfs list
for i in files:
    # read data
    df_temp = pd.read_csv(i)
    
    # rename columns
    
    try:
        df_temp = df_temp.drop(['Total Confirmed cases (Indian National)', 
                                'Total Confirmed cases ( Foreign National )'], axis=1)
    except:
        pass
        
    d = {'^Cured.*': 'Cured/Discharged/Migrated', 
         'Total Confirmed cases.*': 'Total Confirmed cases', 
         'Death.*': 'Death'}
    
    df_temp.columns = df_temp.columns.to_series().replace(d, regex=True)


#     df_temp = df_temp.rename(columns={'Cured':'Cured/Discharged'})
#     df_temp = df_temp.rename(columns={'Cured/Discharged':'Cured/Discharged/Migrated', 
#                                       'Total Confirmed cases *': 'Total Confirmed cases', 
#                                       'Total Confirmed cases ': 'Total Confirmed cases', 
#                                       'Total Confirmed cases* ': 'Total Confirmed cases'})
#     df_temp = df_temp.rename(columns=lambda x: re.sub('Total Confirmed cases \(Including .* foreign Nationals\) ',
#                                                       'Total Confirmed cases',x))
#     df_temp = df_temp.rename(columns=lambda x: re.sub("Death.*", "Death", x))

    
    # append to the df_s
    dfs.append(df_temp)
    
# print(dfs)

# concat dataframes
complete_data = pd.concat(dfs, ignore_index=True).sort_values(['Date'], ascending=True).reset_index(drop=True)


complete_data['Death'] = complete_data['Death'].astype('str').str.extract('(\d+)').astype('int')

complete_data.sample(10)

Unnamed: 0,Date,Name of State / UT,Cured/Discharged/Migrated,Latitude,Longitude,Death,Total Confirmed cases,Active Cases*
2709,5/4/2020,Ladakh,17,34.2996,78.2932,0,41,
2767,6/1/2020,Uttar Pradesh,4709,26.8467,80.9462,213,7823,3015.0
927,2020-05-01,Haryana,209,29.0588,76.0856,3,313,
1175,2020-05-12,Odisha,85,20.9517,85.0985,3,414,
1933,3/23/2020,Madhya Pradesh,0,22.9734,78.6569,0,6,
869,2020-04-29,Jharkhand,17,23.6102,85.2799,3,103,
392,2020-04-12,Tamil Nadu,44,11.1271,78.6569,10,969,
255,2020-04-07,Chhattisgarh,8,21.2787,81.8661,0,10,
352,2020-04-10,Andaman and Nicobar Islands,0,11.7401,92.6586,0,11,
2292,5/18/2020,West Bengal,959,22.9868,87.855,238,2677,


In [16]:
# complete_data.columns

## Preprocessing

In [17]:
# fix datatype
complete_data['Date'] = pd.to_datetime(complete_data['Date'])

# sort rows
complete_data = complete_data.sort_values(['Date', 'Name of State / UT']).reset_index(drop=True)

# fill missing values with 0
cols = ['Cured/Discharged/Migrated', 'Death']
complete_data[cols] = complete_data[cols].fillna(0).astype('int')

In [18]:
# rename state/UT names
complete_data['Name of State / UT'].replace('Chattisgarh', 'Chhattisgarh', inplace=True)
complete_data['Name of State / UT'].replace('Pondicherry', 'Puducherry', inplace=True) 

In [19]:
# select only rows with more than 1 case
complete_data = complete_data[complete_data['Total Confirmed cases']>0]

In [20]:
# drop extra columns
complete_data = complete_data.drop(['Active Cases*'], axis=1)

In [21]:
# rearrange columns
complete_data = complete_data[['Date', 'Name of State / UT', 'Latitude', 'Longitude', 
                               'Total Confirmed cases', 'Death', 'Cured/Discharged/Migrated']]

## Final dataframe

In [22]:
# random rows
complete_data.sample(3)

Unnamed: 0,Date,Name of State / UT,Latitude,Longitude,Total Confirmed cases,Death,Cured/Discharged/Migrated
1132,2020-04-20,Kerala,10.8505,76.2711,402,3,270
1587,2020-05-04,Meghalaya,25.467,91.3662,12,1,0
43,2020-03-04,Telengana,18.1124,79.0193,1,0,0


In [23]:
# complete data info
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2775 entries, 0 to 2782
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       2775 non-null   datetime64[ns]
 1   Name of State / UT         2775 non-null   object        
 2   Latitude                   2775 non-null   float64       
 3   Longitude                  2775 non-null   float64       
 4   Total Confirmed cases      2775 non-null   int64         
 5   Death                      2775 non-null   int32         
 6   Cured/Discharged/Migrated  2775 non-null   int32         
dtypes: datetime64[ns](1), float64(2), int32(2), int64(1), object(1)
memory usage: 151.8+ KB


## Save as .csv file

In [24]:
# save data in a csv file
complete_data.to_csv('complete.csv', index=False)

In [25]:
# complete_data.groupby('Date').count()

In [26]:
# complete_data.sort_values('Death', ascending=False)

In [27]:
complete_data[complete_data['Date']==max(complete_data['Date'])]

Unnamed: 0,Date,Name of State / UT,Latitude,Longitude,Total Confirmed cases,Death,Cured/Discharged/Migrated
2748,2020-06-08,Andaman and Nicobar Islands,11.7401,92.6586,33,0,33
2749,2020-06-08,Andhra Pradesh,15.9129,79.74,4708,75,2682
2750,2020-06-08,Arunachal Pradesh,28.218,94.7278,51,0,1
2751,2020-06-08,Assam,26.2006,92.9376,2565,4,615
2752,2020-06-08,Bihar,25.0961,85.3131,5088,30,2480
2753,2020-06-08,Chandigarh,30.7333,76.7794,314,5,273
2754,2020-06-08,Chhattisgarh,21.2787,81.8661,1073,4,283
2755,2020-06-08,Dadar Nagar Haveli,20.1809,73.0169,20,0,2
2756,2020-06-08,Delhi,28.7041,77.1025,28936,812,10999
2757,2020-06-08,Goa,15.2993,74.124,300,0,65
