# Libraries

In [55]:
# import libraries
# ================

# for date and time operations
from datetime import datetime
# for file and folder operations
import os
# for regular expression operations
import re
# for listing files in a folder
import glob
# for getting web contents
import requests 
# storing and analysing data
import pandas as pd
# for scraping web contents
from bs4 import BeautifulSoup

# Web Scraping

In [73]:
# get data
# ========

# link at which web data resides
link = 'https://www.mohfw.gov.in/'
# get web data
# a timeout is required: without one, requests waits indefinitely on a stalled server
req = requests.get(link, timeout=30)
# fail here on an HTTP error (4xx / 5xx) instead of parsing an error page as data
req.raise_for_status()
# parse web data
soup = BeautifulSoup(req.content, "html.parser")

In [74]:
# locate the target table
# =======================
# the table we want is the last one on the page

# last <thead> on the page: may hold the column names, titles, subtitles
all_theads = soup.find_all('thead')
thead = all_theads[-1]

# rows of the table head; usually a single row carrying the column names
head = thead.find_all('tr')

# last <tbody> on the page: holds the table contents
all_tbodies = soup.find_all('tbody')
tbody = all_tbodies[-1]

# rows of the table body; each row is one state's entry
body = tbody.find_all('tr')

In [75]:
# extract the cell text
# =====================

# header rows / column titles: one list of cell texts per <tr> of the head
head_rows = [[cell.text for cell in tr.find_all(['th', 'td'])] for tr in head]

# table contents: one list of cell texts per <tr> of the body
body_rows = [[cell.text for cell in tr.find_all(['th', 'td'])] for tr in body]

In [None]:
# build the dataframe from the scraped rows
# =========================================

# leave out the last 3 rows, they contain unwanted info
state_rows = body_rows[:len(body_rows)-3]

# the first header row supplies the column titles;
# the 'S. No.' column is dropped straight away
df_bs = pd.DataFrame(state_rows, columns=head_rows[0]).drop(columns='S. No.')

# there are 36 states+UT in India, so 36 rows shows every entry
df_bs.head(36)

# Data Cleaning

In [57]:
# date-time information
# ---------------------

# time of this run; also used later to name the saved CSV file
now = datetime.now()

# stamp every row with the run date (month/day/year text parsed back to a datetime)
date_text = now.strftime("%m/%d/%Y")
df_bs['Date'] = pd.to_datetime(pd.Series(date_text, index=df_bs.index), format='%m/%d/%Y')

# remove the '#' characters from the state / UT names
state_col = 'Name of State / UT'
df_bs[state_col] = df_bs[state_col].str.replace('#', '', regex=False)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 111 foreign Nationals),Cured/Discharged/Migrated,Death,Date
0,Andaman and Nicobar Islands,27,11,0,2020-04-25
1,Andhra Pradesh,955,145,29,2020-04-25
2,Arunachal Pradesh,1,1,0,2020-04-25
3,Assam,36,19,1,2020-04-25
4,Bihar,223,46,2,2020-04-25
5,Chandigarh,27,14,0,2020-04-25
6,Chhattisgarh,36,30,0,2020-04-25
7,Delhi,2514,857,53,2020-04-25
8,Goa,7,7,0,2020-04-25
9,Gujarat,2815,265,127,2020-04-25


In [58]:
# distinct state / UT names present in df_bs, to check them against the coordinate lookups
df_bs['Name of State / UT'].unique()

array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Delhi', 'Goa', 'Gujarat', 'Haryana',
       'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka',
       'Kerala', 'Ladakh', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Meghalaya', 'Mizoram', 'Odisha', 'Puducherry', 'Punjab',
       'Rajasthan', 'Tamil Nadu', 'Telengana', 'Tripura', 'Uttarakhand',
       'Uttar Pradesh', 'West Bengal'], dtype=object)

In [59]:
# latitude and longitude information
# ----------------------------------

# one (latitude, longitude) pair per state / UT name
coords = {
    'Delhi': (28.7041, 77.1025),
    'Haryana': (29.0588, 76.0856),
    'Kerala': (10.8505, 76.2711),
    'Rajasthan': (27.0238, 74.2179),
    'Telengana': (18.1124, 79.0193),
    'Uttar Pradesh': (26.8467, 80.9462),
    'Ladakh': (34.2996, 78.2932),
    'Tamil Nadu': (11.1271, 78.6569),
    'Jammu and Kashmir': (33.7782, 76.5762),
    'Punjab': (31.1471, 75.3412),
    'Karnataka': (15.3173, 75.7139),
    'Maharashtra': (19.7515, 75.7139),
    'Andhra Pradesh': (15.9129, 79.7400),
    'Odisha': (20.9517, 85.0985),
    'Uttarakhand': (30.0668, 79.0193),
    'West Bengal': (22.9868, 87.8550),
    'Puducherry': (11.9416, 79.8083),
    'Chandigarh': (30.7333, 76.7794),
    'Chhattisgarh': (21.2787, 81.8661),
    'Gujarat': (22.2587, 71.1924),
    'Himachal Pradesh': (31.1048, 77.1734),
    'Madhya Pradesh': (22.9734, 78.6569),
    'Bihar': (25.0961, 85.3131),
    'Manipur': (24.6637, 93.9063),
    'Mizoram': (23.1645, 92.9376),
    'Goa': (15.2993, 74.1240),
    'Andaman and Nicobar Islands': (11.7401, 92.6586),
    'Assam': (26.2006, 92.9376),
    'Jharkhand': (23.6102, 85.2799),
    'Arunachal Pradesh': (28.2180, 94.7278),
    'Tripura': (23.9408, 91.9882),
    'Nagaland': (26.1584, 94.5624),
    'Meghalaya': (25.4670, 91.3662),
}

# per-axis lookups keyed by state / UT name
lat = {state: pair[0] for state, pair in coords.items()}
long = {state: pair[1] for state, pair in coords.items()}

# attach the coordinates by looking up each row's state / UT name
state_names = df_bs['Name of State / UT']
df_bs['Latitude'] = state_names.map(lat)
df_bs['Longitude'] = state_names.map(long)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 111 foreign Nationals),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,27,11,0,2020-04-25,11.7401,92.6586
1,Andhra Pradesh,955,145,29,2020-04-25,15.9129,79.74
2,Arunachal Pradesh,1,1,0,2020-04-25,28.218,94.7278
3,Assam,36,19,1,2020-04-25,26.2006,92.9376
4,Bihar,223,46,2,2020-04-25,25.0961,85.3131
5,Chandigarh,27,14,0,2020-04-25,30.7333,76.7794
6,Chhattisgarh,36,30,0,2020-04-25,21.2787,81.8661
7,Delhi,2514,857,53,2020-04-25,28.7041,77.1025
8,Goa,7,7,0,2020-04-25,15.2993,74.124
9,Gujarat,2815,265,127,2020-04-25,22.2587,71.1924


In [60]:
# count missing values per column; a non-zero Latitude / Longitude count would
# point to a state name that has no entry in the coordinate lookups
df_bs.isna().sum()

Name of State / UT                                          0
Total Confirmed cases (Including 111 foreign Nationals)     0
Cured/Discharged/Migrated                                   0
Death                                                       0
Date                                                        0
Latitude                                                    0
Longitude                                                   0
dtype: int64

# Saving data

In [61]:
# saving data
# -----------

# one CSV per run date, named like 2020_04_25.csv
file_name = now.strftime("%Y_%m_%d")+'.csv'
# output folder: taken from the COVID_INDIA_DATA_DIR environment variable when it is
# set, so the notebook can run on other machines; otherwise the original location
file_loc = os.environ.get('COVID_INDIA_DATA_DIR',
                          'C:\\Users\\imdevskp\\Documents\\github\\covid_india\\.day_by_day_data\\')
# os.path.join works whether or not the folder ends with a path separator
df_bs.to_csv(os.path.join(file_loc, file_name), index=False)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 111 foreign Nationals),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,27,11,0,2020-04-25,11.7401,92.6586
1,Andhra Pradesh,955,145,29,2020-04-25,15.9129,79.74
2,Arunachal Pradesh,1,1,0,2020-04-25,28.218,94.7278
3,Assam,36,19,1,2020-04-25,26.2006,92.9376
4,Bihar,223,46,2,2020-04-25,25.0961,85.3131
5,Chandigarh,27,14,0,2020-04-25,30.7333,76.7794
6,Chhattisgarh,36,30,0,2020-04-25,21.2787,81.8661
7,Delhi,2514,857,53,2020-04-25,28.7041,77.1025
8,Goa,7,7,0,2020-04-25,15.2993,74.124
9,Gujarat,2815,265,127,2020-04-25,22.2587,71.1924


In [62]:
# column labels of df_bs as saved to the daily CSV
df_bs.columns

Index(['Name of State / UT',
       'Total Confirmed cases (Including 111 foreign Nationals) ',
       'Cured/Discharged/Migrated', 'Death', 'Date', 'Latitude', 'Longitude'],
      dtype='object')

# Combining data

In [63]:
# list the per-day CSV files currently in the data folder (shell command, needs `ls` on the PATH)
! ls C:\Users\imdevskp\Documents\github\covid_india\.day_by_day_data

2020_03_21.csv
2020_03_22.csv
2020_03_23.csv
2020_03_24.csv
2020_03_25.csv
2020_03_26.csv
2020_03_27.csv
2020_03_28.csv
2020_03_29.csv
2020_03_30.csv
2020_03_31.csv
2020_04_01.csv
2020_04_02.csv
2020_04_03.csv
2020_04_04.csv
2020_04_05.csv
2020_04_06.csv
2020_04_07.csv
2020_04_08.csv
2020_04_09.csv
2020_04_10.csv
2020_04_11.csv
2020_04_12.csv
2020_04_13.csv
2020_04_14.csv
2020_04_15.csv
2020_04_16.csv
2020_04_17.csv
2020_04_18.csv
2020_04_19.csv
2020_04_20.csv
2020_04_21.csv
2020_04_22.csv
2020_04_23.csv
2020_04_24.csv
2020_04_25.csv


In [64]:
# pd.read_csv?

In [65]:
# complete data
# =============

# folder holding the per-day CSVs: taken from the COVID_INDIA_DATA_DIR environment
# variable when it is set, otherwise the original location
loc = os.environ.get('COVID_INDIA_DATA_DIR',
                     "C:\\Users\\imdevskp\\Documents\\github\\covid_india\\.day_by_day_data\\")

# glob returns files in arbitrary order; sorting makes the read order (and with it
# the column order of the combined frame) the same on every run
files = sorted(glob.glob(os.path.join(loc, '2020*.csv')))

# column titles differ between files; map every known variant onto one title
column_aliases = {'Cured': 'Cured/Discharged/Migrated',
                  'Cured/Discharged': 'Cured/Discharged/Migrated',
                  'Total Confirmed cases *': 'Total Confirmed cases',
                  'Total Confirmed cases ': 'Total Confirmed cases'}
# the foreign-nationals count inside this title changes from file to file, hence a regex;
# raw string because '\(' in a normal string literal is an invalid escape sequence
foreign_nationals_title = re.compile(r'Total Confirmed cases \(Including .* foreign Nationals\) ')

dfs = []
for path in files:
    df_temp = pd.read_csv(path)
    df_temp = df_temp.rename(columns=column_aliases)
    df_temp = df_temp.rename(columns=lambda x: foreign_nationals_title.sub('Total Confirmed cases', x))
    dfs.append(df_temp)

# stack all days, parse the dates, then order by date and state name
# (sorting is done once, after parsing, so dates compare as dates rather than as text)
complete_data = pd.concat(dfs, ignore_index=True)
complete_data['Date'] = pd.to_datetime(complete_data['Date'])
complete_data = complete_data.sort_values(['Date', 'Name of State / UT']).reset_index(drop=True)

# count columns that are absent from some files: fill the gaps with 0 and store as integers
cols = ['Total Confirmed cases (Indian National)', 'Total Confirmed cases ( Foreign National )',
        'Cured/Discharged/Migrated', 'Death']
complete_data[cols] = complete_data[cols].fillna(0).astype('int')

In [66]:
# column labels of the combined frame after the title variants were merged
complete_data.columns

Index(['Date', 'Name of State / UT', 'Total Confirmed cases (Indian National)',
       'Total Confirmed cases ( Foreign National )',
       'Cured/Discharged/Migrated', 'Latitude', 'Longitude', 'Death',
       'Total Confirmed cases'],
      dtype='object')

In [67]:
# unify alternative spellings of state / UT names
# assign the result back: replace(..., inplace=True) on a column selection acts on an
# intermediate object and does not update the frame under pandas Copy-on-Write
complete_data['Name of State / UT'] = complete_data['Name of State / UT'].replace(
    {'Chattisgarh': 'Chhattisgarh', 'Pondicherry': 'Puducherry'})
# NOTE(review): the unique() output of this notebook still lists 'Union Territory of ...'
# variants next to 'Ladakh', 'Jammu and Kashmir' and 'Chandigarh' - confirm whether
# those should be unified here as well

In [68]:
# distinct state / UT names in the combined data, to spot spelling variants
complete_data['Name of State / UT'].unique()

array(['Kerala', 'Delhi', 'Telengana', 'Rajasthan', 'Haryana',
       'Uttar Pradesh', 'Tamil Nadu', 'Union Territory of Ladakh',
       'Karnataka', 'Maharashtra', 'Punjab',
       'Union Territory of Jammu and Kashmir', 'Andhra Pradesh',
       'Uttarakhand', 'Odisha', 'Puducherry', 'West Bengal',
       'Chhattisgarh', 'Union Territory of Chandigarh', 'Gujarat',
       'Chandigarh', 'Himachal Pradesh', 'Jammu and Kashmir', 'Ladakh',
       'Madhya Pradesh', 'Bihar', 'Manipur', 'Mizoram',
       'Andaman and Nicobar Islands', 'Goa', 'Assam', 'Jharkhand',
       'Arunachal Pradesh', 'Tripura', 'Meghalaya', 'Nagaland'],
      dtype=object)

In [69]:
# last rows of the combined frame (latest date, since it is sorted by date)
complete_data.tail()

Unnamed: 0,Date,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Latitude,Longitude,Death,Total Confirmed cases
1306,2020-04-25,Telengana,0,0,253,18.1124,79.0193,26,984
1307,2020-04-25,Tripura,0,0,1,23.9408,91.9882,0,2
1308,2020-04-25,Uttar Pradesh,0,0,247,26.8467,80.9462,25,1621
1309,2020-04-25,Uttarakhand,0,0,25,30.0668,79.0193,0,48
1310,2020-04-25,West Bengal,0,0,103,22.9868,87.855,18,571


In [70]:
# sorted(complete_data['Name of State / UT'].unique())

In [71]:
# dtype and non-null count of every column in the combined frame
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 9 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   Date                                        1311 non-null   datetime64[ns]
 1   Name of State / UT                          1311 non-null   object        
 2   Total Confirmed cases (Indian National)     1311 non-null   int32         
 3   Total Confirmed cases ( Foreign National )  1311 non-null   int32         
 4   Cured/Discharged/Migrated                   1311 non-null   int32         
 5   Latitude                                    1311 non-null   float64       
 6   Longitude                                   1311 non-null   float64       
 7   Death                                       1311 non-null   int32         
 8   Total Confirmed cases                       1311 non-null   int64         
dtypes: dateti

In [72]:
# write the combined data to complete.csv in the current working directory, without the index
complete_data.to_csv('complete.csv', index=False)