In [14]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import pandas as pd
from bs4 import BeautifulSoup

# Web Scraping

In [15]:
# web scraping
# ------------
# Download the MoHFW landing page and load its first HTML table into `df_bs`.

link = 'https://www.mohfw.gov.in/'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
req = requests.get(link, timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")

tables = soup.find_all('table')
if not tables:
    # The page layout changes over time; fail with a message that says so.
    raise ValueError("No <table> element found at " + link)
table = tables[0]
rows = table.find_all('tr')

# One list of cell texts per <tr>; header cells (<th>) and data cells (<td>) alike.
row_list = [[cell.text for cell in tr.find_all(['th', 'td'])] for tr in rows]

# The first row supplies the column names; the last row is left out of the data
# (presumably a totals/footer row - confirm against the live page).
df_bs = pd.DataFrame(row_list[1:-1], columns=row_list[0])

# The serial-number column carries no information once the rows are in a DataFrame.
df_bs = df_bs.drop('S. No.', axis=1)
df_bs.head(20)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death
0,Andhra Pradesh,2,0,0,0
1,Chattisgarh,1,0,0,0
2,Delhi,11,1,3,1
3,Haryana,3,14,0,0
4,Karnataka,14,0,0,1
5,Kerala,25,2,3,0
6,Maharashtra,44,3,0,1
7,Odisha,1,0,0,0
8,Pondicherry,1,0,0,0
9,Punjab,2,0,0,1


# Data Cleaning

In [16]:
# date-time information
# ---------------------
# Stamp every scraped row with today's date, stored as a datetime column.

date_fmt = "%m/%d/%Y"
now = datetime.now()  # reused further down to name the daily CSV file
today_text = now.strftime(date_fmt)

# Build the column from the formatted text and parse it back with the same
# format, so the time-of-day part is dropped.
df_bs['Date'] = pd.to_datetime(pd.Series(today_text, index=df_bs.index), format=date_fmt)
df_bs.head(20)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death,Date
0,Andhra Pradesh,2,0,0,0,2020-03-19
1,Chattisgarh,1,0,0,0,2020-03-19
2,Delhi,11,1,3,1,2020-03-19
3,Haryana,3,14,0,0,2020-03-19
4,Karnataka,14,0,0,1,2020-03-19
5,Kerala,25,2,3,0,2020-03-19
6,Maharashtra,44,3,0,1,2020-03-19
7,Odisha,1,0,0,0,2020-03-19
8,Pondicherry,1,0,0,0,2020-03-19
9,Punjab,2,0,0,1,2020-03-19


In [17]:
# Show the distinct state / UT names present in the scraped table.
df_bs['Name of State / UT'].unique()

array(['Andhra Pradesh', 'Chattisgarh', 'Delhi', 'Haryana', 'Karnataka',
       'Kerala', 'Maharashtra', 'Odisha', 'Pondicherry', 'Punjab',
       'Rajasthan', 'Tamil Nadu', 'Telengana',
       'Union Territory of Chandigarh',
       'Union Territory of Jammu and Kashmir',
       'Union Territory of Ladakh', 'Uttar Pradesh', 'Uttarakhand',
       'West Bengal'], dtype=object)

In [18]:
# latitude and longitude information
# ----------------------------------
# One (latitude, longitude) pair per state / UT, keyed by the exact spelling
# found in the 'Name of State / UT' column.

state_coords = {
    'Delhi': (28.7041, 77.1025),
    'Haryana': (29.0588, 76.0856),
    'Kerala': (10.8505, 76.2711),
    'Rajasthan': (27.0238, 74.2179),
    'Telengana': (18.1124, 79.0193),
    'Uttar Pradesh': (26.8467, 80.9462),
    'Union Territory of Ladakh': (34.2996, 78.2932),
    'Tamil Nadu': (11.1271, 78.6569),
    'Union Territory of Jammu and Kashmir': (33.7782, 76.5762),
    'Punjab': (31.1471, 75.3412),
    'Karnataka': (15.3173, 75.7139),
    'Maharashtra': (19.7515, 75.7139),
    'Andhra Pradesh': (15.9129, 79.7400),
    'Odisha': (20.9517, 85.0985),
    'Uttarakhand': (30.0668, 79.0193),
    'West Bengal': (22.9868, 87.8550),
    'Pondicherry': (11.9416, 79.8083),
    'Union Territory of Chandigarh': (30.7333, 76.7794),
    'Chattisgarh': (21.2787, 81.8661),
}

# Separate name -> value lookups, one per coordinate.
lat = {state: pair[0] for state, pair in state_coords.items()}
long = {state: pair[1] for state, pair in state_coords.items()}

# Names missing from the lookups come out of .map() as NaN.
state_names = df_bs['Name of State / UT']
df_bs['Latitude'] = state_names.map(lat)
df_bs['Longitude'] = state_names.map(long)

df_bs.head()

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,2,0,0,0,2020-03-19,15.9129,79.74
1,Chattisgarh,1,0,0,0,2020-03-19,21.2787,81.8661
2,Delhi,11,1,3,1,2020-03-19,28.7041,77.1025
3,Haryana,3,14,0,0,2020-03-19,29.0588,76.0856
4,Karnataka,14,0,0,1,2020-03-19,15.3173,75.7139


In [19]:
# Count missing values per column. A non-zero Latitude / Longitude count means
# some state name had no entry in the coordinate lookup dictionaries.
df_bs.isna().sum()

Name of State / UT                            0
Total Confirmed cases (Indian National)       0
Total Confirmed cases ( Foreign National )    0
Cured/Discharged/Migrated                     0
Death                                         0
Date                                          0
Latitude                                      0
Longitude                                     0
dtype: int64

# Saving data

In [20]:
# saving data
# -----------
# Write today's snapshot to its own CSV, named after the scrape date (YYYY_MM_DD.csv).

file_loc = 'C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\'
file_name = f"{now:%Y_%m_%d}.csv"

# file_loc already ends with a path separator, so plain concatenation is enough.
df_bs.to_csv(f"{file_loc}{file_name}", index=False)

df_bs.head(20)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,2,0,0,0,2020-03-19,15.9129,79.74
1,Chattisgarh,1,0,0,0,2020-03-19,21.2787,81.8661
2,Delhi,11,1,3,1,2020-03-19,28.7041,77.1025
3,Haryana,3,14,0,0,2020-03-19,29.0588,76.0856
4,Karnataka,14,0,0,1,2020-03-19,15.3173,75.7139
5,Kerala,25,2,3,0,2020-03-19,10.8505,76.2711
6,Maharashtra,44,3,0,1,2020-03-19,19.7515,75.7139
7,Odisha,1,0,0,0,2020-03-19,20.9517,85.0985
8,Pondicherry,1,0,0,0,2020-03-19,11.9416,79.8083
9,Punjab,2,0,0,1,2020-03-19,31.1471,75.3412


In [21]:
# Show the column labels of the snapshot that was just written to disk.
df_bs.columns

Index(['Name of State / UT', 'Total Confirmed cases (Indian National)',
       'Total Confirmed cases ( Foreign National )',
       'Cured/Discharged/Migrated', 'Death', 'Date', 'Latitude', 'Longitude'],
      dtype='object')

# Combining data

In [22]:
# List the saved daily snapshot files. os.listdir is used instead of `! ls`
# because `ls` is not a native Windows command and this is a Windows path;
# sorting gives the same alphabetical order that `ls` would show.
sorted(os.listdir('C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data'))

2020_03_10.csv
2020_03_11.csv
2020_03_12.csv
2020_03_13.csv
2020_03_14.csv
2020_03_15.csv
2020_03_16.csv
2020_03_17.csv
2020_03_18.csv
2020_03_19.csv


In [23]:
# pd.read_csv?

In [24]:
# complete data
# -------------
# Stitch every daily snapshot CSV into one date-sorted table and save it
# as 'complete.csv' in the current working directory.

loc = "C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\"

# glob returns paths in arbitrary order; sorting makes the run reproducible.
files = sorted(glob.glob(loc+'2020*.csv'))
if not files:
    raise FileNotFoundError("No daily snapshot files matching '2020*.csv' found in " + loc)

# Some snapshots label the recovered column differently (presumably older
# files - confirm against the data); bring every variant to the current name.
recovered_aliases = {
    'Cured': 'Cured/Discharged/Migrated',
    'Cured/Discharged': 'Cured/Discharged/Migrated',
}

dfs = []
for csv_path in files:
    df_temp = pd.read_csv(csv_path).rename(columns=recovered_aliases)
    dfs.append(df_temp)

complete_data = pd.concat(dfs, ignore_index=True)

# Parse the dates BEFORE sorting so rows are ordered chronologically rather
# than alphabetically by their text representation.
complete_data['Date'] = pd.to_datetime(complete_data['Date'])
complete_data = complete_data.sort_values(['Date', 'Name of State / UT']).reset_index(drop=True)

cols = ['Total Confirmed cases (Indian National)', 'Total Confirmed cases ( Foreign National )', 
              'Cured/Discharged/Migrated', 'Death']

# Columns absent from some snapshots come out of concat as NaN; treat those as 0
# so the counts can be stored as integers.
complete_data[cols] = complete_data[cols].fillna(0).astype('int')

complete_data.to_csv('complete.csv', index=False)
complete_data.head()

Unnamed: 0,Date,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Latitude,Longitude,Death
0,2020-01-30,Kerala,1,0,0,10.8505,76.2711,0
1,2020-01-31,Kerala,1,0,0,10.8505,76.2711,0
2,2020-02-01,Kerala,2,0,0,10.8505,76.2711,0
3,2020-02-02,Kerala,3,0,0,10.8505,76.2711,0
4,2020-02-03,Kerala,3,0,0,10.8505,76.2711,0


In [25]:
# Show the column labels of the combined table.
complete_data.columns

Index(['Date', 'Name of State / UT', 'Total Confirmed cases (Indian National)',
       'Total Confirmed cases ( Foreign National )',
       'Cured/Discharged/Migrated', 'Latitude', 'Longitude', 'Death'],
      dtype='object')

In [26]:
# Print a summary of the combined table: row count, per-column non-null
# counts, dtypes and memory usage.
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Data columns (total 8 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   Date                                        228 non-null    datetime64[ns]
 1   Name of State / UT                          228 non-null    object        
 2   Total Confirmed cases (Indian National)     228 non-null    int32         
 3   Total Confirmed cases ( Foreign National )  228 non-null    int32         
 4   Cured/Discharged/Migrated                   228 non-null    int32         
 5   Latitude                                    228 non-null    float64       
 6   Longitude                                   228 non-null    float64       
 7   Death                                       228 non-null    int32         
dtypes: datetime64[ns](1), float64(2), int32(4), object(1)
memory usage: 10.8+ KB
