In [113]:
#Importing packages
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from selenium import webdriver
from datetime import datetime

### Step 1 ===> Data Gathering / Data Extraction
####  <ul > <li> Let’s now create a new instance of Google Chrome.</li> <li> This lets our program open a URL in Google Chrome and scrape the dynamically loaded data (the reason we are using Selenium). </li></ul>

In [114]:
# Fetch the rendered page with a real Chrome instance: the table content is
# filled in by JavaScript after the initial load, so the HTML is taken from
# the browser's DOM (driver.page_source).
driver = webdriver.Chrome(executable_path = r"c:/Users/mohan/OneDrive/Desktop/projects/chromedriver_win32/chromedriver.exe")
url = 'https://www.mohfw.gov.in/'
try:
    driver.get(url)
    webContent = driver.page_source
finally:
    # Always shut the browser down, even if loading the page failed, so no
    # Chrome/chromedriver process is left running.
    driver.quit()
# NOTE(review): 'html' lets BeautifulSoup pick whichever HTML parser is
# installed; consider pinning one explicitly for reproducible parsing.
PySoup = BeautifulSoup(webContent,'html')

### We now have the page in "PySoup"; next we scrape the data we need (date, headings, table)
#### ========== >> Getting Date on which we are Scraping The COVID-19 Data << ==============================

In [115]:
# The date is read from the <h5><span> text of the first
# "data-table table-responsive" div.
div_data = PySoup.find_all('div',{'class' : "data-table table-responsive"})
# Keep the piece between the first ':' and the following ','. Surrounding
# whitespace is stripped so parsing does not depend on the page's exact spacing.
get_date = div_data[0].h5.span.text.split(":")[1].split(",")[0].strip()
# Day, full month name, 4-digit year. Named date_format so the builtin
# format() is not shadowed for the rest of the notebook.
date_format = '%d %B %Y'
datetime_obj = datetime.strptime(get_date, date_format).date()

# Alternatively, to use today's date instead of the scraped one:
# datetime_obj = datetime.today().date()

#### ========== >> Getting the Headings of the COVID-19 Data Table << ==============================

In [116]:
#Scraping headings ==>
get_head = PySoup.find_all('thead')[0].find('tr',{'class' : "row1"})
headings = [th.get_text().strip() for th in get_head.find_all("th")]
headings.insert(3,'Change Active Case Since Yesterday')
headings.insert(5,'Change Cured Case Since Yesterday')
headings.insert(7,'Change Death Case Since Yesterday')

#### ============= >> Getting Table from the tbody tag  on which we have our COVID-19 Data State-wise << ====================

In [117]:
def _cell_value(td):
    """Return the value of one <td>.

    The stripped cell text is returned as a string, except when the cell
    contains a <span> whose class list includes "down": then the text is
    converted to int and negated.
    """
    text = td.get_text().strip()
    span = td.span
    # `class` is a list of class names; test membership instead of indexing
    # so an empty or missing class list cannot raise.
    if span is not None and "down" in (span.get('class') or []):
        return -int(text)
    return text


# All rows of the first <tbody>; find_all("tr") yields only tags, so no
# per-row type check is needed.
get_tbody = PySoup.find_all('tbody')[0]
list_of_tr_rows = get_tbody.find_all("tr")

# One dict per table row, keyed by the entries of `headings`.
mytabledata = []
for tr_row in list_of_tr_rows:
    statedata_singlerow = [_cell_value(td) for td in tr_row.find_all("td")]
    # zip() stops at the shorter sequence, so short rows get fewer keys.
    data = dict(zip(headings, statedata_singlerow))
    # Keep only rows long enough to have reached the 'Active Cases*' heading.
    if 'Active Cases*' in data:
        mytabledata.append(data)

### Step 2 ==> Data Transformation / Munging / Wrangling
#### ========== >> Converting "mytabledata" into our DataFrame & doing Data Cleaning << ======================================

In [118]:
# Build the frame from the scraped row dicts, then give the columns
# friendlier names and index the rows by serial number.
state_data = pd.DataFrame(mytabledata, columns=headings)
new_cols = ["Sr.No","States/UT","Active Cases","Active Cases Since Yesterday",\
            "Recovered","Recovered Cases Since Yesterday","Deceased","Deceased Cases Since Yesterday"]
state_data.columns = new_cols
state_data = state_data.set_index('Sr.No')

# Re-align the 'Total#' row: each value is copied from the column that
# currently holds it into the column it belongs to. The order matters: each
# source is read before it is overwritten by the next statement.
# NOTE(review): presumably the total row has fewer cells than a state row,
# which shifts its values to the left -- confirm against the page.
# A single .loc[row, column] assignment is used so the write lands in
# state_data itself rather than in a temporary row object.
state_data.loc['Total#', 'Deceased'] = state_data.loc['Total#', 'Recovered']
state_data.loc['Total#', 'Recovered'] = state_data.loc['Total#', 'Active Cases']
state_data.loc['Total#', 'Active Cases'] = state_data.loc['Total#', 'States/UT']

# Stamp every row with the scrape date (dd-mm-yyyy) and tidy the total label.
state_data['Date'] = datetime_obj.strftime("%d-%m-%Y")
state_data = state_data.rename(index={'Total#': 'Total'})
# state_data

### Step 3  ==> Data cleaning

In [119]:
# Treat empty strings and missing values as 0 so every count can be cast to int.
state_data = state_data.replace(['', np.nan], 0)
# Every column except the state name and the date is a numeric count.
cols_to_include = [i for i in state_data.columns if i not in ['States/UT','Date']]
# The original assigned to `state_data[cols]`, but `cols` is never defined;
# the intended target is the same column list that is being converted.
state_data[cols_to_include] = state_data[cols_to_include].astype('int')
state_data['States/UT'] = state_data['States/UT'].astype('str')
# The deceased change column is stored as a magnitude (sign dropped).
state_data['Deceased Cases Since Yesterday'] = state_data['Deceased Cases Since Yesterday'].abs()
state_data

Unnamed: 0_level_0,States/UT,Active Cases,Active Cases Since Yesterday,Recovered,Recovered Cases Since Yesterday,Deceased,Deceased Cases Since Yesterday,Date
Sr.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Andaman and Nicobar Islands,484,81,242,16,8,1,03-08-2020
2,Andhra Pradesh,74404,2216,82886,6272,1474,67,03-08-2020
3,Arunachal Pradesh,699,-2,996,27,3,0,03-08-2020
4,Assam,10415,232,32384,942,105,4,03-08-2020
5,Bihar,20306,1369,36389,1395,329,20,03-08-2020
6,Chandigarh,400,22,698,15,19,1,03-08-2020
7,Chhattisgarh,2482,-238,6991,381,58,3,03-08-2020
8,Dadra and Nagar Haveli and Daman and Diu,416,-2,766,41,2,0,03-08-2020
9,Delhi,10356,-240,123317,1186,4004,15,03-08-2020
10,Goa,1809,102,4668,230,53,5,03-08-2020


### Step 4 ==> Data Exporting  
#### Saving Dataframe (state_data) into Excel Sheet or CSV file

In [120]:
from datetime import datetime
def custom_strftime(format,date_obj):
    """Format `date_obj` with `strftime`, expanding '{S}' to an ordinal day.

    `format` is a normal strftime pattern that may also contain the
    placeholder '{S}'. After formatting, every '{S}' is replaced by the day
    of the month followed by its English ordinal suffix (1st, 2nd, 3rd,
    4th, ... 11th, 12th, 13th, ... 21st, ...).
    """
    day_number = int(date_obj.day)
    if 11 <= day_number <= 13:
        # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
        suffix = 'th'
    else:
        suffix = {1:'st',2:'nd',3:'rd'}.get(day_number % 10, 'th')
    ordinal_day = str(date_obj.day) + str(suffix)
    formatted = date_obj.strftime(format)
    return formatted.replace('{S}', ordinal_day)
# File stem is "COVID19_" followed by the ordinal day and abbreviated month
# of datetime_obj, as produced by custom_strftime('{S}%b', ...).
# (To stamp the files with the current date instead, set
#  datetime_obj = datetime.now() before this cell.)
file_name = "COVID19_{}".format(custom_strftime('{S}%b',datetime_obj))

# Write the same frame twice: first an Excel workbook, then a CSV.
excel_path = r'c:/Users/mohan/OneDrive/Desktop/projects/{}.xlsx'.format(file_name)
state_data.to_excel(excel_path, sheet_name = 'COVID19 State Data')

csv_path = r'c:/Users/mohan/OneDrive/Desktop/projects/{}.csv'.format(file_name)
state_data.to_csv(csv_path)

state_data

Unnamed: 0_level_0,States/UT,Active Cases,Active Cases Since Yesterday,Recovered,Recovered Cases Since Yesterday,Deceased,Deceased Cases Since Yesterday,Date
Sr.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Andaman and Nicobar Islands,484,81,242,16,8,1,03-08-2020
2,Andhra Pradesh,74404,2216,82886,6272,1474,67,03-08-2020
3,Arunachal Pradesh,699,-2,996,27,3,0,03-08-2020
4,Assam,10415,232,32384,942,105,4,03-08-2020
5,Bihar,20306,1369,36389,1395,329,20,03-08-2020
6,Chandigarh,400,22,698,15,19,1,03-08-2020
7,Chhattisgarh,2482,-238,6991,381,58,3,03-08-2020
8,Dadra and Nagar Haveli and Daman and Diu,416,-2,766,41,2,0,03-08-2020
9,Delhi,10356,-240,123317,1186,4004,15,03-08-2020
10,Goa,1809,102,4668,230,53,5,03-08-2020
