In [11]:
#Importing packages
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from selenium import webdriver
from datetime import datetime,date

### step 1 ===> Data Gathering / Data Extracting 
####  <ul > <li> Let’s now create a new instance of Google Chrome.</li> <li> This lets our program open a URL in Google Chrome and scrape the dynamically rendered data (the reason we are using Selenium).</li></ul>

In [12]:
# Fetch the rendered page through a real Chrome session: the table content is
# filled in by JavaScript after the initial load, so the page source has to be
# read from the browser rather than from a plain HTTP response.
driver = webdriver.Chrome(executable_path = r"c:/Users/mohan/OneDrive/Desktop/projects/chromedriver_win32/chromedriver.exe")
url = 'https://www.mohfw.gov.in/'
try:
    driver.get(url)
    webContent = driver.page_source
finally:
    # Always close the browser, even when navigation fails; otherwise the
    # Chrome and chromedriver processes are left running.
    driver.quit()
# page_source is a plain string, so it can be parsed after the browser is closed.
PySoup = BeautifulSoup(webContent,'html')

### We now have the page in "PySoup"; next we scrape the data we need (date, headings, table)
#### ========== >> Getting Date on which we are Scraping The COVID-19 Data << ==============================

In [13]:
# The date text is read from the <h5><span> of the first div whose class is
# "data-table table-responsive".
div_data = PySoup.find_all('div', attrs={'class': "data-table table-responsive"})
caption_text = div_data[0].h5.span.text
# Keep the piece after the first ':' and cut it at the first ','.
after_colon = caption_text.split(":")[1]
get_date = after_colon.split(",")[0]
format = ' %d %B %Y'  # layout used to parse the scraped text (note the leading space)
datetime_obj = datetime.strptime(get_date, format).date()
print(type(datetime_obj))

# Alternative: take today's date from the datetime module instead of the page.
# datetime_obj = datetime.today().date()

<class 'datetime.date'>


#### ============= >> Getting the Headings of the COVID-19 Data Table << ==============================

In [14]:
# Column titles are taken from the <th> cells of the "row1" row in the first <thead>.
get_head = PySoup.find_all('thead')[0].find('tr', attrs={'class': "row1"})
headings = []
for th in get_head.find_all("th"):
    headings.append(th.get_text().strip())

# Add names for the three "change since yesterday" columns at fixed positions.
# The ascending order matters: each insert shifts the entries after it.
# NOTE(review): presumably the scraped header row has no titles for these
# columns -- confirm against the page markup.
for position, title in (
    (3, 'Change Active Case Since Yesterday'),
    (5, 'Change Cured Case Since Yesterday'),
    (7, 'Change Death Case Since Yesterday'),
):
    headings.insert(position, title)

#### ============= >> Getting Table from the tbody tag  on which we have our COVID-19 Data State-wise << ====================

In [15]:
get_tbody = PySoup.find_all('tbody')[0]      # first <tbody> found in the page
# Collected rows: one dict per table row, keyed by the scraped headings.
mytabledata = []
list_of_tr_rows = get_tbody.find_all("tr")


def _cell_value(td):
    """Return the stripped text of a <td>; negate it as an int when marked "down".

    A cell counts as "down" when it has a <span> whose first class is "down".
    Every other cell is returned as a plain string.
    """
    marker = td.span
    if marker is not None and marker.get('class', [0])[0] == "down":
        return -int(td.get_text().strip())
    return td.get_text().strip()


for tr_row in list_of_tr_rows:
    # Only real tags are processed; anything else is skipped.
    if not isinstance(tr_row, Tag):
        continue
    statedata_singlerow = [_cell_value(td) for td in tr_row.find_all("td")]
    # zip() stops at the shorter sequence, so short rows yield fewer keys.
    data = dict(zip(headings, statedata_singlerow))
    if 'Active Cases*' not in data:
        continue
    mytabledata.append(data)

### Step 2 == > Data Transformation / Munging / Wrangling
#### ========== >> Converting "mytabledata" into our DataFrame & doing Data Cleaning << ======================================

In [16]:
# Build the frame from the scraped row dicts, with columns ordered as in `headings`.
state_data = pd.DataFrame(mytabledata, columns=headings)
new_cols = ["Sr.No","States/UT","Active Cases","Active Cases Since Yesterday",
            "Recovered","Recovered Cases Since Yesterday","Deceased","Deceased Cases Since Yesterday"]
state_data.columns = new_cols
state_data = state_data.set_index('Sr.No')

# Move the 'Total#' row's figures into their proper columns. Each source cell is
# read before it is overwritten, so the order of these three lines matters.
# A single .loc[row, column] call is used on purpose: the chained form
# `.loc[row][column] = value` assigns to an intermediate Series and only reaches
# the frame when pandas happens to return a view (never under Copy-on-Write).
state_data.loc['Total#', 'Deceased'] = state_data.loc['Total#', 'Recovered']
state_data.loc['Total#', 'Recovered'] = state_data.loc['Total#', 'Active Cases']
state_data.loc['Total#', 'Active Cases'] = state_data.loc['Total#', 'States/UT']

print(datetime_obj.strftime("%d-%m-%Y"))
print(type(datetime_obj.strftime("%d-%m-%Y")))
# Store the scrape date as an ISO string (YYYY-MM-DD) on every row.
# Alternatives: datetime.now().date(), or pd.to_datetime(state_data['Date']) afterwards.
state_data['Date'] = str(datetime_obj)
print(type(state_data['Date']))

# Drop the trailing '#' from the total row's index label.
state_data = state_data.rename(index={'Total#': 'Total'})

12-08-2020
<class 'str'>
<class 'pandas.core.series.Series'>


### Step 3  ==> Data cleaning

In [17]:
# Empty strings and missing values become 0 so the count columns can be cast to int.
state_data = state_data.replace(['', np.nan], 0)

# Every column except the state name and the date holds counts.
cols_to_include = [name for name in state_data.columns if name not in ('States/UT', 'Date')]
state_data[cols_to_include] = state_data[cols_to_include].astype('int')
state_data['States/UT'] = state_data['States/UT'].astype('str')

death_delta = 'Deceased Cases Since Yesterday'
state_data[death_delta] = state_data[death_delta].abs()

# Write the column total into the cell at row position 35, column position 6.
# NOTE(review): position 6 is expected to be 'Deceased Cases Since Yesterday' and
# position 35 is presumably the Total row -- both are hard-coded; confirm against
# the current table size.
state_data.iat[35, 6] = state_data[death_delta].sum()

state_data

Unnamed: 0_level_0,States/UT,Active Cases,Active Cases Since Yesterday,Recovered,Recovered Cases Since Yesterday,Deceased,Deceased Cases Since Yesterday,Date
Sr.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Andaman and Nicobar Islands,994,98,749,40,21,1,2020-08-12
2,Andhra Pradesh,87597,-176,154749,9113,2203,87,2020-08-12
3,Arunachal Pradesh,690,54,1634,42,3,0,2020-08-12
4,Assam,19178,1178,45073,1487,155,4,2020-08-12
5,Bihar,29291,1226,56709,2621,413,16,2020-08-12
6,Chandigarh,629,63,1015,11,26,1,2020-08-12
7,Chhattisgarh,3586,250,9239,226,104,5,2020-08-12
8,Dadra and Nagar Haveli and Daman and Diu,442,-2,1209,42,2,0,2020-08-12
9,Delhi,10868,522,132384,727,4139,8,2020-08-12
10,Goa,2878,137,6480,272,86,6,2020-08-12


### Step 4 ==> Data Exporting  
#### Saving Dataframe (state_data) into Excel Sheet or CSV file

In [18]:
from datetime import datetime
def custom_strftime(format,date_obj):
    """Format ``date_obj`` with ``format``, expanding ``{S}`` to the ordinal day.

    Parameters:
        format: a strftime pattern that may contain the placeholder ``{S}``.
        date_obj: an object with a ``day`` attribute and a ``strftime`` method
            (e.g. ``datetime.date`` or ``datetime.datetime``).

    Returns:
        The strftime result with every ``{S}`` replaced by the day of the month
        followed by its English suffix, e.g. ``1st``, ``2nd``, ``3rd``, ``12th``.
    """
    day_number = int(date_obj.day)
    if 11 <= day_number <= 13:
        # 11, 12 and 13 take 'th' despite ending in 1, 2 and 3.
        suffix = 'th'
    else:
        last_digit = day_number % 10
        if last_digit == 1:
            suffix = 'st'
        elif last_digit == 2:
            suffix = 'nd'
        elif last_digit == 3:
            suffix = 'rd'
        else:
            suffix = 'th'
    rendered = date_obj.strftime(format)
    ordinal_day = str(date_obj.day) + suffix
    return rendered.replace('{S}', ordinal_day)
# Alternative: name the files after the current date instead of the scraped one.
# datetime_obj  = datetime.now()

# File stem such as "COVID19_12thAug": ordinal day followed by the short month name.
file_name = "COVID19_{}".format(custom_strftime('{S}%b',datetime_obj))

# The same frame is written twice: as an Excel workbook and as a CSV file.
excel_path = r'c:/Users/mohan/OneDrive/Desktop/projects/{}.xlsx'.format(file_name)
csv_path = r'c:/Users/mohan/OneDrive/Desktop/projects/{}.csv'.format(file_name)
state_data.to_excel(excel_path, sheet_name = 'COVID19 State Data')
state_data.to_csv(csv_path)
state_data

Unnamed: 0_level_0,States/UT,Active Cases,Active Cases Since Yesterday,Recovered,Recovered Cases Since Yesterday,Deceased,Deceased Cases Since Yesterday,Date
Sr.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Andaman and Nicobar Islands,994,98,749,40,21,1,2020-08-12
2,Andhra Pradesh,87597,-176,154749,9113,2203,87,2020-08-12
3,Arunachal Pradesh,690,54,1634,42,3,0,2020-08-12
4,Assam,19178,1178,45073,1487,155,4,2020-08-12
5,Bihar,29291,1226,56709,2621,413,16,2020-08-12
6,Chandigarh,629,63,1015,11,26,1,2020-08-12
7,Chhattisgarh,3586,250,9239,226,104,5,2020-08-12
8,Dadra and Nagar Haveli and Daman and Diu,442,-2,1209,42,2,0,2020-08-12
9,Delhi,10868,522,132384,727,4139,8,2020-08-12
10,Goa,2878,137,6480,272,86,6,2020-08-12
