## Data Source

https://api.covid19india.org/

## Import libraries

In [4]:
# to get web contents
import requests
# to parse json contents
import json
# to parse csv files
import csv

# for numerical operations
import numpy as np
# to store and analysis data in dataframes
import pandas as pd

## Get data

In [5]:
# df_1 - Till Apr 19
# ==================

# get response from the web page
response = requests.get('https://api.covid19india.org/raw_data1.json')
# get contents from the response
content = response.content
# parse the json file
parsed = json.loads(content)
# keys
parsed.keys()

dict_keys(['raw_data'])

In [6]:
# save to df
df_1 = pd.DataFrame(parsed['raw_data'])

# shape of the dataframe
print(df_1.shape)

# # list of columns
print(df_1.columns)

# # first few rows
# df_1.head(2)

(17306, 20)
Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')


In [7]:
# df_2 - Till Apr 26
# ==================

# get response from the web page
response = requests.get('https://api.covid19india.org/raw_data2.json')
# get contents from the response
content = response.content
# parse the json file
parsed = json.loads(content)
# keys
parsed.keys()

dict_keys(['raw_data'])

In [8]:
# save to df
df_2 = pd.DataFrame(parsed['raw_data'])

# shape of the dataframe
print(df_2.shape)

# # list of columns
print(df_2.columns)

# # first few rows
# df_2.head(2)

(10585, 20)
Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')


In [9]:
# df_3 - Live
# ===========

# get response from the web page
response = requests.get('https://api.covid19india.org/raw_data3.json')
# get contents from the response
content = response.content
# parse the json file
parsed = json.loads(content)
# keys
parsed.keys()

dict_keys(['raw_data'])

In [10]:
# save to df
df_3 = pd.DataFrame(parsed['raw_data'])

# shape of the dataframe
print(df_3.shape)

# # list of columns
print(df_3.columns)

# # first few rows
# df_3.head(2)

(5065, 20)
Index(['agebracket', 'contractedfromwhichpatientsuspected', 'currentstatus',
       'dateannounced', 'detectedcity', 'detecteddistrict', 'detectedstate',
       'entryid', 'gender', 'nationality', 'notes', 'numcases',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')


In [18]:
# np.setdiff1d(df_2.columns, df_1.columns)

array([], dtype=object)

In [16]:
# np.setdiff1d(df_1.columns, df_3.columns)

array(['backupnotes', 'estimatedonsetdate'], dtype=object)

In [20]:
np.setdiff1d(df_3.columns, df_1.columns)

array(['entryid', 'numcases'], dtype=object)

In [22]:
df_3.numcases

0       38
1        2
2        9
3        1
4        1
        ..
5060      
5061      
5062      
5063      
5064      
Name: numcases, Length: 5065, dtype: object

In [11]:
# full data
# =========

# drop unwanted rows
df_1 = df_1.drop('_dnp34', axis = 1)

# rename columns
# df_3 = df_3.rename({'entryid' : 'patientnumber'})

# concatenate data
df = pd.concat([df_1, df_2, df_3])

# shape of the data
df.shape

KeyError: "['_dnp34'] not found in axis"

In [None]:
# list of columns
df.columns

In [None]:
# first 3 rows of the dataframe
df.head(3)

In [None]:
# creating patient id column from patient number
# ===============================================

df['p_id'] = df['patientnumber'].apply(lambda x : 'P'+str(x))
df.columns

## Rearrange and rename columns

In [None]:
# order of columns
cols = ['patientnumber', 'p_id', 'statepatientnumber', 
        'dateannounced', 'agebracket', 'gender', 
        'detectedcity', 'detecteddistrict', 'detectedstate', 'statecode', 'nationality',
        'typeoftransmission', 'contractedfromwhichpatientsuspected',
        'statuschangedate', 'currentstatus', 'estimatedonsetdate',
        'source1', 'source2', 'source3', 'notes', 'backupnotes', 'entryid', 'numcases']

# rearrange columns
df = df[cols]

# rename columns
df.columns = ['patient_number', 'p_id', 'state_patient_number', 
              'date_announced', 'age_bracket', 'gender', 
              'detected_city', 'detected_district', 'detected_state', 'state_code', 'nationality',
              'type_of_transmission', 'contracted_from_which_patient_suspected',
              'status_change_date', 'current_status', 'estimated_onset_date',
              'source1', 'source2', 'source3', 'notes', 'backup_notes', 'entryid', 'numcases']

# dataframe shape
df.shape

In [None]:
# first 3 rows of the dataframe
df.head(3)

## Missing values

In [None]:
# no. of empty values in each column
# ==================================

print(df.shape, '\n')

for i in df.columns:
    print(i, '\t', df[df[i]==''].shape[0])

In [None]:
# no. of non-empty values in each column
# ===================================

print(df.shape, '\n')

for i in df.columns:
    print(i, '\t', df[df[i]!=''].shape[0])

In [None]:
# replacing empty strings with np.nan
# ==================================-

print(df.shape)

df = df.replace(r'', np.nan, regex=True)
df.isna().sum()

In [None]:
# droping empty rows (row with just row number but without patient entry
# ======================================================================

print(df.shape)

# df.dropna(subset=['detected_state'], inplace=True)

print(df.shape)
df.isna().sum()

## Save data

In [None]:
# save to csv`
df.to_csv('patients_data.csv', index=False)

## State tested data

In [None]:
response = requests.get('https://api.covid19india.org/state_test_data.json')
content = response.content
parsed = json.loads(content)
parsed.keys()

In [None]:
th = pd.DataFrame(parsed['states_tested_data'])
th.head(3)

## Zones

In [None]:
response = requests.get('https://api.covid19india.org/zones.json')
content = response.content
parsed = json.loads(content)
parsed.keys()

In [None]:
th = pd.DataFrame(parsed['zones'])
th.head(3)

## State wise Daily

In [None]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

# df = pd.DataFrame(parsed['states_daily'])

In [None]:
# df = df.melt(id_vars = ['date', 'status'], 
#              value_vars = ['an', 'ap', 'ar', 'as', 'br', 'ch', 'ct', 'dd', 
#                     'dl', 'dn', 'ga', 'gj', 'hp', 'hr', 'jh', 'jk', 
#                     'ka', 'kl', 'la', 'ld', 'mh', 'ml', 'mn', 'mp',
#                     'mz', 'nl', 'or', 'pb', 'py', 'rj', 'sk', 'tg', 
#                     'tn', 'tr', 'tt', 'up', 'ut', 'wb'], 
#              var_name='state', value_name='count')

# df = df.set_index(['date', 'state'])

# df = df.pivot(columns = 'status').reset_index()

# df.columns = df.columns.droplevel(0)
# df.columns.name = ''

# df.columns = ['Date', 'State', 'Confirmed', 'Deceased', 'Recovered']
# df.head()

In [None]:
# response = requests.get('https://api.covid19india.org/csv/')
# parsed = response.content.decode('utf-8')
# parsed

# df = pd.DataFrame(parsed, sep=',')
# df.head()

In [None]:
# pd.DataFrame('http://api.covid19india.org/states_daily_csv/confirmed.csv')

In [None]:
# pd.read_csv('https://api.covid19india.org/csv/')

## States Daily changes

In [None]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

In [None]:
# pd.DataFrame(parsed['states_daily'])

## National time series, statewise stats and test counts

In [None]:
# response = requests.get('https://api.covid19india.org/data.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [None]:
# day_wise = pd.DataFrame(parsed['cases_time_series'])
# day_wise.head()

In [None]:
# state_wise = pd.DataFrame(parsed['statewise'])
# state_wise.head()

In [None]:
# tested = pd.DataFrame(parsed['tested'])
# tested.head()

## District wise

In [None]:
# response = requests.get('https://api.covid19india.org/state_district_wise.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [None]:
# pd.DataFrame(parsed['Goa'])

In [None]:
# parsed['Goa'].keys()

In [None]:
# pd.DataFrame(parsed['Goa']['districtData'])

## District wise v2

In [None]:
# response = requests.get('https://api.covid19india.org/v2/state_district_wise.json')
# content = response.content
# parsed = json.loads(content)
# len(parsed)

In [None]:
# pd.DataFrame(parsed[1]['districtData'])

## Travel history (no more updated)

In [None]:
# response = requests.get('https://api.covid19india.org/travel_history.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [None]:
# th = pd.DataFrame(parsed['travel_history'])
# th.head()