In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from glob import glob
from functools import reduce
from datetime import datetime
import os
import geopandas as gpd
import json
import numpy as np
import plotly.express as px


In [2]:
#Load the dataset
df = pd.read_csv("districts.csv")

In [3]:
#Dropping irrevelant columns in df
df = df.drop(['Other','Tested'], axis=1)
df

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33,11,0
1,2020-04-26,Andhra Pradesh,Anantapur,53,14,4
2,2020-04-26,Andhra Pradesh,Chittoor,73,13,0
3,2020-04-26,Andhra Pradesh,East Godavari,39,12,0
4,2020-04-26,Andhra Pradesh,Guntur,214,29,8
...,...,...,...,...,...,...
238283,2021-05-02,West Bengal,Purba Bardhaman,20231,16668,111
238284,2021-05-02,West Bengal,Purba Medinipur,28445,24202,299
238285,2021-05-02,West Bengal,Purulia,14067,10893,65
238286,2021-05-02,West Bengal,South 24 Parganas,54352,45764,800


# Function to generate the dictionary holding datframes for each district

In [4]:
# district_generator is the function to create and update the dataframes for each district in the dictionary, dictionary_dataframe
def district_generator(district, state, dictionary):
    
    # Create a dataframe with required attributes for the district
    district_dataframe = pd.DataFrame(columns = ['Date', 'State', 'District', 'Confirmed', 'Recovered', 'Deceased'])
    district_dataframe['Date'] = pd.date_range(start='01/01/2020', end='31/12/2020')    # adding dates for each day of the year
    district_dataframe['District'] = district                                           # update district as the input district
    district_dataframe['State'] = state                                                 # update state as the input state


    extracted_state = df.loc[df["State"] == state]                                      # extract only records with given state
    extracted_district = extracted_state.loc[extracted_state["District"] == district]   # extract only records with given district
    
    
    
    # change the datatype of the Date column to datetime to make comparisions efficient
    try:
        extracted_district['Date'] = pd.to_datetime(extracted_district['Date'], format = '%Y-%m-%d')
    except:
        extracted_district['Date'] = pd.to_datetime(extracted_district['Date'], format = '%Y/%m%d')

    extracted_district.reset_index()
    
    df1 = pd.concat([district_dataframe, extracted_district]).drop_duplicates(subset = ['Date'] , keep='last')
    
    
    df1 = df1.sort_values(by=["Date"])                                                  # sorting by the dates
    
    
    # Every district will have dates where records are missing, such cells are filled as DNA that is "Data Not Available"
    df1[["Confirmed", "Recovered", "Deceased"]] = df1[["Confirmed", "Recovered", "Deceased"]].fillna('DNA')
        
    #df1[["Confirmed", "Recovered", "Deceased"]] = df1[["Confirmed", "Recovered", "Deceased"]].diff()
    #df1 = df1.reset_index(drop=True)
    #df1.at[0,['Confirmed','Recovered', 'Deceased']] = 0
    #df1 = df1.astype({"Confirmed": int, "Recovered": int, "Deceased": int})
    dictionary[district] = df1
    
    

    

# Five states will be considered for the Data Pre Processing

* Karnataka
* Maharashtra
* Kerala
* Tamil Nadu
* Andhra Pradesh

## State : Karnataka 

In [5]:
# All the districts in Karnataka are included in the list karnataka_district_list  

karnataka_district_list = ['Bagalkote', 'Ballari', 'Belagavi', 'Bengaluru Rural', 'Bengaluru Urban', 'Bidar', 'Chamarajanagara', 'Chikkaballapura', 'Chikkamagaluru', 'Chitradurga', 'Dakshina Kannada', 'Davanagere', 'Dharwad', 'Gadag', 'Hassan', 'Haveri', 'Kalaburagi', 'Kodagu', 'Kolar', 'Koppal', 'Mandya', 'Mysuru', 'Raichur', 'Ramanagara', 'Shivamogga', 'Tumakuru', 'Udupi', 'Uttara Kannada', 'Vijayapura', 'Yadgir']

print("\n Total number of districts in Karnataka = ", len(karnataka_district_list))
print("\n The districts of Karnataka in alphabetical order: ",karnataka_district_list)

# A dictionary named karnataka_districts is created to hold the dataframe for each district in Karnataka
karnataka_districts = dict()


# Looping through each district in karnataka_district_list so as to update the dictionary
for d in karnataka_district_list:
    district_generator(d, 'Karnataka', karnataka_districts)



 Total number of districts in Karnataka =  30

 The districts of Karnataka in alphabetical order:  ['Bagalkote', 'Ballari', 'Belagavi', 'Bengaluru Rural', 'Bengaluru Urban', 'Bidar', 'Chamarajanagara', 'Chikkaballapura', 'Chikkamagaluru', 'Chitradurga', 'Dakshina Kannada', 'Davanagere', 'Dharwad', 'Gadag', 'Hassan', 'Haveri', 'Kalaburagi', 'Kodagu', 'Kolar', 'Koppal', 'Mandya', 'Mysuru', 'Raichur', 'Ramanagara', 'Shivamogga', 'Tumakuru', 'Udupi', 'Uttara Kannada', 'Vijayapura', 'Yadgir']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_district['Date'] = pd.to_datetime(extracted_district['Date'], format = '%Y-%m-%d')


In [6]:
# Checking for the district of Kolar in Karnataka

karnataka_districts['Kolar'][250:260]

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased
82941,2020-09-07,Karnataka,Kolar,4055,3281,66
83595,2020-09-08,Karnataka,Kolar,4085,3350,68
84249,2020-09-09,Karnataka,Kolar,4182,3403,69
84903,2020-09-10,Karnataka,Kolar,4286,3449,71
85557,2020-09-11,Karnataka,Kolar,4356,3510,71
86211,2020-09-12,Karnataka,Kolar,4409,3580,73
86865,2020-09-13,Karnataka,Kolar,4482,3632,75
87519,2020-09-14,Karnataka,Kolar,4539,3710,75
88173,2020-09-15,Karnataka,Kolar,4608,3795,77
88827,2020-09-16,Karnataka,Kolar,4709,3849,78


## State : Maharashtra

In [7]:
# All the districts in Maharashtra are included in the list maharashtra_district_list  

maharashtra_district_list = ['Ahmednagar', 'Akola', 'Amravati', 'Aurangabad', 'Beed', 'Bhandara', 'Buldhana', 'Chandrapur', 'Dhule', 'Gadchiroli', 'Gondia', 'Hingoli', 'Jalgaon', 'Jalna', 'Kolhapur', 'Latur', 'Mumbai', 'Nagpur', 'Nanded', 'Nandurbar', 'Nashik', 'Osmanabad', 'Palghar', 'Parbhani', 'Pune', 'Raigad', 'Ratnagiri', 'Sangli', 'Satara', 'Sindhudurg', 'Solapur', 'Thane', 'Wardha', 'Washim', 'Yavatmal']

print("\n Total number of districts in Maharashtra = ", len(maharashtra_district_list))
print("\n The districts of Maharashtra in alphabetical order: ",maharashtra_district_list)

# A dictionary named maharashtra_districts is created to hold the dataframe for each district in Maharashtra
maharashtra_districts = dict()


# Looping through each district in maharashtra_district_list so as to update the dictionary
for d in maharashtra_district_list:
    district_generator(d, 'Maharashtra', maharashtra_districts)



 Total number of districts in Maharashtra =  35

 The districts of Maharashtra in alphabetical order:  ['Ahmednagar', 'Akola', 'Amravati', 'Aurangabad', 'Beed', 'Bhandara', 'Buldhana', 'Chandrapur', 'Dhule', 'Gadchiroli', 'Gondia', 'Hingoli', 'Jalgaon', 'Jalna', 'Kolhapur', 'Latur', 'Mumbai', 'Nagpur', 'Nanded', 'Nandurbar', 'Nashik', 'Osmanabad', 'Palghar', 'Parbhani', 'Pune', 'Raigad', 'Ratnagiri', 'Sangli', 'Satara', 'Sindhudurg', 'Solapur', 'Thane', 'Wardha', 'Washim', 'Yavatmal']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_district['Date'] = pd.to_datetime(extracted_district['Date'], format = '%Y-%m-%d')


In [8]:
# Checking for the district of Mumbai in Maharashtra

#maharashtra_districts['Mumbai'].astype({'Confirmed' : int, "Recovered": int, 'Deceased' : int})


maharashtra_districts['Sindhudurg'][165:172]

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased
27430,2020-06-14,Maharashtra,Sindhudurg,150,78,0
28084,2020-06-15,Maharashtra,Sindhudurg,156,83,0
28736,2020-06-16,Maharashtra,Sindhudurg,156,91,3
29387,2020-06-17,Maharashtra,Sindhudurg,158,91,3
30038,2020-06-18,Maharashtra,Sindhudurg,161,99,3
30689,2020-06-19,Maharashtra,Sindhudurg,161,114,3
31341,2020-06-20,Maharashtra,Sindhudurg,163,125,4


## State : Kerala

In [9]:
# All the districts in Kerala are included in the list kerala_district_list

kerala_district_list = ['Alappuzha', 'Ernakulam', 'Idukki', 'Kannur', 'Kasaragod', 'Kollam', 'Kottayam', 'Kozhikode', 'Malappuram', 'Palakkad', 'Pathanamthitta', 'Thiruvananthapuram', 'Thrissur', 'Wayanad']

print("\n Total number of districts in Kerala = ", len(kerala_district_list))
print("\n The districts of Kerala in alphabetical order: ",kerala_district_list)

# A dictionary named kerala_districts is created to hold the dataframe for each district in Kerala
kerala_districts = dict()


# Looping through each district in kerala_district_list so as to update the dictionary
for d in kerala_district_list:
    district_generator(d, 'Kerala', kerala_districts)



 Total number of districts in Kerala =  14

 The districts of Kerala in alphabetical order:  ['Alappuzha', 'Ernakulam', 'Idukki', 'Kannur', 'Kasaragod', 'Kollam', 'Kottayam', 'Kozhikode', 'Malappuram', 'Palakkad', 'Pathanamthitta', 'Thiruvananthapuram', 'Thrissur', 'Wayanad']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_district['Date'] = pd.to_datetime(extracted_district['Date'], format = '%Y-%m-%d')


In [10]:
# Checking for the district of Thrissur in Kerala

kerala_districts['Thrissur'][280:290] 

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased
102593,2020-10-07,Kerala,Thrissur,18759,10229,59
103247,2020-10-08,Kerala,Thrissur,19144,10689,59
103901,2020-10-09,Kerala,Thrissur,19899,11549,60
104555,2020-10-10,Kerala,Thrissur,21107,12059,60
105209,2020-10-11,Kerala,Thrissur,22067,12639,63
105863,2020-10-12,Kerala,Thrissur,22764,13729,63
106517,2020-10-13,Kerala,Thrissur,23774,14379,63
107171,2020-10-14,Kerala,Thrissur,24355,15010,63
107825,2020-10-15,Kerala,Thrissur,25222,15560,64
108479,2020-10-16,Kerala,Thrissur,26031,16391,69


## State : Tamil Nadu

In [11]:
# All the districts in Tamil Nadu are included in the list tamilnadu_district_list

tamilnadu_district_list = ['Ariyalur', 'Chengalpattu', 'Chennai', 'Coimbatore', 'Cuddalore', 'Dharmapuri', 'Dindigul', 'Erode', 'Kallakurichi', 'Kancheepuram', 'Kanyakumari', 'Karur', 'Krishnagiri', 'Madurai', 'Nagapattinam', 'Namakkal', 'Nilgiris', 'Perambalur', 'Pudukkottai', 'Ramanathapuram', 'Ranipet', 'Salem', 'Sivaganga', 'Tenkasi', 'Thanjavur', 'Theni', 'Thiruvallur', 'Thiruvarur', 'Thoothukkudi', 'Tiruchirappalli', 'Tirunelveli', 'Tirupathur', 'Tiruppur', 'Tiruvannamalai', 'Vellore', 'Viluppuram', 'Virudhunagar']

print("\n Total number of districts in Tamil Nadu = ", len(tamilnadu_district_list))
print("\n The districts of Tamil Nadu in alphabetical order: ",tamilnadu_district_list)

# A dictionary named tamilnadu_districts is created to hold the dataframe for each district in Tamil Nadu
tamilnadu_districts = dict()


# Looping through each district in tamilnadu_district_list so as to update the dictionary
for d in tamilnadu_district_list:
    district_generator(d, 'Tamil Nadu', tamilnadu_districts)


 Total number of districts in Tamil Nadu =  37

 The districts of Tamil Nadu in alphabetical order:  ['Ariyalur', 'Chengalpattu', 'Chennai', 'Coimbatore', 'Cuddalore', 'Dharmapuri', 'Dindigul', 'Erode', 'Kallakurichi', 'Kancheepuram', 'Kanyakumari', 'Karur', 'Krishnagiri', 'Madurai', 'Nagapattinam', 'Namakkal', 'Nilgiris', 'Perambalur', 'Pudukkottai', 'Ramanathapuram', 'Ranipet', 'Salem', 'Sivaganga', 'Tenkasi', 'Thanjavur', 'Theni', 'Thiruvallur', 'Thiruvarur', 'Thoothukkudi', 'Tiruchirappalli', 'Tirunelveli', 'Tirupathur', 'Tiruppur', 'Tiruvannamalai', 'Vellore', 'Viluppuram', 'Virudhunagar']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_district['Date'] = pd.to_datetime(extracted_district['Date'], format = '%Y-%m-%d')


In [12]:
# Checking for the district of Coimbatore in Tamil Nadu

tamilnadu_districts['Namakkal'][110:140]

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased
110,2020-04-20,Tamil Nadu,Namakkal,DNA,DNA,DNA
111,2020-04-21,Tamil Nadu,Namakkal,DNA,DNA,DNA
112,2020-04-22,Tamil Nadu,Namakkal,DNA,DNA,DNA
113,2020-04-23,Tamil Nadu,Namakkal,DNA,DNA,DNA
114,2020-04-24,Tamil Nadu,Namakkal,DNA,DNA,DNA
115,2020-04-25,Tamil Nadu,Namakkal,DNA,DNA,DNA
305,2020-04-26,Tamil Nadu,Namakkal,59,45,0
724,2020-04-27,Tamil Nadu,Namakkal,59,45,0
1153,2020-04-28,Tamil Nadu,Namakkal,61,45,0
1587,2020-04-29,Tamil Nadu,Namakkal,61,49,0


## State : Andhra Pradesh

In [13]:
# All the districts in Andhra Pradesh are included in the list andhrapradesh_district_list

andhrapradesh_district_list = ['Anantapur', 'Chittoor', 'East Godavari', 'Guntur', 'Krishna', 'Kurnool', 'Prakasam', 'S.P.S. Nellore', 'Srikakulam', 'Visakhapatnam', 'Vizianagaram', 'West Godavari', 'Y.S.R. Kadapa']

print("\n Total number of districts in Andhra Pradesh = ", len(andhrapradesh_district_list))
print("\n The districts of Andhra Pradesh in alphabetical order: ",andhrapradesh_district_list)

# A dictionary named andhrapradesh_districts is created to hold the dataframe for each district in Andhra Pradesh
andhrapradesh_districts = dict()


# Looping through each district in andhrapradesh_district_list so as to update the dictionary
for d in andhrapradesh_district_list:
    district_generator(d, 'Andhra Pradesh', andhrapradesh_districts)



 Total number of districts in Andhra Pradesh =  13

 The districts of Andhra Pradesh in alphabetical order:  ['Anantapur', 'Chittoor', 'East Godavari', 'Guntur', 'Krishna', 'Kurnool', 'Prakasam', 'S.P.S. Nellore', 'Srikakulam', 'Visakhapatnam', 'Vizianagaram', 'West Godavari', 'Y.S.R. Kadapa']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_district['Date'] = pd.to_datetime(extracted_district['Date'], format = '%Y-%m-%d')


In [14]:
# Checking for the district of Guntur in Andhra Pradesh

andhrapradesh_districts['Guntur'][280:290]

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased
102327,2020-10-07,Andhra Pradesh,Guntur,58106,52377,551
102981,2020-10-08,Andhra Pradesh,Guntur,58599,52983,554
103635,2020-10-09,Andhra Pradesh,Guntur,58941,53372,555
104289,2020-10-10,Andhra Pradesh,Guntur,59411,54029,557
104943,2020-10-11,Andhra Pradesh,Guntur,59842,54349,560
105597,2020-10-12,Andhra Pradesh,Guntur,60221,54902,564
106251,2020-10-13,Andhra Pradesh,Guntur,60612,55473,567
106905,2020-10-14,Andhra Pradesh,Guntur,60957,55863,570
107559,2020-10-15,Andhra Pradesh,Guntur,61347,56362,573
108213,2020-10-16,Andhra Pradesh,Guntur,61593,56608,576


In [15]:
Bengaluru = df.loc[df['District'] == 'Bengaluru Urban']
Bengaluru.to_csv('Bengaluru.csv',index = False)

In [16]:
Chennai = df.loc[df['District'] == 'Chennai']
Chennai.to_csv('Chennai.csv',index = False)

In [17]:
Thiruvananthapuram = df.loc[df['District'] == 'Thiruvananthapuram']
Thiruvananthapuram.to_csv('Thiruvananthapuram.csv',index = False)

In [18]:
Mumbai = df.loc[df['District']=='Mumbai']
Mumbai.to_csv('Mumbai.csv',index = False)

In [19]:
Delhi = df.loc[df['District'] == 'Delhi']
Delhi.to_csv('Delhi.csv',index = False)

In [20]:
Delhi

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased
43,2020-04-26,Delhi,Delhi,2918,877,54
454,2020-04-27,Delhi,Delhi,3108,877,54
879,2020-04-28,Delhi,Delhi,3314,1078,54
1309,2020-04-29,Delhi,Delhi,3439,1092,56
1743,2020-04-30,Delhi,Delhi,3515,1094,59
...,...,...,...,...,...,...
235126,2021-04-28,Delhi,Delhi,1098051,982922,15377
235780,2021-04-29,Delhi,Delhi,1122286,1008537,15772
236434,2021-04-30,Delhi,Delhi,1149333,1033825,16147
237088,2021-05-01,Delhi,Delhi,1174552,1061246,16559


In [34]:
delhi_districts['Delhi'].to_csv('Delhi.csv',index = False)