In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw observations from the CSV in the working directory; every later
# cell reads and reassigns this `df`.
df = pd.read_csv(filepath_or_buffer="covid_data.csv")

In [3]:
# Clean the raw frame loaded above: normalise column names, dtypes and labels,
# then drop rows that are duplicated, non-country, empty or known-bad.
# NOTE: this cell reassigns `df` and expects the 8 raw columns, so re-run the
# load cell before re-running this one.

# Rename columns for convenience
df.columns = ['serial', 'obsv_date', 'province_state', 'country_region', 'last_update', 'confirmed', 'deaths', 'recovered']

# Set serial number as index
df = df.set_index('serial')

# Convert the three count columns from float to integer.
# `astype` on named columns is used because an `.iloc[:, -3:] = ...` assignment
# sets values in place on recent pandas and can leave the columns as float.
df = df.astype({'confirmed': int, 'deaths': int, 'recovered': int})

# Strip whitespace
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Rename values
df = df.replace({'country_region': {"Bahamas, The": "Bahamas", 
                                    "The Bahamas": "Bahamas",
                                    "(\'St. Martin\',)": "St. Martin",
                                    "UK": "United Kingdom",
                                    "US": "United States", 
                                    " Azerbaijan": "Azerbaijan",
                                    "Gambia, The": "Gambia",
                                    "occupied Palestinian territory": "Occupied Palestinian Territory"},
                 'province_state': {"Grand Princess Cruise Ship": "Grand Princess"}})

# Remove duplicates
df = df.drop_duplicates()

# Remove non country/regions
df = df[df['country_region'] != 'MS Zaandam']

# Convert observation_date and last_update to datetime format.
# `.assign` builds a new frame, which avoids writing into the filtered slice
# above (the source of SettingWithCopyWarning).
df = df.assign(
    obsv_date=pd.to_datetime(df['obsv_date']),
    last_update=pd.to_datetime(df['last_update']),
)

# Remove rows with all zeros
df = df[~((df['confirmed'] == 0) & (df['deaths'] == 0) & (df['recovered'] == 0))]

# Check for outliers: rows with a negative count in any of the three columns.
# Bound to a name so the result can be inspected; a bare expression in the
# middle of a cell is evaluated and thrown away without being displayed.
negative_rows = df[(df['confirmed'] < 0) | (df['deaths'] < 0) | (df['recovered'] < 0)]

# Drop Colombia:Unknown data
df = df[~((df['country_region'] == 'Colombia') & (df['province_state'] == "Unknown"))]

In [16]:
# Build a per-province daily series for India with a `New_Cases` column
# (day-over-day change in the cumulative `confirmed` count).
df = df[df['country_region'] == 'India'].sort_values(['province_state', 'obsv_date'])

# Previous observation's cumulative count *within the same province*.
# A plain `shift(1)` over the whole frame would make the first row of each
# province subtract the last count of the preceding province.
# `dropna=False` keeps the rows whose province_state is NaN as their own group
# instead of dropping them from the grouping.
prev_confirmed = (
    df.groupby('province_state', dropna=False)['confirmed']
      .shift(1)
      .fillna(0)
)
new_cases = df['confirmed'] - prev_confirmed

# Rows dated 2020-06-10 are forced to zero new cases.
# NOTE(review): this looks like the first date with per-province rows, so the
# opening cumulative count is not a one-day increase -- confirm against the data.
new_cases = new_cases.mask(df['obsv_date'] == '2020-06-10', 0)

# Select the output columns last and cast by name, so no value is written into
# a column-subset copy and both count columns end up as integers.
df = (
    df[['province_state', 'obsv_date', 'confirmed']]
    .assign(New_Cases=new_cases)
    .astype({'confirmed': int, 'New_Cases': int})
)
df

Unnamed: 0_level_0,province_state,obsv_date,confirmed,New_Cases
serial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39261,Andaman and Nicobar Islands,2020-06-10,33,0
39989,Andaman and Nicobar Islands,2020-06-11,38,5
40722,Andaman and Nicobar Islands,2020-06-12,38,0
41455,Andaman and Nicobar Islands,2020-06-13,38,0
42188,Andaman and Nicobar Islands,2020-06-14,38,0
...,...,...,...,...
35702,,2020-06-05,236184,9471
36390,,2020-06-06,246622,10438
37078,,2020-06-07,257486,10864
37766,,2020-06-08,265928,8442


In [11]:
# Distinct province_state labels present in the current frame.
# NOTE(review): the saved output lists both 'Dadar Nagar Haveli' and
# 'Dadra and Nagar Haveli and Daman and Diu', plus 'Unknown' and NaN -- these
# may need normalising before any per-province aggregation.
pd.unique(df['province_state'])

array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadar Nagar Haveli',
       'Dadra and Nagar Haveli and Daman and Diu', 'Delhi', 'Goa',
       'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir',
       'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Lakshadweep',
       'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram',
       'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan',
       'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Unknown',
       'Uttar Pradesh', 'Uttarakhand', 'West Bengal', nan], dtype=object)

In [17]:
# Print every Kerala row in full (to_string avoids the truncated display).
kerala_rows = df.loc[df['province_state'] == 'Kerala']
print(kerala_rows.to_string())

       province_state  obsv_date  confirmed  New_Cases
serial                                                
39469          Kerala 2020-06-10       2096          0
40198          Kerala 2020-06-11       2244        148
40931          Kerala 2020-06-12       2244          0
41664          Kerala 2020-06-13       2322         78
42397          Kerala 2020-06-14       2407         85
43130          Kerala 2020-06-15       2543        136
43863          Kerala 2020-06-16       2622         79
44596          Kerala 2020-06-17       2697         75
45329          Kerala 2020-06-18       2794         97
46062          Kerala 2020-06-19       2912        118
46795          Kerala 2020-06-20       3039        127
47528          Kerala 2020-06-21       3172        133
48261          Kerala 2020-06-22       3310        138
48994          Kerala 2020-06-23       3451        141
49727          Kerala 2020-06-24       3603        152
50460          Kerala 2020-06-25       3726        123
51193     

In [18]:
# Write the current frame to disk; the index is written as the first column
# because to_csv includes the index by default.
df.to_csv(path_or_buf='india_data.csv')