<a href="https://colab.research.google.com/github/aarsanjani/meansquares/blob/master/weekly_CovidCases_TX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic: Investigate Covid-19 Texas (TX) data

This colab includes TX covid case data from JHU and moving avg trend analysis

In [1]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=b51af99a0e18aa6baab07474c97d6deaa16259919ed8e4e4ccea4b6932d500c4
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


## Import Libraries

In [2]:
import pandas as pd
from tqdm import tqdm

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
import wget
from pandas import Series, datetime
from pandas.plotting import scatter_matrix, autocorrelation_plot
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
import random
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture

  import pandas.util.testing as tm


## Mount Google drive


In [7]:
os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"
import warnings; warnings.simplefilter('ignore')

In [4]:
from google.colab import drive
# drive.mount('/content/drive')

In [8]:
location = "drive/Shared drives/the-mean-sqaures/the-mean-squares/Colab_Dataset/Dataset/"

In [9]:
!ls /content/drive/My\ Drive/MasterProject-Personal/data

ls: cannot access '/content/drive/My Drive/MasterProject-Personal/data': No such file or directory


# Data Load

## 1 Load County Population


In [10]:
county_population_US = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv',low_memory=False)
print(county_population_US.shape)

(3195, 4)


In [11]:
wget.download('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv')
county_population_US = pd.read_csv('covid_county_population_usafacts.csv',low_memory=False)
print(county_population_US.shape)




(3195, 4)


In [12]:
county_population_US.head(2)

Unnamed: 0,countyFIPS,County Name,State,population
0,0,Statewide Unallocated,AL,0
1,1001,Autauga County,AL,55869


## 2 Load Covid-19 case details (until July 12)

In [14]:
!ls '/content/drive/My Drive/MasterProject-Personal/data/'

ls: cannot access '/content/drive/My Drive/MasterProject-Personal/data/': No such file or directory


### Note about data:
John Hopkins university updates data every day hence we are pulling from repository directly

**US Confirmed url** :https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv

**US deaths url**: https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv

In [15]:
urls = ['https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv']

[wget.download(url) for url in urls]        

['time_series_covid19_confirmed_US.csv', 'time_series_covid19_deaths_US.csv']

In [16]:
confirmed_US = pd.read_csv('time_series_covid19_confirmed_US.csv',low_memory=False)
death_US = pd.read_csv('time_series_covid19_deaths_US.csv',low_memory=False)
print(confirmed_US.shape)
print(death_US.shape)
print(confirmed_US.head(2))
death_US.head(2)

(3340, 242)
(3340, 243)
        UID iso2 iso3  code3    FIPS  ... 9/4/20 9/5/20 9/6/20  9/7/20  9/8/20
0  84001001   US  USA    840  1001.0  ...   1355   1371   1377    1383    1385
1  84001003   US  USA    840  1003.0  ...   4513   4542   4569    4586    4609

[2 rows x 242 columns]


Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,...,7/31/20,8/1/20,8/2/20,8/3/20,8/4/20,8/5/20,8/6/20,8/7/20,8/8/20,8/9/20,8/10/20,8/11/20,8/12/20,8/13/20,8/14/20,8/15/20,8/16/20,8/17/20,8/18/20,8/19/20,8/20/20,8/21/20,8/22/20,8/23/20,8/24/20,8/25/20,8/26/20,8/27/20,8/28/20,8/29/20,8/30/20,8/31/20,9/1/20,9/2/20,9/3/20,9/4/20,9/5/20,9/6/20,9/7/20,9/8/20
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",55869,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,20,20,20,20,20,21,21,21,21,21,21,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",223234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,21,21,22,23,23,23,23,23,24,25,25,29,29,29,29,29,29,29,30,30,31,32,32,32,32,32,33,34,35,36,36,38,38,38,40,42,42,42,42,42


In [17]:
#print(len(mask_data['state_name'].unique()))
print(len(confirmed_US['Province_State'].unique()))
confirmed_US['Province_State'].unique()

58


array(['Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
       'California', 'Colorado', 'Connecticut', 'Delaware',
       'Diamond Princess', 'District of Columbia', 'Florida', 'Georgia',
       'Grand Princess', 'Guam', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Northern Mariana Islands', 'Ohio', 'Oklahoma',
       'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Vermont', 'Virgin Islands', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

## Data cleaning

In [18]:
confirmed_US.columns[:11]

# Note: the first 11 columns contains UID, ios2,ios3, code ,FIPS, 'Admin2', 'Province_State',
      # 'Country_Region', 'Lat', 'Long_',Combined_Key', 'Population',


Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key'],
      dtype='object')

In [19]:
# date begins from 11th column
confirmed_dates = confirmed_US.columns[11:]
confirmed_dates

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '8/30/20', '8/31/20', '9/1/20', '9/2/20', '9/3/20', '9/4/20', '9/5/20',
       '9/6/20', '9/7/20', '9/8/20'],
      dtype='object', length=231)

In [20]:
death_US.columns[:12]

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'Population'],
      dtype='object')

In [21]:
death_US.columns[10:]

Index(['Combined_Key', 'Population', '1/22/20', '1/23/20', '1/24/20',
       '1/25/20', '1/26/20', '1/27/20', '1/28/20', '1/29/20',
       ...
       '8/30/20', '8/31/20', '9/1/20', '9/2/20', '9/3/20', '9/4/20', '9/5/20',
       '9/6/20', '9/7/20', '9/8/20'],
      dtype='object', length=233)

In [22]:
death_dates = death_US.columns[12:]
death_dates

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '8/30/20', '8/31/20', '9/1/20', '9/2/20', '9/3/20', '9/4/20', '9/5/20',
       '9/6/20', '9/7/20', '9/8/20'],
      dtype='object', length=231)

#### Note: both the date values has the same beginning date 1/22/2020. Hence we can use either of the values

In [23]:
confirmed_df_long = confirmed_US.melt(
    id_vars=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key'],
       value_vars=confirmed_dates,
       var_name = 'Date',
       value_name = 'Confirmed'
)

death_df_long = death_US.melt(
    id_vars=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'Population'],
       value_vars=death_dates,
       var_name = 'Date',
       value_name = 'Deaths'
)

In [24]:
confirmed_df_long.tail(10)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed
771530,84056029,US,USA,840,56029.0,Park,Wyoming,US,44.521575,-109.585282,"Park, Wyoming, US",9/8/20,173
771531,84056031,US,USA,840,56031.0,Platte,Wyoming,US,42.132991,-104.966331,"Platte, Wyoming, US",9/8/20,7
771532,84056033,US,USA,840,56033.0,Sheridan,Wyoming,US,44.790489,-106.886239,"Sheridan, Wyoming, US",9/8/20,180
771533,84056035,US,USA,840,56035.0,Sublette,Wyoming,US,42.765583,-109.913092,"Sublette, Wyoming, US",9/8/20,50
771534,84056037,US,USA,840,56037.0,Sweetwater,Wyoming,US,41.659439,-108.882788,"Sweetwater, Wyoming, US",9/8/20,314
771535,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,"Teton, Wyoming, US",9/8/20,450
771536,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,"Uinta, Wyoming, US",9/8/20,309
771537,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,"Unassigned, Wyoming, US",9/8/20,0
771538,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,"Washakie, Wyoming, US",9/8/20,111
771539,84056045,US,USA,840,56045.0,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",9/8/20,20


In [26]:
confirmed_df_long[confirmed_df_long['FIPS'] == 48113].tail(30)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed
674094,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/10/20,55255
677434,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/11/20,55553
680774,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/12/20,55787
684114,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/13/20,56428
687454,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/14/20,57313
690794,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/15/20,58067
694134,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/16/20,63428
697474,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/17/20,65278
700814,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/18/20,66065
704154,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,"Dallas, Texas, US",8/19/20,66464


In [27]:
death_df_long.tail(10)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population,Date,Deaths
771530,84056029,US,USA,840,56029.0,Park,Wyoming,US,44.521575,-109.585282,"Park, Wyoming, US",29194,9/8/20,1
771531,84056031,US,USA,840,56031.0,Platte,Wyoming,US,42.132991,-104.966331,"Platte, Wyoming, US",8393,9/8/20,1
771532,84056033,US,USA,840,56033.0,Sheridan,Wyoming,US,44.790489,-106.886239,"Sheridan, Wyoming, US",30485,9/8/20,1
771533,84056035,US,USA,840,56035.0,Sublette,Wyoming,US,42.765583,-109.913092,"Sublette, Wyoming, US",9831,9/8/20,1
771534,84056037,US,USA,840,56037.0,Sweetwater,Wyoming,US,41.659439,-108.882788,"Sweetwater, Wyoming, US",42343,9/8/20,2
771535,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,"Teton, Wyoming, US",23464,9/8/20,1
771536,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,"Uinta, Wyoming, US",20226,9/8/20,2
771537,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,"Unassigned, Wyoming, US",0,9/8/20,0
771538,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,"Washakie, Wyoming, US",7805,9/8/20,6
771539,84056045,US,USA,840,56045.0,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",6927,9/8/20,0


## Check Texas data

In [28]:
state= ['Texas']#['California','New York']
confirmed_CA_df = confirmed_df_long[confirmed_df_long['Province_State'].isin(state)]
confirmed_CA_df.tail(5)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed
771149,84048499,US,USA,840,48499.0,Wood,Texas,US,32.787224,-95.382364,"Wood, Texas, US",9/8/20,420
771150,84048501,US,USA,840,48501.0,Yoakum,Texas,US,33.173202,-102.827643,"Yoakum, Texas, US",9/8/20,173
771151,84048503,US,USA,840,48503.0,Young,Texas,US,33.176597,-98.687909,"Young, Texas, US",9/8/20,236
771152,84048505,US,USA,840,48505.0,Zapata,Texas,US,27.001564,-99.169872,"Zapata, Texas, US",9/8/20,295
771153,84048507,US,USA,840,48507.0,Zavala,Texas,US,28.866172,-99.760508,"Zavala, Texas, US",9/8/20,264


In [29]:
confirmed_CA_df.Province_State.unique()

array(['Texas'], dtype=object)

## Merging Confirmed and Death data

In [30]:
full_table = confirmed_df_long.merge(
    right=death_df_long,
    how='left',
    on=[ 'UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key','Date']
)

full_table.head(10)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Population,Deaths
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",1/22/20,0,55869,0
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",1/22/20,0,223234,0
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",1/22/20,0,24686,0
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",1/22/20,0,22394,0
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,"Blount, Alabama, US",1/22/20,0,57826,0
5,84001011,US,USA,840,1011.0,Bullock,Alabama,US,32.100305,-85.712655,"Bullock, Alabama, US",1/22/20,0,10101,0
6,84001013,US,USA,840,1013.0,Butler,Alabama,US,31.753001,-86.680575,"Butler, Alabama, US",1/22/20,0,19448,0
7,84001015,US,USA,840,1015.0,Calhoun,Alabama,US,33.774837,-85.826304,"Calhoun, Alabama, US",1/22/20,0,113605,0
8,84001017,US,USA,840,1017.0,Chambers,Alabama,US,32.913601,-85.390727,"Chambers, Alabama, US",1/22/20,0,33254,0
9,84001019,US,USA,840,1019.0,Cherokee,Alabama,US,34.17806,-85.60639,"Cherokee, Alabama, US",1/22/20,0,26196,0


In [None]:
#full_table['Date'] = pd.to_datetime(full_table['Date'])


In [31]:
ship_data = full_table['Province_State'].str.contains('Grand Princess') | full_table['Province_State'].str.contains('Diamond Princess') | full_table['Province_State'].str.contains('Northern Mariana Islands') | full_table['Province_State'].str.contains('American Samoa') |full_table['Province_State'].str.contains('Guam') | full_table['Province_State'].str.contains('Virgin Islands')

full_ship = full_table[ship_data]


In [32]:
# Removing ship data from State data

full_table = full_table[~(ship_data)]

## Group data

In [33]:
full_grouped = full_table.groupby(['Date', 'Province_State','FIPS'])['Confirmed', 'Deaths'].sum().reset_index()

full_grouped.tail(5)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths
767839,9/8/20,Wyoming,56041.0,309,2
767840,9/8/20,Wyoming,56043.0,111,6
767841,9/8/20,Wyoming,56045.0,20,0
767842,9/8/20,Wyoming,80056.0,0,0
767843,9/8/20,Wyoming,90056.0,0,0


In [46]:
NY_full_grouped = full_grouped[full_grouped['Province_State'] == 'Texas']
NY_full_grouped[NY_full_grouped['FIPS'] == 48201.0]

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths
2789,2020-01-22,Texas,48201.0,0,0
6113,2020-01-23,Texas,48201.0,0,0
9437,2020-01-24,Texas,48201.0,0,0
12761,2020-01-25,Texas,48201.0,0,0
16085,2020-01-26,Texas,48201.0,0,0
...,...,...,...,...,...
754013,2020-09-04,Texas,48201.0,109834,2300
757337,2020-09-05,Texas,48201.0,110762,2327
760661,2020-09-06,Texas,48201.0,111525,2335
763985,2020-09-07,Texas,48201.0,111525,2335


In [47]:
full_grouped.shape

(767844, 5)

####  Adding new cases, new deaths by subtracting from previous day record

In [48]:
full_grouped_ = full_grouped.copy()

In [49]:
full_grouped = full_grouped_.copy()

In [50]:
def fixDate(x):
  arr = x.split('/')
  m = arr[0]
  d = arr[1]
  y = arr[2]

  if int(m) < 10:
    m = '0'+str(m)
  if int(d) < 10:
    d = '0'+str(d)
  return '20'+str(y)+'-'+m+'-'+d

In [51]:
full_grouped.head(3)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths
0,2020-01-22,Alabama,1001.0,0,0
1,2020-01-22,Alabama,1003.0,0,0
2,2020-01-22,Alabama,1005.0,0,0


Following cell is giving error

In [53]:
#Following cell is giving error
#full_grouped['Date'] = full_grouped['Date'].apply(lambda x: fixDate(x))

full_grouped.head(2)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths
0,2020-01-22,Alabama,1001.0,0,0
1,2020-01-22,Alabama,1003.0,0,0


In [54]:
def computeNewCases(FIPS):


  countyData = full_grouped[full_grouped['FIPS'] == FIPS]
  countyData = countyData.sort_values(by=['Date'])

  if countyData.Confirmed.max() > 0:
    temp = countyData.groupby(['Date'])['Confirmed', 'Deaths']
    temp = temp.sum().diff().reset_index()
    #print(temp)
    mask = temp['Date'] != temp['Date'].shift(1)

    #temp.loc[mask, 'Confirmed'] = np.nan
    #temp.loc[mask, 'Deaths'] = np.nan
    temp.columns = ['Date', 'New cases', 'New deaths']
    print(temp)
    countyData = pd.merge(countyData, temp, on=[ 'Date'])
    print(countyData)
    #print('********')
    # filling na with 0
    countyData = countyData.fillna(0)
    # fixing data types
    cols = ['New cases', 'New deaths']
    countyData[cols] = countyData[cols].astype('int') 
    return countyData


In [56]:
computeNewCases(48201)

           Date  New cases  New deaths
0    2020-01-22        NaN         NaN
1    2020-01-23        0.0         0.0
2    2020-01-24        0.0         0.0
3    2020-01-25        0.0         0.0
4    2020-01-26        0.0         0.0
..          ...        ...         ...
226  2020-09-04     1015.0        19.0
227  2020-09-05      928.0        27.0
228  2020-09-06      763.0         8.0
229  2020-09-07        0.0         0.0
230  2020-09-08      514.0         7.0

[231 rows x 3 columns]
           Date Province_State     FIPS  ...  Deaths  New cases  New deaths
0    2020-01-22          Texas  48201.0  ...       0        NaN         NaN
1    2020-01-23          Texas  48201.0  ...       0        0.0         0.0
2    2020-01-24          Texas  48201.0  ...       0        0.0         0.0
3    2020-01-25          Texas  48201.0  ...       0        0.0         0.0
4    2020-01-26          Texas  48201.0  ...       0        0.0         0.0
..          ...            ...      ...  ...     ...

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
0,2020-01-22,Texas,48201.0,0,0,0,0
1,2020-01-23,Texas,48201.0,0,0,0,0
2,2020-01-24,Texas,48201.0,0,0,0,0
3,2020-01-25,Texas,48201.0,0,0,0,0
4,2020-01-26,Texas,48201.0,0,0,0,0
...,...,...,...,...,...,...,...
226,2020-09-04,Texas,48201.0,109834,2300,1015,19
227,2020-09-05,Texas,48201.0,110762,2327,928,27
228,2020-09-06,Texas,48201.0,111525,2335,763,8
229,2020-09-07,Texas,48201.0,111525,2335,0,0


Below code executes for all FIPS, estimated runtime **10 minutes**

In [57]:
df = pd.DataFrame(columns=['Date','Province_State','FIPS','Confirmed','Deaths','New cases','New deaths'])

for fips in tqdm(full_grouped.FIPS.unique()):
  

  countyData = full_grouped[full_grouped['FIPS'] == fips]
  countyData = countyData.sort_values(by=['Date'])
  #print(fips,' , ',countyData.Confirmed.min())
  
  if countyData.Confirmed.max() > 0:
    temp = countyData.groupby(['Date'])['Confirmed', 'Deaths']
    temp = temp.sum().diff().reset_index()
    #print(temp)
    mask = temp['Date'] != temp['Date'].shift(1)

    # temp.loc[mask, 'Confirmed'] = np.nan
    # temp.loc[mask, 'Deaths'] = np.nan
    temp.columns = ['Date', 'New cases', 'New deaths']
    countyData = pd.merge(countyData, temp, on=[ 'Date'])
    #print(countyData)
    #print('********')
    # filling na with 0
    countyData = countyData.fillna(0)
    # fixing data types
    cols = ['New cases', 'New deaths']
    countyData[cols] = countyData[cols].astype('int')
    df = df.append(countyData)
    #break

100%|██████████| 3324/3324 [06:02<00:00,  9.16it/s]


In [58]:
print(df.shape)
df.tail(30)

(750750, 7)


Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
201,2020-08-10,Wyoming,90056.0,0,27,0,0
202,2020-08-11,Wyoming,90056.0,0,28,0,1
203,2020-08-12,Wyoming,90056.0,0,28,0,0
204,2020-08-13,Wyoming,90056.0,0,29,0,1
205,2020-08-14,Wyoming,90056.0,0,29,0,0
206,2020-08-15,Wyoming,90056.0,0,29,0,0
207,2020-08-16,Wyoming,90056.0,0,29,0,0
208,2020-08-17,Wyoming,90056.0,0,29,0,0
209,2020-08-18,Wyoming,90056.0,0,29,0,0
210,2020-08-19,Wyoming,90056.0,0,33,0,4


In [59]:
df.shape

(750750, 7)

In [61]:
# Harris county  fips code - 48201
df[df['FIPS'] == 48201.0].tail(50)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
181,2020-07-21,Texas,48201.0,58457,1264,1386,29
182,2020-07-22,Texas,48201.0,59901,1295,1444,31
183,2020-07-23,Texas,48201.0,61411,1323,1510,28
184,2020-07-24,Texas,48201.0,62612,1334,1201,11
185,2020-07-25,Texas,48201.0,64108,1352,1496,18
186,2020-07-26,Texas,48201.0,65344,1362,1236,10
187,2020-07-27,Texas,48201.0,66190,1376,846,14
188,2020-07-28,Texas,48201.0,67655,1394,1465,18
189,2020-07-29,Texas,48201.0,69123,1400,1468,6
190,2020-07-30,Texas,48201.0,70847,1404,1724,4


In [62]:
# merging new values
full_grouped = df.copy()

In [63]:
full_grouped.tail(5)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
226,2020-09-04,Wyoming,90056.0,0,0,0,0
227,2020-09-05,Wyoming,90056.0,0,0,0,0
228,2020-09-06,Wyoming,90056.0,0,0,0,0
229,2020-09-07,Wyoming,90056.0,0,0,0,0
230,2020-09-08,Wyoming,90056.0,0,0,0,0


In [64]:
state= ['Texas']#['California','New York']
ca_df = full_grouped[full_grouped['Province_State'].isin(state)]
ca_df.tail(10)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
221,2020-08-30,Texas,90048.0,0,0,0,0
222,2020-08-31,Texas,90048.0,0,0,0,0
223,2020-09-01,Texas,90048.0,0,0,0,0
224,2020-09-02,Texas,90048.0,0,0,0,0
225,2020-09-03,Texas,90048.0,0,0,0,0
226,2020-09-04,Texas,90048.0,0,0,0,0
227,2020-09-05,Texas,90048.0,0,0,0,0
228,2020-09-06,Texas,90048.0,0,0,0,0
229,2020-09-07,Texas,90048.0,0,0,0,0
230,2020-09-08,Texas,90048.0,0,0,0,0


In [65]:
county_population_US.head(2)

Unnamed: 0,countyFIPS,County Name,State,population
0,0,Statewide Unallocated,AL,0
1,1001,Autauga County,AL,55869


In [66]:
county_population_US['FIPS'] = county_population_US['countyFIPS']

In [67]:
ca_df.shape

(58212, 7)

# Merge with FIPS 

In [68]:
merged = pd.merge(ca_df,county_population_US,how='inner' ,on=['FIPS'])
print(merged.shape)

(57981, 11)


In [69]:
merged.tail(2)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths,countyFIPS,County Name,State,population
57979,2020-09-07,Texas,48507.0,266,13,0,0,48507,Zavala County,TX,11840
57980,2020-09-08,Texas,48507.0,264,13,-2,0,48507,Zavala County,TX,11840


## Visualization

In [70]:
full_grouped.head(3)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
0,2020-01-22,Alabama,1001.0,0,0,0,0
1,2020-01-23,Alabama,1001.0,0,0,0,0
2,2020-01-24,Alabama,1001.0,0,0,0,0


In [71]:
merged.head(3)


Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths,countyFIPS,County Name,State,population
0,2020-01-22,Texas,48001.0,0,0,0,0,48001,Anderson County,TX,57735
1,2020-01-23,Texas,48001.0,0,0,0,0,48001,Anderson County,TX,57735
2,2020-01-24,Texas,48001.0,0,0,0,0,48001,Anderson County,TX,57735


In [72]:
import pandas as pd
import altair as alt
#full_grouped = merged
ca_df = full_grouped[full_grouped['Province_State'] == 'California']
ny_df = full_grouped[full_grouped['Province_State'] == 'New York']
tx_df = full_grouped[full_grouped['Province_State'] == 'Texas']

tx_df.shape

(58212, 7)

In [73]:
ca_total = (
    ca_df
    .pipe(lambda x: x.assign(gains_pctg=x["New cases"]))
    .groupby(['Date','Province_State'])
    .agg({"gains_pctg": "sum"})
    .reset_index()
    .rename(columns={"gains_pctg": "New cases"})
)
ny_total = (
    ny_df
    .pipe(lambda x: x.assign(gains_pctg=x["New cases"]))
    .groupby(['Date','Province_State'])
    .agg({"gains_pctg": "sum"})
    .reset_index()
    .rename(columns={"gains_pctg": "New cases"})
)

tx_total = (
    tx_df
    .pipe(lambda x: x.assign(gains_pctg=x["New cases"]))
    .groupby(['Date','Province_State'])
    .agg({"gains_pctg": "sum"})
    .reset_index()
    .rename(columns={"gains_pctg": "New cases"})
)



In [74]:
base_ca = alt.Chart(ca_total).mark_bar().encode(
    x='monthdate(Date):O',
).properties(
    width=500
)

base_ny = alt.Chart(ny_total).mark_bar().encode(
    x='monthdate(Date):O',
).properties(
    width=500
)

base_tx = alt.Chart(tx_total).mark_bar().encode(
    x='monthdate(Date):O',
).properties(
    width=500
)



In [78]:
red = alt.value("#f54242")
##Ca data
#base_ca.encode(y='Confirmed').properties(title='Total Confirmed') | base_ca.encode(y='Deaths',color = red).properties(title='Total deaths')
base_tx.encode(y='New cases').properties(title='Texas State- New cases')

In [77]:
red = alt.value("#f54242")
##Ca data
#base_ca.encode(y='Confirmed').properties(title='Total Confirmed') | base_ca.encode(y='Deaths',color = red).properties(title='Total deaths')
base_ca.encode(y='New cases').properties(title='CA State- New cases')

In [80]:
#base_ny.encode(y='Confirmed').properties(title='Total Confirmed') | base_ny.encode(y='Deaths',color = red).properties(title='Total deaths')

base_ny.encode(y='New cases').properties(title='NY state - New cases')


# Export data to csv

In [81]:
ca_df = full_grouped[full_grouped['Province_State'] == 'California']
ny_df = full_grouped[full_grouped['Province_State'] == 'New York']
tx_df = full_grouped[full_grouped['Province_State'] == 'Texas']

In [89]:
ny_df.head(2)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
0,2020-01-22,New York,36001.0,0,0,0,0
1,2020-01-23,New York,36001.0,0,0,0,0


In [88]:
tx_df.tail(2)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
229,2020-09-07,Texas,90048.0,0,0,0,0
230,2020-09-08,Texas,90048.0,0,0,0,0


In [91]:
from datetime import datetime
filename = 'TX-Covid_'+datetime.now().strftime("%b%d")+'.csv' #%Y%m%d
print(filename)
tx_df.to_csv(filename,index=False)



TX-Covid_Sep10


# Compute rolling Average for new cases 

In [84]:
tx_total.head()

Unnamed: 0,Date,Province_State,New cases
0,2020-01-22,Texas,0
1,2020-01-23,Texas,0
2,2020-01-24,Texas,0
3,2020-01-25,Texas,0
4,2020-01-26,Texas,0


In [85]:

tx_total['rolling_average'] = tx_total.iloc[:,2].rolling(window=7).mean()

base_tx_total = alt.Chart(tx_total).mark_bar().encode(
    x='monthdate(Date):O',
).properties(
    width=500
)

In [86]:
base_tx_total.encode(y='rolling_average').properties(title='Texas state - rolling_average')


## Reference

* https://towardsdatascience.com/covid-19-data-processing-58aaa3663f6