florida_covid_july_16_2020_male_deaths.txt

In [33]:
### Load the Colab Drive helper and mount Google Drive so that files stored
### under "My Drive" can be read through the /content/drive path.
from google.colab import drive

### This will prompt for authorization. When the drive is already mounted
### (as in the recorded output) Colab only reports that and does nothing else.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
import json 
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import matplotlib.pyplot as plt

In [35]:
### In the Colab side pane, open the Folder icon, scroll to the correct file,
### right-click it and select "Copy path" to obtain the path used below.
male_deaths_path = '/content/drive/My Drive/florida_covid_july_16_2020_male_deaths.txt'

### Parse the whole file as JSON into a Python object.
with open(male_deaths_path, 'r') as f:
    covid_data_list = json.load(f)

In [36]:
### Flatten the JSON records into a table. The nested DATE field is expanded
### into four columns: DATE.MONTH, DATE.DAY, DATE.YEAR and DATE.WEEKDAY.
data = pd.json_normalize(covid_data_list)

### Dataframe for male deaths.
### `data` is already a DataFrame; wrapping it in pd.DataFrame(...) does not copy
### by default, so later edits to df_mald could leak into `data`. Take an explicit
### copy so the normalized frame stays untouched.
df_mald = data.copy()

### column headers
df_mald.columns

Index(['CASE', 'COUNTY', 'AGE', 'GENDER', 'DATE.MONTH', 'DATE.DAY',
       'DATE.YEAR', 'DATE.WEEKDAY'],
      dtype='object')

In [37]:
### (rows, columns) of the male-deaths dataframe
df_mald.shape

(2542, 8)

In [38]:
### Preview the first three records as loaded
df_mald.head(3)

Unnamed: 0,CASE,COUNTY,AGE,GENDER,DATE.MONTH,DATE.DAY,DATE.YEAR,DATE.WEEKDAY
0,3,Alachua,87,Male,4,9,2020,Thursday
1,4,Alachua,76,Male,4,9,2020,Thursday
2,5,Alachua,77,Male,4,18,2020,Saturday


In [39]:
### Row counts for the individual months.
### The month labels appear zero-padded (e.g. 04) in the output, which suggests
### they are still strings here; they are cast to int in a later cell.
df_mald["DATE.MONTH"].value_counts()

04    902
06    592
05    532
07    259
03    257
Name: DATE.MONTH, dtype: int64

In [40]:
### Row counts per weekday name (taken before the names are encoded as integers)
df_mald["DATE.WEEKDAY"].value_counts()

Wednesday    422
Thursday     406
Friday       387
Saturday     356
Monday       352
Sunday       313
Tuesday      306
Name: DATE.WEEKDAY, dtype: int64

In [41]:
### Weekday encoding: Sunday=1 through Saturday=7.
### `cleanup_days` keeps its original nested {column: {label: code}} shape.
cleanup_days = {"DATE.WEEKDAY": {"Sunday": 1, "Monday": 2, "Tuesday": 3, "Wednesday": 4,
                                 "Thursday": 5, "Friday": 6, "Saturday": 7}}
weekday_codes = cleanup_days["DATE.WEEKDAY"]

### Fail loudly on any value that is neither a known weekday name nor an
### already-encoded code. Accepting existing codes keeps the cell safe to re-run;
### a plain replace() would let a misspelled or missing label pass through silently.
unknown_days = (
    set(df_mald["DATE.WEEKDAY"].unique())
    - set(weekday_codes)
    - set(weekday_codes.values())
)
if unknown_days:
    raise ValueError(
        f"Unexpected DATE.WEEKDAY values: {sorted(map(repr, unknown_days))}"
    )

### Map explicitly and cast to int. No other cell casts DATE.WEEKDAY, so do not
### rely on replace() inferring an integer dtype from an object column.
df_mald["DATE.WEEKDAY"] = (
    df_mald["DATE.WEEKDAY"]
    .map(lambda day: weekday_codes.get(day, day))
    .astype(int)
)

df_mald.head(3)

Unnamed: 0,CASE,COUNTY,AGE,GENDER,DATE.MONTH,DATE.DAY,DATE.YEAR,DATE.WEEKDAY
0,3,Alachua,87,Male,4,9,2020,5
1,4,Alachua,76,Male,4,9,2020,5
2,5,Alachua,77,Male,4,18,2020,7


In [42]:
### DATE.YEAR is constant (every record is from 2020), so remove the column
df_mald = df_mald.drop(columns=['DATE.YEAR'])

In [43]:
### cast the day and month columns to integers in one pass
df_mald = df_mald.astype({'DATE.DAY': int, 'DATE.MONTH': int})

In [44]:
### Preview after DATE.YEAR was dropped and DATE.DAY / DATE.MONTH were cast to int
df_mald.head(3)

Unnamed: 0,CASE,COUNTY,AGE,GENDER,DATE.MONTH,DATE.DAY,DATE.WEEKDAY
0,3,Alachua,87,Male,4,9,5
1,4,Alachua,76,Male,4,9,5
2,5,Alachua,77,Male,4,18,7


In [45]:
### cast AGE to an integer column
df_mald = df_mald.astype({'AGE': int})

In [46]:
### data types check: per the recorded output CASE, COUNTY and GENDER are still
### object at this stage, while the age and date columns are int64
df_mald.dtypes

CASE            object
COUNTY          object
AGE              int64
GENDER          object
DATE.MONTH       int64
DATE.DAY         int64
DATE.WEEKDAY     int64
dtype: object

In [47]:
### Row counts per county name, most frequent first (taken before counties are encoded)
df_mald["COUNTY"].value_counts()

Dade            696
Palm Beach      360
Broward         266
Pinellas        142
Hillsborough    107
Lee             107
Manatee          76
Polk             68
Orange           62
Sarasota         61
Collier          60
Duval            50
Volusia          38
Charlotte        33
St. Lucie        31
Martin           25
Escambia         25
Brevard          24
Pasco            21
Clay             21
Osceola          21
Hendry           17
Lake             16
Marion           14
Seminole         14
Suwannee         13
Sumter           12
Indian River     12
Desoto           11
St. Johns        10
Citrus           10
Santa Rosa       10
Highlands         9
Washington        8
Hernando          8
Walton            7
Bay               7
Gadsden           6
Alachua           6
Putnam            5
Leon              5
Monroe            5
Okaloosa          5
Dixie             4
Flagler           4
Calhoun           4
Jackson           4
Madison           3
Columbia          3
Hardee            3


In [48]:
### Number of distinct county labels that actually occur in this file
df_mald["COUNTY"].nunique()

60

In [49]:
### County encoding: alphabetical county labels numbered 1-67, plus "Unknown" = 68.
### `cleanup_county` keeps its original nested {column: {label: code}} shape.
cleanup_county = {"COUNTY": {
    "Alachua": 1, "Baker": 2, "Bay": 3, "Bradford": 4, "Brevard": 5, "Broward": 6,
    "Calhoun": 7, "Charlotte": 8, "Citrus": 9, "Clay": 10, "Collier": 11, "Columbia": 12,
    "Dade": 13, "Desoto": 14, "Dixie": 15, "Duval": 16, "Escambia": 17, "Flagler": 18,
    "Franklin": 19, "Gadsden": 20, "Gilchrist": 21, "Glades": 22, "Gulf": 23,
    "Hamilton": 24, "Hardee": 25, "Hendry": 26, "Hernando": 27, "Highlands": 28,
    "Hillsborough": 29, "Holmes": 30, "Indian River": 31, "Jackson": 32, "Jefferson": 33,
    "Lafayette": 34, "Lake": 35, "Lee": 36, "Leon": 37, "Levy": 38, "Liberty": 39,
    "Madison": 40, "Manatee": 41, "Marion": 42, "Martin": 43, "Monroe": 44,
    "Nassau": 45, "Okaloosa": 46, "Okeechobee": 47, "Orange": 48, "Osceola": 49,
    "Palm Beach": 50, "Pasco": 51, "Pinellas": 52, "Polk": 53, "Putnam": 54,
    "St. Johns": 55, "St. Lucie": 56, "Santa Rosa": 57, "Sarasota": 58, "Seminole": 59,
    "Sumter": 60, "Suwannee": 61, "Taylor": 62, "Union": 63, "Volusia": 64,
    "Wakulla": 65, "Walton": 66, "Washington": 67, "Unknown": 68,
}}
county_codes = cleanup_county["COUNTY"]

### Fail loudly on any label that is neither in the mapping nor an already-encoded
### code. Accepting existing codes keeps the cell safe to re-run; a plain replace()
### would leave an unrecognised spelling (e.g. a renamed county) in place silently.
unknown_counties = (
    set(df_mald["COUNTY"].unique())
    - set(county_codes)
    - set(county_codes.values())
)
if unknown_counties:
    raise ValueError(
        f"Unexpected COUNTY values: {sorted(map(repr, unknown_counties))}"
    )

### Map explicitly and cast to int so the resulting dtype does not depend on
### replace() inferring it from an object column.
df_mald["COUNTY"] = (
    df_mald["COUNTY"]
    .map(lambda county: county_codes.get(county, county))
    .astype(int)
)

df_mald.head(3)

Unnamed: 0,CASE,COUNTY,AGE,GENDER,DATE.MONTH,DATE.DAY,DATE.WEEKDAY
0,3,1,87,Male,4,9,5
1,4,1,76,Male,4,9,5
2,5,1,77,Male,4,18,7


In [50]:
### make sure the encoded county codes are stored as integers
df_mald = df_mald.astype({'COUNTY': int})

In [51]:
### flag every row as a death record: Deaths is set to 1 and Cases to 0
df_mald = df_mald.assign(Deaths=1, Cases=0)

In [52]:
### add Female indicator column at position 3, set to 0 for every row
### (together with Male it replaces GENDER, which is dropped below)
### NOTE: insert() raises if the column already exists, so this cell cannot be re-run as-is
df_mald.insert(3,'Female', 0) 

In [53]:
### add Male indicator column at position 4, set to 1 for every row,
### to replace GENDER which will be dropped
### NOTE: insert() raises if the column already exists, so this cell cannot be re-run as-is
df_mald.insert(4,'Male', 1) 

In [54]:
### GENDER is now represented by the Female / Male indicator columns
df_mald = df_mald.drop(columns=['GENDER'])

In [55]:
### remove the CASE column from the encoded dataset
df_mald = df_mald.drop(columns=['CASE'])

In [56]:
### Final column layout after adding the indicator columns and dropping GENDER / CASE
df_mald.columns

Index(['COUNTY', 'AGE', 'Female', 'Male', 'DATE.MONTH', 'DATE.DAY',
       'DATE.WEEKDAY', 'Deaths', 'Cases'],
      dtype='object')

In [57]:
### Preview the first three rows of the fully encoded dataframe
df_mald.head(3)

Unnamed: 0,COUNTY,AGE,Female,Male,DATE.MONTH,DATE.DAY,DATE.WEEKDAY,Deaths,Cases
0,1,87,0,1,4,9,5,1,0
1,1,76,0,1,4,9,5,1,0
2,1,77,0,1,4,18,7,1,0


In [58]:
### Final data types check: every remaining column is expected to be int64
df_mald.dtypes

COUNTY          int64
AGE             int64
Female          int64
Male            int64
DATE.MONTH      int64
DATE.DAY        int64
DATE.WEEKDAY    int64
Deaths          int64
Cases           int64
dtype: object

In [59]:
# saving the encoded dataframe as CSV; the path is relative, so the file is
# written to the current working directory
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column -- confirm whether downstream readers expect that column
df_mald.to_csv('encoded_male_deaths.csv') 