# Encoding the COVID Dataset

This notebook loads the datasets to be used in the final project. The preprocessing demonstrated at the end of this notebook shows how we would begin encoding the data to prepare it for machine learning.

In [5]:
# Add the dependencies.
import os

import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [6]:
# Files to load.
# Each path is assembled with os.path.join rather than a hard-coded "/" so the
# separator follows the host OS and the values agree with the other cell in this
# notebook that builds the same three paths.
covid_data_to_load = os.path.join("Resources", "Covid19vaccinesbycounty.csv")
mask_data_to_load = os.path.join("Resources", "US_State_and_Territorial_Public_Mask_Mandates_by_County_by_Day.csv")
vaccine_data_to_load = os.path.join("Resources", "Vaccine_Hesitancy_for_COVID_County_and_local_estimates.csv")

In [7]:
# Add the dependencies.
import pandas as pd
import os

In [8]:
# Input CSV locations: every file sits in the "Resources" folder, so the three
# paths are produced in one pass over the file names and unpacked in order.
covid_data_to_load, mask_data_to_load, vaccine_data_to_load = (
    os.path.join("Resources", csv_name)
    for csv_name in (
        "Covid19vaccinesbycounty.csv",
        "US_State_and_Territorial_Public_Mask_Mandates_by_County_by_Day.csv",
        "Vaccine_Hesitancy_for_COVID_County_and_local_estimates.csv",
    )
)

In [9]:
# Read the covid data file and store it in a Pandas DataFrame.
# NOTE(review): the load below is commented out, so covid_data_df is never
# created in the visible cells and covid_data_to_load goes unused — re-enable
# these lines or delete the cell.
# covid_data_df = pd.read_csv(covid_data_to_load)
# covid_data_df

In [10]:
# Rename column
# NOTE(review): disabled. The rename targets covid_data_df, which does not exist
# unless the covid CSV load is re-enabled first — re-enable both or delete the cell.
# covid_data_df.rename(columns = {'county':'County_Name'}, inplace = True)
# covid_data_df

In [11]:
# Load the mask-mandate CSV into a DataFrame, then show it as the cell output
# (pandas abbreviates the display of a frame this long).
mask_data_df = pd.read_csv(filepath_or_buffer=mask_data_to_load)
mask_data_df

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_Code,Date,Masks_Order_Code,Face_Masks_Required_in_Public,Citations
0,Texas,Nolan,48353,2020-03-11,2,,
1,Georgia,Baker,13007,2020-03-17,2,,
2,Indiana,Fulton,18049,2022-02-14,2,,"[Ind. HB 1405 (Apr. 29, 2021)]"
3,Iowa,Greene,19073,2020-07-28,2,,"[Iowa Proc. (Jul. 24, 2020)]"
4,Kansas,Montgomery,20125,2020-03-13,2,,
...,...,...,...,...,...,...,...
2562115,U.S. Virgin Islands,St. John Island,78020,2020-07-16,1,Public mask mandate,"[USVI. Exec. Order (Jul. 1, 2020), V.I. Supple..."
2562116,U.S. Virgin Islands,St. John Island,78020,2022-01-02,1,Public mask mandate,"[V.I. Thirtieth Supp. Exec. Order (Oct. 4, 202..."
2562117,U.S. Virgin Islands,St. John Island,78020,2022-03-04,1,Public mask mandate,"[V.I. Sixteenth Supp. Exec. Order (Sept. 8, 20..."
2562118,U.S. Virgin Islands,St. John Island,78020,2022-04-01,2,No public mask mandate,"[V.I. Press Release (Mar. 14, 2022) (lifts ind..."


In [12]:
# Load the vaccine-hesitancy CSV into a DataFrame, then show it as the cell output.
vaccine_data_df = pd.read_csv(filepath_or_buffer=vaccine_data_to_load)
vaccine_data_df

Unnamed: 0,FIPS Code,County Name,State,Estimated hesitant,Estimated hesitant or unsure,Estimated strongly hesitant,Social Vulnerability Index (SVI),SVI Category,CVAC level of concern for vaccination rollout,CVAC Level Of Concern,...,Percent Hispanic,Percent non-Hispanic American Indian/Alaska Native,Percent non-Hispanic Asian,Percent non-Hispanic Black,Percent non-Hispanic Native Hawaiian/Pacific Islander,Percent non-Hispanic White,Geographical Point,State Code,County Boundary,State Boundary
0,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.2400,0.1383,0.89,Very High Vulnerability,0.64,High Concern,...,0.0242,0.0022,0.0036,0.2697,0.0000,0.6887,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
1,1121,"Talladega County, Alabama",ALABAMA,0.1783,0.2350,0.1368,0.87,Very High Vulnerability,0.84,Very High Concern,...,0.0229,0.0043,0.0061,0.3237,0.0003,0.6263,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-86.303069 33.46316, -86.30306...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
2,1131,"Wilcox County, Alabama",ALABAMA,0.1735,0.2357,0.1337,0.93,Very High Vulnerability,0.94,Very High Concern,...,0.0053,0.0009,0.0003,0.6938,0.0000,0.2684,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-87.52534299999999 32.132773, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
3,1129,"Washington County, Alabama",ALABAMA,0.1735,0.2357,0.1337,0.73,High Vulnerability,0.82,Very High Concern,...,0.0146,0.0731,0.0025,0.2354,0.0000,0.6495,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-88.45317899999999 31.505388, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
4,1133,"Winston County, Alabama",ALABAMA,0.1805,0.2313,0.1379,0.70,High Vulnerability,0.80,High Concern,...,0.0315,0.0034,0.0016,0.0073,0.0005,0.9370,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-87.63656399999999 34.120908, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,55079,"Milwaukee County, Wisconsin",WISCONSIN,0.1505,0.2144,0.0880,0.81,Very High Vulnerability,0.30,Low Concern,...,0.1500,0.0047,0.0428,0.2606,0.0002,0.5124,POINT (-89.732969 44.639954),WI,"MULTIPOLYGON (((-87.994171 43.19255, -87.99331...","MULTIPOLYGON (((-92.887067 45.644148, -92.8825..."
3138,55121,"Trempealeau County, Wisconsin",WISCONSIN,0.1611,0.2126,0.0959,0.28,Low Vulnerability,0.31,Low Concern,...,0.0840,0.0034,0.0043,0.0051,0.0000,0.8953,POINT (-89.732969 44.639954),WI,"MULTIPOLYGON (((-91.529034 44.422101, -91.5292...","MULTIPOLYGON (((-92.887067 45.644148, -92.8825..."
3139,56001,"Albany County, Wyoming",WYOMING,0.1949,0.2673,0.1405,0.25,Low Vulnerability,0.63,High Concern,...,0.0953,0.0091,0.0327,0.0150,0.0003,0.8248,POINT (-107.55145 42.999627),WY,MULTIPOLYGON (((-106.071399 41.530719999999995...,"MULTIPOLYGON (((-111.044893 43.315719, -111.04..."
3140,55067,"Langlade County, Wisconsin",WISCONSIN,0.1518,0.2007,0.0906,0.35,Low Vulnerability,0.19,Very Low Concern,...,0.0197,0.0069,0.0022,0.0125,0.0002,0.9383,POINT (-89.732969 44.639954),WI,"MULTIPOLYGON (((-89.42472 45.293175, -89.42473...","MULTIPOLYGON (((-92.887067 45.644148, -92.8825..."


In [13]:
# Rename the FIPS column so its name matches the mask dataset's "FIPS_Code" key,
# which the merge further down joins on.
# rename() returns a new frame; rebinding the name instead of using inplace=True
# avoids mutating an object that an earlier cell already displayed, and the cell
# stays safe to re-run (an already-renamed column is simply left untouched).
vaccine_data_df = vaccine_data_df.rename(columns={"FIPS Code": "FIPS_Code"})

# Fail here, with a clear message, rather than later inside the merge.
assert "FIPS_Code" in vaccine_data_df.columns, "vaccine data has no FIPS code column to merge on"
vaccine_data_df

Unnamed: 0,FIPS_Code,County Name,State,Estimated hesitant,Estimated hesitant or unsure,Estimated strongly hesitant,Social Vulnerability Index (SVI),SVI Category,CVAC level of concern for vaccination rollout,CVAC Level Of Concern,...,Percent Hispanic,Percent non-Hispanic American Indian/Alaska Native,Percent non-Hispanic Asian,Percent non-Hispanic Black,Percent non-Hispanic Native Hawaiian/Pacific Islander,Percent non-Hispanic White,Geographical Point,State Code,County Boundary,State Boundary
0,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.2400,0.1383,0.89,Very High Vulnerability,0.64,High Concern,...,0.0242,0.0022,0.0036,0.2697,0.0000,0.6887,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
1,1121,"Talladega County, Alabama",ALABAMA,0.1783,0.2350,0.1368,0.87,Very High Vulnerability,0.84,Very High Concern,...,0.0229,0.0043,0.0061,0.3237,0.0003,0.6263,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-86.303069 33.46316, -86.30306...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
2,1131,"Wilcox County, Alabama",ALABAMA,0.1735,0.2357,0.1337,0.93,Very High Vulnerability,0.94,Very High Concern,...,0.0053,0.0009,0.0003,0.6938,0.0000,0.2684,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-87.52534299999999 32.132773, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
3,1129,"Washington County, Alabama",ALABAMA,0.1735,0.2357,0.1337,0.73,High Vulnerability,0.82,Very High Concern,...,0.0146,0.0731,0.0025,0.2354,0.0000,0.6495,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-88.45317899999999 31.505388, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
4,1133,"Winston County, Alabama",ALABAMA,0.1805,0.2313,0.1379,0.70,High Vulnerability,0.80,High Concern,...,0.0315,0.0034,0.0016,0.0073,0.0005,0.9370,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-87.63656399999999 34.120908, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,55079,"Milwaukee County, Wisconsin",WISCONSIN,0.1505,0.2144,0.0880,0.81,Very High Vulnerability,0.30,Low Concern,...,0.1500,0.0047,0.0428,0.2606,0.0002,0.5124,POINT (-89.732969 44.639954),WI,"MULTIPOLYGON (((-87.994171 43.19255, -87.99331...","MULTIPOLYGON (((-92.887067 45.644148, -92.8825..."
3138,55121,"Trempealeau County, Wisconsin",WISCONSIN,0.1611,0.2126,0.0959,0.28,Low Vulnerability,0.31,Low Concern,...,0.0840,0.0034,0.0043,0.0051,0.0000,0.8953,POINT (-89.732969 44.639954),WI,"MULTIPOLYGON (((-91.529034 44.422101, -91.5292...","MULTIPOLYGON (((-92.887067 45.644148, -92.8825..."
3139,56001,"Albany County, Wyoming",WYOMING,0.1949,0.2673,0.1405,0.25,Low Vulnerability,0.63,High Concern,...,0.0953,0.0091,0.0327,0.0150,0.0003,0.8248,POINT (-107.55145 42.999627),WY,MULTIPOLYGON (((-106.071399 41.530719999999995...,"MULTIPOLYGON (((-111.044893 43.315719, -111.04..."
3140,55067,"Langlade County, Wisconsin",WISCONSIN,0.1518,0.2007,0.0906,0.35,Low Vulnerability,0.19,Very Low Concern,...,0.0197,0.0069,0.0022,0.0125,0.0002,0.9383,POINT (-89.732969 44.639954),WI,"MULTIPOLYGON (((-89.42472 45.293175, -89.42473...","MULTIPOLYGON (((-92.887067 45.644148, -92.8825..."


In [18]:
# Attach the mask-mandate rows to each vaccine-hesitancy row that shares its FIPS
# code. This is an inner join, so FIPS codes missing from either frame are dropped.
covid_complete_df = vaccine_data_df.merge(mask_data_df, how="inner", on="FIPS_Code")
covid_complete_df.head()

Unnamed: 0,FIPS_Code,County Name,State,Estimated hesitant,Estimated hesitant or unsure,Estimated strongly hesitant,Social Vulnerability Index (SVI),SVI Category,CVAC level of concern for vaccination rollout,CVAC Level Of Concern,...,Geographical Point,State Code,County Boundary,State Boundary,State_Tribe_Territory,County_Name,Date,Masks_Order_Code,Face_Masks_Required_in_Public,Citations
0,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,Very High Vulnerability,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2020-03-15,2,,
1,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,Very High Vulnerability,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2020-05-03,2,,"[Ala. Health Order (Nov. 5, 2020), Ala. Health..."
2,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,Very High Vulnerability,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2020-11-03,1,Public mask mandate,"[Ala. Health Order (Nov. 5, 2020), Ala. Health..."
3,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,Very High Vulnerability,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2021-10-27,2,,"[Ala. Health Order (Safer Apart) (May 3, 2021)]"
4,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,Very High Vulnerability,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2022-03-06,2,,"[Ala. Health Order (Safer Apart) (May 3, 2021)]"


In [19]:
# Encoding SVI Category
# Build a NEW frame with the encoded column instead of aliasing covid_complete_df:
# plain assignment would make both names point at the same object, so the encoding
# would silently overwrite the merged frame's original text labels (and a re-run
# would refit the encoder on integers, losing the label -> code mapping).
# NOTE(review): LabelEncoder numbers classes in sorted (alphabetical) order, so the
# codes do not follow the low -> high vulnerability ranking; le.classes_ holds the
# mapping. Confirm an unordered code is acceptable for the downstream model.
le = LabelEncoder()
SVI_category_encoded = covid_complete_df.assign(
    **{"SVI Category": le.fit_transform(covid_complete_df["SVI Category"])}
)
SVI_category_encoded.head()

Unnamed: 0,FIPS_Code,County Name,State,Estimated hesitant,Estimated hesitant or unsure,Estimated strongly hesitant,Social Vulnerability Index (SVI),SVI Category,CVAC level of concern for vaccination rollout,CVAC Level Of Concern,...,Geographical Point,State Code,County Boundary,State Boundary,State_Tribe_Territory,County_Name,Date,Masks_Order_Code,Face_Masks_Required_in_Public,Citations
0,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2020-03-15,2,,
1,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2020-05-03,2,,"[Ala. Health Order (Nov. 5, 2020), Ala. Health..."
2,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2020-11-03,1,Public mask mandate,"[Ala. Health Order (Nov. 5, 2020), Ala. Health..."
3,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2021-10-27,2,,"[Ala. Health Order (Safer Apart) (May 3, 2021)]"
4,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,High Concern,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2022-03-06,2,,"[Ala. Health Order (Safer Apart) (May 3, 2021)]"


In [22]:
# Encoding CVAC Level of Concern
# Derive a NEW frame from SVI_category_encoded rather than aliasing it, so this
# cell does not overwrite the text labels in its input and can be re-run without
# refitting the encoder on already-encoded integers.
# NOTE(review): LabelEncoder numbers classes in sorted (alphabetical) order, so the
# codes do not follow the low -> high concern ranking; le.classes_ holds the
# mapping. Confirm an unordered code is acceptable for the downstream model.
le = LabelEncoder()
CVAC_encoded = SVI_category_encoded.assign(
    **{"CVAC Level Of Concern": le.fit_transform(SVI_category_encoded["CVAC Level Of Concern"])}
)
CVAC_encoded.head()

Unnamed: 0,FIPS_Code,County Name,State,Estimated hesitant,Estimated hesitant or unsure,Estimated strongly hesitant,Social Vulnerability Index (SVI),SVI Category,CVAC level of concern for vaccination rollout,CVAC Level Of Concern,...,Geographical Point,State Code,County Boundary,State Boundary,State_Tribe_Territory,County_Name,Date,Masks_Order_Code,Face_Masks_Required_in_Public,Citations
0,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,0,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2020-03-15,2,,
1,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,0,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2020-05-03,2,,"[Ala. Health Order (Nov. 5, 2020), Ala. Health..."
2,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,0,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2020-11-03,1,Public mask mandate,"[Ala. Health Order (Nov. 5, 2020), Ala. Health..."
3,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,0,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2021-10-27,2,,"[Ala. Health Order (Safer Apart) (May 3, 2021)]"
4,1123,"Tallapoosa County, Alabama",ALABAMA,0.1806,0.24,0.1383,0.89,3,0.64,0,...,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352...",Alabama,Tallapoosa,2022-03-06,2,,"[Ala. Health Order (Safer Apart) (May 3, 2021)]"
