# Research Questions: 
    - How has the pandemic affected US travel?
    - How did vaccines affect virus prevelance?
    - How did vaccines affect travel?
------

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# from bs4 import BeautifulSoup
# import requests
from datetime import datetime

from   scipy.optimize import curve_fit
from   sklearn.linear_model import LinearRegression


%matplotlib inline

In [3]:
# links for jsons, with personal app token and limit set to maximum (50000 entries)
link_mobility = "https://data.bts.gov/resource/w96p-f2qv.json?level=State&$$app_token=l3yoUDmlpTGuarN4rDzQIn4Bk&$limit=50000"
link_covid = "https://data.cdc.gov/resource/9mfq-cb36.json?$$app_token=l3yoUDmlpTGuarN4rDzQIn4Bk&$limit=50000"
link_vaccine = "https://data.cdc.gov/resource/unsk-b7fc.json?$$app_token=l3yoUDmlpTGuarN4rDzQIn4Bk&$limit=50000"
link_county_covid = "https://data.cdc.gov/resource/nra9-vzzn.json?$$app_token=l3yoUDmlpTGuarN4rDzQIn4Bk&$limit=50000"
file_county_internet = "internet_data.csv"
file_fips_to_zip = "ZIP-COUNTY-FIPS_2017-06.csv"

# link_internet = "https://raw.githubusercontent.com/BroadbandNow/Open-Data/master/broadband_data_opendatachallenge.csv"



# used this page to figure out api data retrieval:
# https://dev.socrata.com/foundry/data.bts.gov/w96p-f2qv 

# reading data into dfs
og_fips_to_zip = pd.read_csv(file_fips_to_zip)
og_county_covid_df = pd.read_json(link_county_covid)
og_county_internet_df = pd.read_csv(file_county_internet,encoding='ISO-8859-1')
og_mobility_df = pd.read_json(link_mobility)
og_covid_df = pd.read_json(link_covid)
og_vaccine_df = pd.read_json(link_vaccine)

In [13]:
#resets to original data

mobility_df = og_mobility_df.copy()
covid_df = og_covid_df.copy()
vaccine_df = og_vaccine_df.copy()
fips_to_zip_df = og_fips_to_zip.copy()
county_covid_df = og_county_covid_df.copy()
county_internet_df = og_county_internet_df.copy()

In [14]:
mobility_df.head()

Unnamed: 0,level,date,state_fips,state_code,pop_stay_at_home,pop_not_stay_at_home,trips,trips_1,trips_1_3,trips_3_5,trips_5_10,trips_10_25,trips_25_50,trips_50_100,trips_100_250,trips_250_500,trips_500,row_id,week,month
0,State,2019-01-01,32,NV,753054,2272063,9139676,2999886,2181216,1039500,1378024,1108436,222837,93301,73578,24471,18427,32-00000-20190101,0,1
1,State,2019-01-01,12,FL,4524666,16709556,55258300,16036792,14469767,6658624,7977612,6604304,2113009,796740,412224,95341,93887,12-00000-20190101,0,1
2,State,2019-01-01,11,DC,241030,459278,3114055,1314825,833796,367547,344116,183744,42096,10730,8109,3476,5616,11-00000-20190101,0,1
3,State,2019-01-01,10,DE,224457,739757,2420942,584323,640348,305441,370359,314102,128454,50542,22701,2474,2198,10-00000-20190101,0,1
4,State,2019-01-01,9,CT,853894,2707851,9468961,2375021,2618933,1189575,1331356,1211252,449947,192960,56618,12031,31268,09-00000-20190101,0,1


In [15]:
# cleaning fips_to_zips_df

col_to_keep_fips_to_zip = [
    'ZIP',
    'COUNTYNAME', 
    'STATE',
    'STCOUNTYFP',
#   'CLASSFP'
    ]
fips_to_zip_df = fips_to_zip_df.loc[:,col_to_keep_fips_to_zip]

# list of state names and abbreviations

# https://www.ssa.gov/international/coc-docs/states.html for list below (non-states except d.c. removed)
names_abbs = '''ALABAMA	AL
                ALASKA	AK
                ARIZONA	AZ
                ARKANSAS	AR
                CALIFORNIA	CA
                COLORADO	CO
                CONNECTICUT	CT
                DELAWARE	DE
                DISTRICT OF COLUMBIA	DC
                FLORIDA	FL
                GEORGIA	GA
                HAWAII	HI
                IDAHO	ID
                ILLINOIS	IL
                INDIANA	IN
                IOWA	IA
                KANSAS	KS
                KENTUCKY	KY
                LOUISIANA	LA
                MAINE	ME
                MARYLAND	MD
                MASSACHUSETTS	MA
                MICHIGAN	MI
                MINNESOTA	MN
                MISSISSIPPI	MS
                MISSOURI	MO
                MONTANA	MT
                NEBRASKA	NE
                NEVADA	NV
                NEW HAMPSHIRE	NH
                NEW JERSEY	NJ
                NEW MEXICO	NM
                NEW YORK	NY
                NORTH CAROLINA	NC
                NORTH DAKOTA	ND
                OHIO	OH
                OKLAHOMA	OK
                OREGON	OR
                PENNSYLVANIA	PA
                RHODE ISLAND	RI
                SOUTH CAROLINA	SC
                SOUTH DAKOTA	SD
                TENNESSEE	TN
                TEXAS	TX
                UTAH	UT
                VERMONT	VT
                VIRGINIA	VA
                WASHINGTON	WA
                WEST VIRGINIA	WV
                WISCONSIN	WI
                WYOMING	WY'''

names_abbs = [i.strip().split('\t') for i in names_abbs.split('\n')]
names_abbs = {i[0]:i[1] for i in names_abbs}

## Cleaning the COVID data

In [16]:
covid_df.rename(columns = {'submission_date':'date'},inplace=True)    # change submission_date to date

# drop irrelevant columns
col_to_keep_covid = ['date', 
                     'state', 
                     'tot_cases',
                     'new_case',
                     'tot_death', 
                     'new_death'
                    ]
covid_df = covid_df.loc[:, col_to_keep_covid]

new_dates = covid_df.date.str.replace('T',' ')    # correct date values in covid_df

covid_df.date = new_dates    # set date column to correct dates

covid_df.date = pd.to_datetime(covid_df.date)    # change date values to datetime format

covid_df.set_index(['date'],inplace=True)    # set index to date

# remove invalid states
good_states = covid_df.state.isin(names_abbs.values())
covid_df = covid_df.loc[good_states]

# Cleaning vaccine data

In [17]:
vaccine_df.rename(columns = {'location':'state'},inplace=True)    # change location to state

# drop irrelevant columns
col_to_keep_vaccines = ['date',
                        'state',
                        'administered',
#                         'administered_12plus',
#                         'administered_18plus',
#                         'administered_65plus', 
                        'series_complete_yes', 
#                         'series_complete_12plus', 
#                         'series_complete_18plus',
#                         'series_complete_65plus'
                       ]
vaccine_df = vaccine_df.loc[:,col_to_keep_vaccines]

vaccine_df.set_index(['date'], inplace=True)    # Set index to date

# remove invalid states
good_states = vaccine_df.state.isin(names_abbs.values())
vaccine_df = vaccine_df.loc[good_states]

# show df preview and shape
print(vaccine_df.shape)
vaccine_df.head()

(17289, 3)


Unnamed: 0_level_0,state,administered,series_complete_yes
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-11-17,RI,1664276,760979
2021-11-17,OR,5957705,2679580
2021-11-17,NE,2489596,1101301
2021-11-17,WY,609612,260369
2021-11-17,WV,1686624,743196


# Cleaning mobility data

In [59]:
mobility_df.rename(columns = {'state_code':'state'}, inplace=True)    # change state_code to state

# drop irrelevant columns
col_to_keep_mobility = ['date',
                        'state',
                        'pop_stay_at_home',
                        'pop_not_stay_at_home',
                        'trips',
                        'trips_1',
                        'trips_1_3',
                        'trips_3_5',
                        'trips_5_10',
                        'trips_10_25',
                        'trips_25_50',
                        'trips_50_100',
                        'trips_100_250',
                        'trips_250_500',
                        'trips_500'
                       ]


# distance_bins = [
#     ['trips'],
#     ['trips_1', 'trips_1_3', 'trips_3_5', 'trips_5_10','trips_10_25','trips_25_50'],
#     ['trips_50_100', 'trips_100_250', 'trips_250_500'],
#     ['trips_500']
# ]

mobility_df = mobility_df.loc[:,col_to_keep_mobility]

mobility_df.set_index(['date'], inplace=True)    # Set index to date


# remove invalid states
good_states = mobility_df.state.isin(names_abbs.values())
mobility_df = mobility_df.loc[good_states]

In [69]:
mobility_df['trips_1_50'] = mobility_df['trips_1']+mobility_df['trips_1_3']+mobility_df['trips_3_5']+mobility_df['trips_5_10']+mobility_df['trips_10_25']+mobility_df['trips_25_50']
mobility_df['trips_50_500'] = mobility_df['trips_50_100']+mobility_df['trips_100_250']+mobility_df['trips_250_500']

In [71]:
mobility_df.drop(columns=['trips_1', 'trips_1_3', 'trips_3_5', 'trips_5_10','trips_10_25','trips_25_50', 'trips_50_100', 'trips_100_250', 'trips_250_500'],inplace=True)

# Merging the dataframes
The dataframs will be merged on the `state` and `date` columns.

In [72]:
# merge dfs
step1_merge = pd.merge(mobility_df,covid_df,on=['state','date'],how='outer')
merged_df1 = pd.merge(step1_merge,vaccine_df,on=['state','date'],how='outer')

# merged_df1 = merged_df1.loc[pd.notnull(merged_df1.tot_cases)].sort_index() # only rows where total cases is not NaN
# save cleaned data to a microsoft (onedrive) repository
merged_df1.to_csv('/Users/josephgreene/OneDrive/School/Cornell/Sophomore/Intro to Data Science/Project/Project_Data',index=True)

In [73]:
merged_df1.loc[merged_df1.state == 'MA'].describe()

Unnamed: 0,pop_stay_at_home,pop_not_stay_at_home,trips,trips_500,trips_1_50,trips_50_500,tot_cases,new_case,tot_death,new_death,administered,series_complete_yes
count,981.0,981.0,981.0,981.0,981.0,981.0,665.0,665.0,665.0,665.0,339.0,339.0
mean,1706289.0,5186191.0,29776330.0,58144.892966,29267940.0,450251.030581,386032.517293,1322.714286,11766.37594,28.893233,6206166.0,2781013.0
std,260935.4,254235.5,5185838.0,60896.085998,5143942.0,113518.458265,306773.590991,1498.333374,6254.362567,36.496316,3802285.0,1926282.0
min,1243357.0,4413096.0,12938540.0,8019.0,12558620.0,162178.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1496100.0,4999984.0,25207920.0,19891.0,24651590.0,368473.0,104659.0,212.0,8248.0,6.0,2212626.0,701029.5
50%,1628752.0,5260983.0,29947390.0,33119.0,29455250.0,459090.0,319270.0,965.0,11810.0,16.0,8067293.0,3711007.0
75%,1902165.0,5390215.0,34378460.0,59347.0,33895190.0,518161.0,707523.0,1775.0,17893.0,36.0,9390898.0,4510156.0
max,2489053.0,5637696.0,40581510.0,318119.0,39939260.0,809503.0,881310.0,9003.0,19214.0,203.0,11104360.0,4859946.0


In [76]:
merged_df1.corr()

Unnamed: 0,pop_stay_at_home,pop_not_stay_at_home,trips,trips_500,trips_1_50,trips_50_500,tot_cases,new_case,tot_death,new_death,administered,series_complete_yes
pop_stay_at_home,1.0,0.935733,0.876171,0.629955,0.873611,0.909043,0.591407,0.540596,0.606859,0.515202,0.66962,0.610788
pop_not_stay_at_home,0.935733,1.0,0.967183,0.701019,0.965422,0.958052,0.643193,0.49562,0.660026,0.475966,0.753928,0.69395
trips,0.876171,0.967183,1.0,0.756025,0.999954,0.913325,0.701633,0.444882,0.724477,0.415256,0.809581,0.756074
trips_500,0.629955,0.701019,0.756025,1.0,0.755859,0.666867,0.48509,0.344597,0.494061,0.335336,0.754531,0.704541
trips_1_50,0.873611,0.965422,0.999954,0.755859,1.0,0.909401,0.702076,0.443041,0.725239,0.413862,0.809717,0.756282
trips_50_500,0.909043,0.958052,0.913325,0.666867,0.909401,1.0,0.634631,0.477679,0.64452,0.433405,0.741188,0.689035
tot_cases,0.591407,0.643193,0.701633,0.48509,0.702076,0.634631,1.0,0.504062,0.971859,0.452481,0.869804,0.834121
new_case,0.540596,0.49562,0.444882,0.344597,0.443041,0.477679,0.504062,1.0,0.476101,0.690322,0.267238,0.249417
tot_death,0.606859,0.660026,0.724477,0.494061,0.725239,0.64452,0.971859,0.476101,1.0,0.441651,0.849724,0.814134
new_death,0.515202,0.475966,0.415256,0.335336,0.413862,0.433405,0.452481,0.690322,0.441651,1.0,0.20274,0.16701


In [22]:
merged_df1.head()

Unnamed: 0_level_0,state,pop_stay_at_home,pop_not_stay_at_home,trips,trips_1,trips_1_3,trips_3_5,trips_5_10,trips_10_25,trips_25_50,trips_50_100,trips_100_250,trips_250_500,trips_500,tot_cases,new_case,tot_death,new_death,administered,series_complete_yes
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-01,NV,753054.0,2272063.0,9139676.0,2999886.0,2181216.0,1039500.0,1378024.0,1108436.0,222837.0,93301.0,73578.0,24471.0,18427.0,,,,,,
2019-01-01,FL,4524666.0,16709556.0,55258300.0,16036792.0,14469767.0,6658624.0,7977612.0,6604304.0,2113009.0,796740.0,412224.0,95341.0,93887.0,,,,,,
2019-01-01,DC,241030.0,459278.0,3114055.0,1314825.0,833796.0,367547.0,344116.0,183744.0,42096.0,10730.0,8109.0,3476.0,5616.0,,,,,,
2019-01-01,DE,224457.0,739757.0,2420942.0,584323.0,640348.0,305441.0,370359.0,314102.0,128454.0,50542.0,22701.0,2474.0,2198.0,,,,,,
2019-01-01,CT,853894.0,2707851.0,9468961.0,2375021.0,2618933.0,1189575.0,1331356.0,1211252.0,449947.0,192960.0,56618.0,12031.0,31268.0,,,,,,


# Creating a dictionary with state abbreviations and corresponding data

In [23]:
state_data1_dict = {state:merged_df1.loc[merged_df1.state == state] for state in names_abbs.values()}
state_data1_dict['MA'].head()

Unnamed: 0_level_0,state,pop_stay_at_home,pop_not_stay_at_home,trips,trips_1,trips_1_3,trips_3_5,trips_5_10,trips_10_25,trips_25_50,trips_50_100,trips_100_250,trips_250_500,trips_500,tot_cases,new_case,tot_death,new_death,administered,series_complete_yes
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-01,MA,1798521.0,5082532.0,22181890.0,6078316.0,5980407.0,2680470.0,3106759.0,2817590.0,1021091.0,316642.0,104691.0,18791.0,57133.0,,,,,,
2019-01-02,MA,1408154.0,5472899.0,28125611.0,7237304.0,7219021.0,3438621.0,4122162.0,4178675.0,1413952.0,336420.0,106300.0,21239.0,51917.0,,,,,,
2019-01-03,MA,1442723.0,5438330.0,28117808.0,7265580.0,7161397.0,3444979.0,4128933.0,4217732.0,1391118.0,329791.0,102938.0,21196.0,54144.0,,,,,,
2019-01-04,MA,1440797.0,5440256.0,28630786.0,7165246.0,7391199.0,3582607.0,4284623.0,4259228.0,1404307.0,347550.0,115732.0,24146.0,56148.0,,,,,,
2019-01-05,MA,1499659.0,5381394.0,28083625.0,7070600.0,7527639.0,3584861.0,4206453.0,3890073.0,1276922.0,350908.0,109370.0,20830.0,45969.0,,,,,,


# Creating merged_df2
This dataset will contain the county-level covid and internet data

<!-- # DEAR JOE, -->

<!-- you were trying to figure out how to turn datetime into a datatype that can be used as the x in a sns.regplot. -->

In [24]:
# link_county_mobility = "https://data.bts.gov/resource/w96p-f2qv.json?level=County&$$app_token=l3yoUDmlpTGuarN4rDzQIn4Bk&$limit=50000"
# county_mobility_df = pd.read_json(link_county_mobility)
# county_mobility_df = county_mobility_df.copy().sort_values(by='date')

In [41]:
# county_internet_df = og_county_internet_df

## The fips_to_zip_df
This dataframe allows us to add zip codes to the covid county data

In [95]:
fips_to_zip_df.rename(columns={'ZIP':'Zip','STCOUNTYFP':"fips_code","STATE":"State","COUNTYNAME":"County"},inplace=True)
fips_to_zip_df['County'] = fips_to_zip_df.County.apply(lambda x: x.rstrip(' County'))
print(fips_to_zip_df.shape)
fips_to_zip_df.sample(10)

(52889, 5)


Unnamed: 0,Zip,County,State,fips_code,CLASSFP
38276,97302,Mari,OR,41047,H1
1444,86015,Coconi,AZ,4005,H1
18787,70602,Calcasieu Parish,LA,22019,H1
18309,42378,Ohi,KY,21183,H1
49459,98063,King,WA,53033,H1
25713,64474,DeKalb,MO,29063,H1
20373,21667,Ke,MD,24029,H1
28673,89705,Carson Ci,NV,32510,C7
51009,26105,Wood,WV,54107,H1
49280,98606,Clark,WA,53011,H1


In [120]:
county_internet_df['County']

0            Abbeville
1            Abbeville
2            Abbeville
3            Abbeville
4               Acadia
             ...      
32603    Yukon-Koyukuk
32604    Yukon-Koyukuk
32605    Yukon-Koyukuk
32606    Yukon-Koyukuk
32607    Yukon-Koyukuk
Name: County, Length: 32608, dtype: object

## Cleaning county-level internet data

In [112]:
col_to_keep_internet = [
    'Zip',
    'County',
    'State', 
    'AllProviderCount_2020', 
    'All25_3_2020', 
    'All100_3',
    'TestCount',
    'AverageMbps', 
    'FastestAverageMbps'
]
county_internet_df = county_internet_df.copy().loc[:,col_to_keep_internet]

# county_internet_df.set_index(['County'], inplace=True)

In [137]:
pd.merge(county_internet_df, fips_to_zip_df.loc[:,['County','fips_code']], on='County', how='inner').sort_values(by='fips_code')

# county_internet_df.sort_values(by='County')
# fips_to_zip_df[pd.notnull(fips_to_zip_df['County'])]

Unnamed: 0,Zip,County,State,AllProviderCount_2020,All25_3_2020,All100_3,TestCount,AverageMbps,FastestAverageMbps,fips_code
33367,36006,Autauga,Alabama,9.0,5.0,3.0,,,,1001
33434,36091,Autauga,Alabama,9.0,4.0,3.0,22.0,6.46,13.09,1001
33433,36091,Autauga,Alabama,9.0,4.0,3.0,22.0,6.46,13.09,1001
33432,36091,Autauga,Alabama,9.0,4.0,3.0,22.0,6.46,13.09,1001
33431,36067,Autauga,Alabama,13.0,9.0,6.0,6462.0,98.43,198.85,1001
...,...,...,...,...,...,...,...,...,...,...
644515,96913,Guam,Guam,6.0,3.0,1.0,0.0,0.00,0.00,66010
644516,96915,Guam,Guam,6.0,3.0,1.0,0.0,0.00,0.00,66010
644517,96915,Guam,Guam,6.0,3.0,1.0,0.0,0.00,0.00,66010
644519,96915,Guam,Guam,6.0,3.0,1.0,0.0,0.00,0.00,66010


In [92]:
county_covid_df.rename(columns={"state_name":"State",'county_name':'County'},inplace=True)
county_covid_df['County'] = county_covid_df.County.apply(lambda x: x.rstrip(' County'))


print(county_covid_df.shape)
county_covid_df.sort_values(by='fips_code')

(50000, 7)


Unnamed: 0,State,County,fips_code,date,cases_per_100k_7_day_count,percent_test_results_reported,community_transmission_level
35904,Alabama,Autauga,1001,2021-11-07,141.402,5.38336,high
7617,Alabama,Autauga,1001,2020-05-08,34.008,11.63435,high
27352,Alabama,Autauga,1001,2021-03-21,155.721,12.23529,high
13989,Alabama,Autauga,1001,2020-12-13,528.021,23.50260,high
32782,Alabama,Autauga,1001,2020-11-05,196.889,15.13292,high
...,...,...,...,...,...,...,...
47918,Puerto Rico,Yauco Municipi,72153,2021-07-04,suppressed,3.09278,low
3315,Puerto Rico,Yauco Municipi,72153,2021-07-30,68.503,7.35294,substantial
40154,Puerto Rico,Yauco Municipi,72153,2021-09-12,32.762,5.59006,moderate
36670,Puerto Rico,Yauco Municipi,72153,2021-10-22,suppressed,4.21053,moderate


In [159]:

# pd.to_numeric(county_covid_df.cases_per_100k_7_day_count.str.replace('suppressed','9'))
county_covid_df.cases_per_100k_7_day_count.sort_values() #(by='cases_per+100k_7_day_count')

12177    -189.916
37633       0.000
5728        0.000
37634       0.000
21552       0.000
           ...   
48552         NaN
48558         NaN
48562         NaN
48563         NaN
48564         NaN
Name: cases_per_100k_7_day_count, Length: 50000, dtype: object

In [96]:
county_covid_with_zip_df = pd.merge(county_covid_df, fips_to_zip_df.loc[:,['Zip','fips_code']],on='fips_code',how='inner')

# merging only rows w/ zip codes in county_covid_with_zip_df
# zip_covid_internet_df = 
pd.merge(county_covid_with_zip_df, county_internet_df, how='left', on='fips_code')


KeyError: 'fips_code'

## Cleaning County-level Covid data

In [50]:
# Need to make overall transmission level (get a single value for a county's transmission level across the entire pandemic)

In [None]:
county_covid_df.sort_values(by='date',inplace=True)

ma_df = county_covid_df.loc[county_covid_df['state_name'] == 'Massachusetts']

ma_df.loc[ma_df['county_name'] == 'Berkshire County'].community_transmission_level

In [None]:
ma_df

# COMMENT TO BEGIN FROM LATER
## Dear Joe,
You were last working on the second merged dataframe, looking at how the covid by county, internet, and mobility data lines up. Could check if there is covid deaths by county. Also, could find vaccine by county and do the whole proj. by county. However, county may be too narrow for covid spread?

# Exploratory Data Analysis
## Working with just Massachussetts data

In [None]:
ma_data = state_data_dict["MA"]
# ma_data = ma_data.assign(new_cases_log = ma_data.new_case.apply(lambda x: np.log([x])[0])) # creating columns with the logarithmic function of the new_cases column
# ma_data = ma_data.assign(trips_log = ma_data.trips.apply(lambda x: np.log([x])[0])) # creating columns with the logarithmic function of the trips column

In [None]:
ma_data.describe()

In [None]:
#create scatter plot and linear regression for new cases and trips 
plt.rcParams['figure.figsize'] = (12.0, 8.0)

plot = sns.scatterplot(x='trips',y='new_case',data=ma_data.loc[ma_data.index.year == 2021])
plot2 = sns.regplot(x='trips',y='new_case',data=ma_data.loc[ma_data.index.year == 2021])
plt.ylabel('New Cases');
plt.xlabel('Trips');
plt.title('Massachussetts New cases vs. Trips for 2021')
plt.show()
print('The correlation between new cases and trips in 2021, as shown in the plot, is {corr:.2f}.'.format(corr=ma_data.loc[ma_data.index.year == 2021].new_case.corr(ma_data.loc[ma_data.index.year == 2021].trips)))


In [None]:
# create scatterplot and regression line for new cases vs. fully vaccinated

sns.scatterplot(y='new_case',x='series_complete_yes',data=ma_data.loc[ma_data.index.year == 2021]);
sns.regplot(y='new_case',x='series_complete_yes',data=ma_data.loc[ma_data.index.year == 2021])
plt.ylabel('New Cases')
plt.xlabel('# persons complete vaccine series')
plt.show();
print('The correlation between new cases and new cases in 2021, as shown in the plot, is {corr:.2f}.'.format(corr=ma_data.loc[ma_data.index.year == 2021].new_case.corr(ma_data.series_complete_yes.loc[ma_data.index.year == 2021])))

In [None]:
plt.title('Massachussets 7-day rolling average of new covid cases')
plt.xlabel('Date')
plt.ylabel('New Cases')
sns.lineplot(x='date',y=ma_data.new_case.rolling(7,on=ma_data.index).mean(),data=ma_data);

plt.show();

# Data Description
-----

## Motivation

This dataset was created by Joseph Greene at Cornell University to explore the relationships between the prevalence of COVID-19, degree of travel, and vaccination progress state-by-state in the US over the past few years.

## Composition

The instances represent covid, travel, and vaccine data for every state, for every day, from January 22, 2020, to October 19, 2021.
There are 32,538 instances. There are 51 'states', this is because the District of Columbia is included. Each state has trip data for 594 or 595 days, vaccine data for 311 days, and covid data for 638 days.

The dataset does not contain all possible instances, the APIs do not allow more than 50,000 instances to be retrieved at once. 

Including the index (date), instances consist of nine values. These include the date, state abbreviation, number of trips, the total number of cases and of deaths, number of new cases and of new deaths, number of vaccines administered, and number of people who completed their vaccine series. See a preview of the data below:

In [None]:
merged_df1.head()

Each instance contains two labels, the date, and state abbreviation. The data can be grouped by these labels.

No data is missing, though it may appear that way because the timeline of vaccine distribution, travel recording, and covid cases do not line up completely.

Data from the same date are related because they all occur on the same day. Data from the same state are related because they describe the same location.

No data splits have been done as of now.

There are no known sources of error or redundant pieces of data in the cleaned dataset.

The dataset comes from open-access APIs, there is no guarantee that the data will remain available.  There are no restrictions to accessing the data, however, in order to retrieve a significant amount of instances, one must provide an app token.

- The covid data comes from the CDC through data.gov. It can be accessed at: https://data.cdc.gov/Case-Surveillance/United-States-COVID-19-Cases-and-Deaths-by-State-o/9mfq-cb36
- The vaccine data comes from the CDC through data.gov and can be found at: https://data.cdc.gov/Vaccinations/COVID-19-Vaccinations-in-the-United-States-Jurisdi/unsk-b7fc
- The Travel data comes from the Bureau of Transportation Statistics, through data.gov. It can be found at: https://data.bts.gov/Research-and-Statistics/Trips-by-Distance/w96p-f2qv

The dataset does not contain confidential or offensive data.
Though the data relates to people, it does not contain data on individuals.


# Limitations
The data is best taken relative to previous instances, in other words, comparing changes over time.

Types:
- Deaths: where are deaths recorded? for covid, might be likely they are mostly recorded in hospitals, and hospitals may be unevenly distributed, therefore there is response bias due to a lack of even surveying.
- 

In [None]:
sns.lineplot(x='date',y='new_case',data=merged_df1.loc[merged_df1.index.month == 1], hue=merged_df1.state)

In [None]:
sns.lineplot(data=ma_data.loc[ma_data.index.year == 2021])