# Import

## Library import

In [26]:
import pandas as pd
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
import glob
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

# Analysis

## Hector

### Policies
Importing, merging, and creating a column containing the state of policies.

In [2]:
policies_path = ".\\CCI_C-19_Policies\\data_tables\\policy_data\\table_data\\Current\\"
policies_files = glob.glob(policies_path + "*.csv")

content = [] #store contents from files

for filepath in policies_files:
    
    df = pd.read_csv(filepath, index_col=None)
    #Stripping the string to store the name of the file to a State column
    #State column should be the second column
    df.insert(1,"State", filepath.replace(policies_path,"").replace("_policy.csv",""))
    content.append(df)


policies_df = pd.concat(content)

In [3]:
policies_df

Unnamed: 0,date,State,policy,Restrict/Close,Opening (State),Deferring to County,Testing,Education,Health/Medical,Emergency Level,Transportation,Budget,Social Distancing,Other,Vaccine,Opening (County)
0,6-Mar-20,Alabama,Governor Ivey announced the formation of a Cor...,,,,,,,,,,,Y,,
1,10-Mar-20,Alabama,Governor Ivey sent a memo to state agency head...,,,,,,,,,,,Y,,
2,13-Mar-20,Alabama,Governor Ivey issued a state of emergency for ...,,,,,,,Y,,,,Y,,
3,14-Mar-20,Alabama,Governor Ivey closed all Alabama public school...,Y,,,,Y,,,,,,,,
4,15-Mar-20,Alabama,Governor Ivey authorized directors of all stat...,,,,,,,,,,,Y,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,29-Apr-21,Wyoming,Wyoming’s two remaining COVID-19 statewide pub...,,,,,,,Y,,,,,,
75,13-May-21,Wyoming,Governor Gordon Withdraws Wyoming from COVID-e...,,,,,,,,,Y,,,,
76,21-May-21,Wyoming,The Wyoming Department of Health (WDH) is anno...,,Y,,,Y,,,,,Y,Y,,
77,4-Aug-21,Wyoming,"Governor Will Not Issue Mask Mandate, Stresses...",,,,,,,,,,,Y,,


In [4]:
# Getting all unique values from the columns after "policy" columns in the dataframe
for col in policies_df.columns[3:]:
    print(col,": ",set(policies_df[col]))

Restrict/Close :  {nan, 'Y', ' ', 'y'}
Opening (State) :  {nan, 'Y', 'y'}
Deferring to County :  {nan, 'Y'}
Testing :  {nan, 'Y'}
Education :  {nan, 'Y'}
Health/Medical :  {nan, 'Y', 'y'}
Emergency Level :  {nan, 'Y', 'y'}
Transportation :  {nan, 'Y'}
Budget :  {nan, 'Y', 'y'}
Social Distancing :  {nan, 'Y', 'y'}
Other :  {nan, 'Y', 'y'}
Vaccine :  {nan, 'Y', 'y'}
Opening (County) :  {nan, 'Y'}


In [5]:
# Fixin some of the inconsistencies such as lower case Y seen in
# previous cell.

# Replacing only single lowercase y to Y
policies_df.replace(to_replace=r'^y$',value="Y",regex=True,inplace=True)

In [6]:
# Finding which entry has a whitespace
policies_df[policies_df['Restrict/Close'].str.contains("\s", regex=True, na=False)]

Unnamed: 0,date,State,policy,Restrict/Close,Opening (State),Deferring to County,Testing,Education,Health/Medical,Emergency Level,Transportation,Budget,Social Distancing,Other,Vaccine,Opening (County)
25,11-Apr-20,Kansas,The Kansas Supreme Court upheld the Governor'...,,,,,,,,,,,Y,,


In [7]:
# Reading the policy from entry
list(policies_df["policy"][policies_df['Restrict/Close'].str.contains("\s", regex=True, na=False)])

[" The Kansas Supreme Court upheld the Governor's executive order limiting the size of mass gathering to 10 people, including that of church services and funerals. "]

Policy doesnt provide enough info so will just fill with NaN in the meantime.

In [8]:
policies_df['Restrict/Close'].replace("\s", np.nan ,inplace=True,regex=True)

In [9]:
# Re-verifying all unique values
for col in policies_df.columns[3:]:
    print(col,": ",set(policies_df[col]))

Restrict/Close :  {nan, 'Y'}
Opening (State) :  {nan, 'Y'}
Deferring to County :  {nan, 'Y'}
Testing :  {nan, 'Y'}
Education :  {nan, 'Y'}
Health/Medical :  {nan, 'Y'}
Emergency Level :  {nan, 'Y'}
Transportation :  {nan, 'Y'}
Budget :  {nan, 'Y'}
Social Distancing :  {nan, 'Y'}
Other :  {nan, 'Y'}
Vaccine :  {nan, 'Y'}
Opening (County) :  {nan, 'Y'}


In [10]:
# Dropping, if any, duplicates that are contained from the first 3 cols
policies_df.drop_duplicates(["date","State","policy"], inplace=True)

In [11]:
# Checking dups of only first 2 cols
policies_df[policies_df.duplicated(["date","State"], keep= False)]

Unnamed: 0,date,State,policy,Restrict/Close,Opening (State),Deferring to County,Testing,Education,Health/Medical,Emergency Level,Transportation,Budget,Social Distancing,Other,Vaccine,Opening (County)
4,15-Mar-20,Alabama,Governor Ivey authorized directors of all stat...,,,,,,,,,,,Y,,
5,15-Mar-20,Alabama,The Governor authorized directors of all state...,,,,,,,,,,Y,Y,,
7,17-Mar-20,Alabama,The Governor emphasized measures taken by fina...,,,,,,,,,,,Y,,
8,17-Mar-20,Alabama,The Alabama Department of Public Health issued...,Y,,,,,,,,,,,,
11,20-Mar-20,Alabama,The Governor issued an updated statewide publi...,Y,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,1-Jul-20,Wyoming,The Wyoming Superintendent announced the relea...,,,,,,,,,,,Y,,
54,19-Nov-20,Wyoming,Governor Mark Gordon has authorized additional...,,,,,,Y,,,,,,,
55,19-Nov-20,Wyoming,Governor Mark Gordon announced additional stat...,Y,,,,,,,,,,,,
57,11-Dec-20,Wyoming,Governor Mark Gordon announced the opening of ...,,,,,,,,,,,Y,,


In [12]:
list(policies_df.columns[2:])

['policy',
 'Restrict/Close',
 'Opening (State)',
 'Deferring to County',
 'Testing',
 'Education',
 'Health/Medical',
 'Emergency Level',
 'Transportation',
 'Budget',
 'Social Distancing',
 'Other',
 'Vaccine',
 'Opening (County)']

There seem to be multiple entries per date,state. Maybe merging them could be a good idea.

In [13]:
policies_df.dtypes

date                   object
State                  object
policy                 object
Restrict/Close         object
Opening (State)        object
Deferring to County    object
Testing                object
Education              object
Health/Medical         object
Emergency Level        object
Transportation         object
Budget                 object
Social Distancing      object
Other                  object
Vaccine                object
Opening (County)       object
dtype: object

In [14]:
# Combining data between the date and state columns
# Inspired by
# https://stackoverflow.com/questions/14529838/apply-multiple-functions-to-multiple-groupby-columns

cols = list(policies_df.columns[2:])
dic = {}

for col in cols:
    # Changin datatype to string to avoid issues with nans
    policies_df = policies_df.astype({col: str})
    # Property of joining with commas for the agg
    dic[col] = ",".join

policies_df = policies_df.groupby(["date","State"]).agg(dic).reset_index()


In [15]:
policies_df

Unnamed: 0,date,State,policy,Restrict/Close,Opening (State),Deferring to County,Testing,Education,Health/Medical,Emergency Level,Transportation,Budget,Social Distancing,Other,Vaccine,Opening (County)
0,1-Apr-20,American Samoa,The Governor issued the second amended declara...,Y,,,,,,,,,,,,
1,1-Apr-20,Arizona,The Governor announced the contribution of $25...,"nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,Y,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","Y,Y,Y,Y","nan,nan,nan,nan","nan,nan,nan,nan"
2,1-Apr-20,Arkansas,The Governor implemented day use operations on...,"nan,nan","nan,nan","nan,nan","nan,Y","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","Y,Y","nan,nan","nan,nan"
3,1-Apr-20,California,The Governor issued an executive order that al...,"nan,nan","nan,nan","nan,nan","nan,nan","nan,Y","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","Y,nan","nan,nan","nan,nan"
4,1-Apr-20,Colorado,The Governor announced that he would extend th...,"Y,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,Y","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,9-Sep-20,New Hampshire,Governor Chris Sununu issued Emergency Order #...,,,,,,,Y,,,,Y,,
5084,9-Sep-20,New York,Governor Cuomo announced indoor dining in New ...,,Y,,,,,,,,,,,
5085,9-Sep-20,Oklahoma,The Oklahoma State Department of Health (OSDH)...,,Y,,,,,,,,,,,
5086,9-Sep-20,South Dakota,Governor Kristi Noem laid out a framework for ...,,,,,,,,,Y,,Y,,


In [16]:
#Iterating through dataframe to
#Fix 

# for index, row in policies_df.iterrows():
    
# #     for col in list(policies_df.columns)[3:]:#only the 4th column onwards
        
# #         for val in row[col]:
# #             splt = val.split(',') #split on comma
# #             print(val)
#     for val in row:
#         print(val)
#     #         if len(splt) > 1: #only if split contains more than 1 value
#     #             if 'Y' in splt: #Filtering only ones containing Y
#     #                 policies_df[index]

#     #             else: #
                

# for index, row in policies_df.iterrows():
#     print(row["date"])

def reducto(df):
    dic = {}
    
    cols = list(df.columns)[3:] #Only the columns 4th and forward
    
    for col in cols:
        for val in df[col]:
            split = val.split(',')
            if len(split) > 1:
                if "Y" in split:
                    dic[col] = "Y"
                else:
                    dic[col] = np.nan
    return pd.Series(dic, index=cols, dtype = object)

policies_df.groupby(["date","State","policy"]).apply(reducto).reset_index()  

Unnamed: 0,date,State,policy,Restrict/Close,Opening (State),Deferring to County,Testing,Education,Health/Medical,Emergency Level,Transportation,Budget,Social Distancing,Other,Vaccine,Opening (County)
0,1-Apr-20,American Samoa,The Governor issued the second amended declara...,,,,,,,,,,,,,
1,1-Apr-20,Arizona,The Governor announced the contribution of $25...,,,,,,Y,,,,,Y,,
2,1-Apr-20,Arkansas,The Governor implemented day use operations on...,,,,Y,,,,,,,Y,,
3,1-Apr-20,California,The Governor issued an executive order that al...,,,,,Y,,,,,,Y,,
4,1-Apr-20,Colorado,The Governor announced that he would extend th...,Y,,,,,Y,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,9-Sep-20,New Hampshire,Governor Chris Sununu issued Emergency Order #...,,,,,,,,,,,,,
5084,9-Sep-20,New York,Governor Cuomo announced indoor dining in New ...,,,,,,,,,,,,,
5085,9-Sep-20,Oklahoma,The Oklahoma State Department of Health (OSDH)...,,,,,,,,,,,,,
5086,9-Sep-20,South Dakota,Governor Kristi Noem laid out a framework for ...,,,,,,,,,,,,,


In [17]:
# Counting most common words, inspired by
# https://www.houseninetytwo.com/how-to-use-python-to-extract-keywords-from-sentance-in-dataframe/
common_words = pd.Series(' '.join(policies_df['policy']).split()).value_counts()[:40]
common_words

to            8915
the           8892
and           7784
of            6481
Governor      5148
The           4067
for           3993
in            3051
announced     2780
a             2679
that          2102
COVID-19      1752
will          1581
with          1352
an            1340
on            1321
Department    1258
state         1142
Health        1124
order         1056
health        1048
issued        1030
be             885
as             860
at             837
public         803
by             768
or             747
from           742
signed         731
care           719
executive      719
Order          713
is             710
all            696
are            643
new            623
Executive      620
State          610
million        559
dtype: int64

In [18]:
uncommon_words = pd.Series(' '.join(policies_df['policy']).split()).value_counts()[-40:]
uncommon_words

Scarborough           1
H.                    1
Pfzer                 1
frst                  1
dose).                1
29th                  1
Modification,         1
Hit                   1
Hardest               1
Plus,                 1
$25M                  1
“Bridge               1
Fund”                 1
Medicare-certified    1
relatives             1
deaths.               1
can,                  1
schools,Regional      1
Northeastern          1
(i.e.                 1
fnal                  1
weeks)                1
charting              1
3411,                 1
uninterrupted         1
governs               1
designations.The      1
Memphis               1
401                   1
2018                  1
Stakeholders          1
(2                    1
K-12,                 1
officials.,The        1
zoos.                 1
210,000               1
Taiwanese             1
Corporations          1
Were                  1
Dakota’s              1
dtype: int64

In [19]:
# Convert all to lowercase for case sensitivity, as stopwords are lowercase
policies_df['policy'] = policies_df['policy'].str.lower()

In [29]:
#list of stop words (i can add more words if necesary
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [30]:
# Remove useless words using the stop words
policies_df['policy'] = policies_df['policy'].apply(lambda x: ' '.join(item for item in x.split() if item not in stop_words))
policies_df.head(50)

Unnamed: 0,date,State,policy,Restrict/Close,Opening (State),Deferring to County,Testing,Education,Health/Medical,Emergency Level,Transportation,Budget,Social Distancing,Other,Vaccine,Opening (County)
0,1-Apr-20,American Samoa,governor issued second amended declaration con...,Y,,,,,,,,,,,,
1,1-Apr-20,Arizona,"governor announced contribution $250,000 arizo...","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,Y,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","nan,nan,nan,nan","Y,Y,Y,Y","nan,nan,nan,nan","nan,nan,nan,nan"
2,1-Apr-20,Arkansas,"governor implemented day use operations only, ...","nan,nan","nan,nan","nan,nan","nan,Y","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","Y,Y","nan,nan","nan,nan"
3,1-Apr-20,California,governor issued executive order allows immedia...,"nan,nan","nan,nan","nan,nan","nan,nan","nan,Y","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","Y,nan","nan,nan","nan,nan"
4,1-Apr-20,Colorado,governor announced would extend suspension nor...,"Y,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,Y","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan"
5,1-Apr-20,Connecticut,governor issued executive order taking followi...,"nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","nan,nan","Y,nan","Y,Y","nan,nan","nan,nan"
6,1-Apr-20,Delaware,governor signed theninth modification state em...,Y,,,,,,,,,,,,
7,1-Apr-20,Florida,governor issued statewide stay-at-home go effe...,Y,,,,,,,,,,,,
8,1-Apr-20,Georgia,governor announced thatthe georgia national gu...,"nan,Y,nan","nan,nan,nan","nan,nan,nan","nan,nan,nan","nan,nan,nan","nan,nan,nan","nan,nan,nan","nan,nan,nan","nan,nan,nan","nan,nan,nan","Y,nan,Y","nan,nan,nan","nan,nan,nan"
9,1-Apr-20,Illinois,governor signed proclamation extending stay-at...,Y,,,,,,,,,,,,


#### Policies Dictionary

No column dictionary was provided in the repository for these. Will make my own.

**The following are the primary cols**

date: date it occured

State: string of State

policy: string of actions taken by the state

**The rest are just booleans, of which are mostly either Y or nan**

Restrict/Close: Closure of unesential businesses?

Opening (State): Opened the state to foreigners?

Deferring to County: ?

Testing: Covid testing began?

Education: Physical attendance of education again?

Health/Medical: ?

Emergency Level: ?

Transportation: Public transportation??
       
Budget: ?
       
Social Distancing: Social distancing was active?

Other: ?
       
Vaccine: Vaccines were active?

Opening (County): ?

#### Will attempt keyword extraction from the dataset