In [1]:
from bs4 import BeautifulSoup
import requests
import os
import datetime
import pandas as pd

In [2]:
# Location of the locally saved copy of the NYT article. The directory and file
# name are passed as separate components so os.path.join actually builds the path
# with the platform's separator (a single pre-joined string made the call a no-op).
filepath = os.path.join(
    "Resources",
    "See Which States and Cities Have Told Residents to Stay at Home - The New York Times.html",
)
# Read the whole page as UTF-8 text; the context manager closes the file afterwards.
with open(filepath, encoding='utf-8') as file:
    html = file.read()

In [3]:
# Parse the saved page with the 'html.parser' backend so the article's elements
# can be queried with CSS selectors in the cells below.
soup = BeautifulSoup(html, 'html.parser')

In [4]:

#Grabbing the states from the article.
#The selector matches every <h3> that is a direct child of a <div class="state-wrap">.

States = soup.select('div.state-wrap > h3')
print(States[0])  # first match, to eyeball the markup
len(States)       # number of matched headings

<h3>Alabama <span class="l-population">About 4.9 million people</span></h3>


47

In [5]:

#Stripping down to just the state names.
#contents[0] is the text node that precedes the population <span>, e.g. 'Alabama '.
#It is converted to a plain str and stripped so the names carry no trailing
#whitespace and no longer reference the parse tree.

states = [str(state.contents[0]).strip() for state in States]

Alabama 
Alaska 
Arizona 
California 
Colorado 
Connecticut 
Delaware 
District of Columbia 
Florida 
Georgia 
Hawaii 
Idaho 
Illinois 
Indiana 
Kansas 
Kentucky 
Louisiana 
Maine 
Maryland 
Massachusetts 
Michigan 
Minnesota 
Mississippi 
Missouri 
Montana 
Nevada 
New Hampshire 
New Jersey 
New Mexico 
New York 
North Carolina 
Ohio 
Oklahoma
Oregon 
Pennsylvania 
Puerto Rico 
Rhode Island 
South Carolina 
Tennessee 
Texas 
Utah
Vermont 
Virginia 
Washington 
West Virginia 
Wisconsin 
Wyoming


In [6]:
'''
Grabbing the orders and dates from the article.
orders being: [stay at home, shelter in-place, etc.]
'''
# Every <p class="l-order"> that is a direct child of a <div class="place-wrap">.
Orders = soup.select('div.place-wrap > p.l-order')
print(Orders[0])  # sample element: order text followed by a <span class="l-date">

<p class="l-order">Stay at home<span class="l-date">, effective April 4 at 5 p.m.</span></p>


In [7]:

#Splitting every order paragraph into two parallel lists:
#   order_type           -> the leading text node (e.g. 'Stay at home')
#   effective_date_dirty -> the <span class="l-date"> tag that follows it

order_type = [paragraph.contents[0] for paragraph in Orders]
effective_date_dirty = [paragraph.contents[1] for paragraph in Orders]

Stay at home
<span class="l-date">, effective April 4 at 5 p.m.</span>
Stay at home
<span class="l-date">, effective March 28 at 5 p.m.</span>
Stay at home
<span class="l-date">, effective March 31 at 5 p.m.</span>
Stay at home
<span class="l-date">, effective March 19</span>
Stay at home
<span class="l-date">, effective March 26 at 6 a.m.</span>
Stay at home
<span class="l-date">, effective March 23 at 8 p.m.</span>
Shelter in place
<span class="l-date">, effective March 24 at 8 a.m.</span>
Stay at home
<span class="l-date">, effective April 1 at 12:01 a.m.</span>
Stay at home
<span class="l-date">, effective April 3 at 12:01 a.m.</span>
Shelter in place
<span class="l-date">, effective April 3</span>
Stay at home
<span class="l-date">, effective March 25 at 12:01 a.m.</span>
Stay at home
<span class="l-date">, effective March 25 at 1:30 p.m.</span>
Stay at home
<span class="l-date">, effective March 21 at 5 p.m.</span>
Stay at home
<span class="l-date">, effective March 24 at 11:59 p

In [8]:
#Up next is a whole lot of trying to get the dates in the right format to store.
#First, a look at the first ten raw <span class="l-date"> tags collected above.

effective_date_dirty[0:10]

[<span class="l-date">, effective April 4 at 5 p.m.</span>,
 <span class="l-date">, effective March 28 at 5 p.m.</span>,
 <span class="l-date">, effective March 31 at 5 p.m.</span>,
 <span class="l-date">, effective March 19</span>,
 <span class="l-date">, effective March 26 at 6 a.m.</span>,
 <span class="l-date">, effective March 23 at 8 p.m.</span>,
 <span class="l-date">, effective March 24 at 8 a.m.</span>,
 <span class="l-date">, effective April 1 at 12:01 a.m.</span>,
 <span class="l-date">, effective April 3 at 12:01 a.m.</span>,
 <span class="l-date">, effective April 3</span>]

In [9]:
'''
Started by making a list:(kinda_dirty_dates) that removes the html tags from the entries.
From that list, I made another list:(kinda_dirty) that removes all the extra text.
--All of the kinda_dirty elements are str.
'''
# Text inside each <span class="l-date">, e.g. ', effective April 4 at 5 p.m.'
kinda_dirty_dates = [stuff.contents[0] for stuff in effective_date_dirty]

# Built once, after kinda_dirty_dates is complete (previously this was recomputed on
# every loop iteration and was left undefined when there were no dates at all).
# First drop the ', effective ' prefix, then drop any ' at <time>' suffix, leaving
# just 'Month day'.
kinda_dirty = [
    kinda.split(', effective ', 1)[1].split(' at', 1)[0]
    for kinda in kinda_dirty_dates
]

In [37]:
# Peek at the first ten tag-free strings (still carrying the ', effective ' prefix).
# NOTE(review): this cell's execution count (37) is out of sequence with its neighbours.
kinda_dirty_dates[0:10]

[', effective April 4 at 5 p.m.',
 ', effective March 28 at 5 p.m.',
 ', effective March 31 at 5 p.m.',
 ', effective March 19',
 ', effective March 26 at 6 a.m.',
 ', effective March 23 at 8 p.m.',
 ', effective March 24 at 8 a.m.',
 ', effective April 1 at 12:01 a.m.',
 ', effective April 3 at 12:01 a.m.',
 ', effective April 3']

In [38]:
# Peek at the first ten trimmed date strings.
# NOTE(review): this cell's execution count (38) is out of sequence; the entries only
# show a ', 2020' suffix once the year-appending cell further down has been run.
kinda_dirty[0:10]

['April 4, 2020',
 'March 28, 2020',
 'March 31, 2020',
 'March 19, 2020',
 'March 26, 2020',
 'March 23, 2020',
 'March 24, 2020',
 'April 1, 2020',
 'April 3, 2020',
 'April 3, 2020']

In [12]:

#Just adding the year to each date in kinda_dirty.
#Entries that already end with the year are left untouched, so re-running this cell
#cannot produce 'April 4, 2020, 2020' and break the '%B %d, %Y' parse that follows.

year = ', 2020'
kinda_dirty = [dirt if dirt.endswith(year) else dirt + year for dirt in kinda_dirty]

In [13]:
#Parsing every 'Month day, year' string in kinda_dirty into a datetime object;
#the results are collected in the list:(clean_dates).

clean_dates = [datetime.datetime.strptime(text, '%B %d, %Y') for text in kinda_dirty]

In [14]:
# Peek at the first ten parsed datetime objects.
clean_dates[0:10]

In [15]:
# With the states, dates, and quarantine order types each in their own list,
# compare the lengths to see whether they'll fit nicely into a dictionary.
count_lines = [
    'List Counts',
    f'States: {len(states)}',
    f'Clean Dates: {len(clean_dates)}',
    f'Order Type: {len(order_type)}',
]
print('\n'.join(count_lines))

List Counts
States: 47
Clean Dates: 57
Order Type: 57


In [16]:
# 47 states against 57 dates and orders means the lists do not line up yet.
# Each state has a single statewide quarantine date except Oklahoma, Utah, and Wyoming,
# where different counties, towns, and cities issued their orders at different times.
# Using the webpage as a reference, the outlying dates are located and the latest date
# for each of those states is taken as that state's closing date -- the logic being
# you're only as strong as your weakest link in terms of positive covid-case count.
index_lines = ['Index by State'] + [
    f'{name}: {states.index(name)}' for name in ('Oklahoma', 'Utah', 'Wyoming')
]
print('\n'.join(index_lines))

Index by State
Oklahoma: 32
Utah: 40
Wyoming: 46


In [17]:
# Oklahoma has no single entry: slots 32-40 of clean_dates hold its nine local
# entries (32 is also Oklahoma's index in states). Keep the latest of them.
# NOTE(review): must run before the deletion cells below, which shift these indices.
Oklahoma_date = max(clean_dates[32:41]) #Nine counties in Oklahoma.

In [18]:
#clean_dates[32:41]
#Above and below I'm just grabbing the latest quarantine start date for each state.

In [19]:
# Utah's three local entries sit at slots 48-50 of clean_dates: its index in states
# is 40, shifted by Oklahoma's eight extra rows. Keep the latest of them.
# NOTE(review): must run before the deletion cells below, which shift these indices.
Utah_date = max(clean_dates[48:51]) #Just three in Utah

In [20]:
#clean_dates[48:51]
# Index ranges of the per-locality rows to drop from clean_dates and order_type.
unwanted = range(48, 51)      # Utah's three entries
RLY_unwanted = range(32, 41)  # Oklahoma's nine entries

In [21]:
# Drop the `unwanted` rows from both lists, highest index first so the positions
# still to be removed are not shifted by earlier deletions.
# NOTE: mutates the lists in place -- running this cell twice removes further rows.
for position in reversed(unwanted):
    del clean_dates[position]
    del order_type[position]

In [22]:
# Drop the `RLY_unwanted` rows from both lists, again from the highest index down
# so earlier deletions do not shift the positions still to be removed.
# NOTE: mutates the lists in place -- running this cell twice removes further rows.
for position in reversed(RLY_unwanted):
    del clean_dates[position]
    del order_type[position]

In [23]:
# Both lists should now have the same, shorter length (dates first, then orders).
print(len(clean_dates), len(order_type), sep='\n')

45
45


In [24]:
# Now that the extra dates are removed, the proper dates for Oklahoma and Utah
# can be reinserted.
# (Wyoming doesn't have to be touched since it's the last in the list.)
print('')




In [25]:
# Put Oklahoma's single (latest) date back at Oklahoma's position in `states`.
# The position is looked up by name instead of hard-coding 32, so the two lists
# stay aligned even if the scraped state list changes.
clean_dates.insert(states.index("Oklahoma"), Oklahoma_date)

In [26]:
#Making Sure Ohio, Oklahoma, and Oregon dates match up to the webpage
#(slots 31, 32 and 33 after the Oklahoma date has been re-inserted)
clean_dates[31:34]

[datetime.datetime(2020, 3, 23, 0, 0),
 datetime.datetime(2020, 4, 6, 0, 0),
 datetime.datetime(2020, 3, 23, 0, 0)]

In [27]:
# Length after re-inserting Oklahoma's date; Utah's date is still to be added.
len(clean_dates)

46

In [28]:
# Put Utah's single (latest) date back at Utah's position in `states`.
# Looked up by name instead of hard-coding 40; this relies on Oklahoma's date
# having been re-inserted already, so every earlier slot is aligned with `states`.
clean_dates.insert(states.index("Utah"), Utah_date)

In [29]:
# Length check after the date inserts: order_type has not been patched yet.
length_report = [
    f'Dates:{len(clean_dates)}',
    f'States:{len(states)}',
    f'Orders:{len(order_type)}',
]
print('\n'.join(length_report))

Dates:47
States:47
Orders:45


In [30]:
#Insert the Utah and Oklahoma order types

In [31]:
# Oklahoma and Utah have no single statewide order type, so a "Varying" placeholder
# goes in at each state's own position in `states` (looked up by name instead of the
# hard-coded 32 and 40). Oklahoma must be inserted first: its placeholder shifts the
# later entries so that Utah's slot lines up.
order_type.insert(states.index("Oklahoma"), "Varying")
order_type.insert(states.index("Utah"), "Varying")

In [32]:
# Spot-check that the placeholder order types landed next to the right states
# (index 32 and index 40 of both lists).
print(f'''State    Order Type
{states[32]}    {order_type[32]}
{states[40]}        {order_type[40]}''')

State    Order Type
Oklahoma    Varying
Utah        Varying


In [33]:
# Final length check: all three lists must match before building the dataframe.
print(
    f'Dates:{len(clean_dates)}',
    f'States:{len(states)}',
    f'Orders:{len(order_type)}',
    sep='\n',
)

Dates:47
States:47
Orders:47


In [34]:
#Now we're good to put everything into a dataframe

In [35]:
# zip() silently stops at the shortest input, so a misaligned scrape would still
# produce a plausible-looking (but wrong) frame. Fail loudly instead.
if not (len(states) == len(clean_dates) == len(order_type)):
    raise ValueError(
        f'Misaligned lists: {len(states)} states, '
        f'{len(clean_dates)} dates, {len(order_type)} order types'
    )

# One row per state: its name, quarantine start date, and order type.
df = pd.DataFrame(list(zip(states, clean_dates, order_type)),
               columns =['state', 'date', 'order_type'])
df.head()

Unnamed: 0,state,date,order_type
0,Alabama,2020-04-04,Stay at home
1,Alaska,2020-03-28,Stay at home
2,Arizona,2020-03-31,Stay at home
3,California,2020-03-19,Stay at home
4,Colorado,2020-03-26,Stay at home


In [36]:
#df.to_csv('CSVs/Quarantine_Dates_By_State.csv', encoding='utf-8', index=False)