# Cleaning locations

In [17]:
import numpy as np
import pandas as pd
import warnings
import math
warnings.filterwarnings('ignore')

# Load the scraped tweets. sep=None asks pandas to sniff the delimiter, which only
# the Python parsing engine can do, so that engine is requested explicitly.
# Malformed rows are dropped without a warning.
try:
    tweets = pd.read_csv("output_got.csv", sep=None, engine="python", on_bad_lines="skip")
except TypeError:
    # pandas < 1.3 does not know on_bad_lines; fall back to the older spelling
    # of the same options (removed again in pandas 2.0).
    tweets = pd.read_csv("output_got.csv", sep=None, engine="python",
                         error_bad_lines=False, warn_bad_lines=False)

# Maps each state's full name (plus the District of Columbia) to the aliases
# that are searched for in a tweet's free-text location. Two-letter aliases are
# listed in upper case and title case; longer aliases are alternative spellings.
# NOTE(review): Indiana has no title-case alias, presumably because "In" is a
# common English word -- several other title-case aliases ("Me", "Or", "Hi",
# "Ok", ...) are common words too and may produce false positives; confirm.
us_states = {
    'Alabama': ['AL', 'Al'],
    'Alaska': ['AK', 'Ak'],
    'Arizona': ['AZ', 'Az'],
    'Arkansas': ['AR', 'Ar'],
    'California': ['CA', 'Ca'],
    'Colorado': ['CO', 'Co'],
    'Connecticut': ['CT', 'Ct'],
    'Delaware': ['DE', 'De'],
    'District Of Columbia': ['DC', 'D.C.', 'Washington, DC', 'Washington, D.C.', 'Washington DC',
                             'Washington D.C.', 'WASH. D.C.'],
    'Florida': ['FL', 'Fl'],
    'Georgia': ['GA', 'Ga'],
    'Hawaii': ['HI', 'Hi'],
    'Idaho': ['ID', 'Id'],
    'Illinois': ['IL', 'Il'],
    'Indiana': ['IN'],
    'Iowa': ['IA', 'Ia'],
    'Kansas': ['KS', 'Ks'],
    'Kentucky': ['KY', 'Ky'],
    'Louisiana': ['LA', 'La'],
    'Maine': ['ME', 'Me'],
    'Maryland': ['MD', 'Md'],
    'Massachusetts': ['MA', 'Ma'],
    'Michigan': ['MI', 'Mi'],
    'Minnesota': ['MN', 'Mn'],
    'Mississippi': ['MS', 'Ms'],
    'Missouri': ['MO', 'Mo'],
    'Montana': ['MT', 'Mt'],
    'Nebraska': ['NE', 'Ne'],
    # Was ['NV', 'Ne']: the title-case alias duplicated Nebraska's, so "Nv"
    # could never resolve to Nevada.
    'Nevada': ['NV', 'Nv'],
    'New Hampshire': ['NH', 'Nh'],
    'New Jersey': ['NJ', 'Nj'],
    'New Mexico': ['NM', 'Nm'],
    'New York': ['NY', 'Ny'],
    'North Carolina': ['NC', 'Nc'],
    'North Dakota': ['ND', 'Nd'],
    'Ohio': ['OH', 'Oh'],
    'Oklahoma': ['OK', 'Ok'],
    'Oregon': ['OR', 'Or'],
    'Pennsylvania': ['PA', 'Pa'],
    'Rhode Island': ['RI', 'Ri'],
    'South Carolina': ['SC', 'Sc'],
    'South Dakota': ['SD', 'Sd'],
    'Tennessee': ['TN', 'Tn'],
    'Texas': ['TX', 'Tx'],
    'Utah': ['UT', 'Ut'],
    'Vermont': ['VT', 'Vt'],
    'Virginia': ['VA', 'Va'],
    'Washington': ['WA', 'Wash.', 'Wash', 'Washington State', 'Wa'],
    'West Virginia': ['WV', 'Wv'],
    'Wisconsin': ['WI', 'Wi'],
    'Wyoming': ['WY', 'Wy'],
}

locations = tweets['Geo'].tolist()


def _find_state(location, states):
    """Return the first state in ``states`` that ``location`` refers to, or None.

    States are tried in dictionary order. A state matches when

    * its full name occurs in ``location`` (case-insensitive substring) and the
      text does not spell out a longer state name that contains it, or
    * one of its two-character aliases equals a whitespace-separated token of
      ``location`` with commas removed (case-sensitive), or
    * one of its other aliases occurs in ``location`` as a substring
      (case-insensitive when the alias is longer than two characters).

    Parameters
    ----------
    location : str
        Free-text location of a tweet.
    states : dict
        Maps a state name to its list of alias strings.
    """
    location_lower = location.lower()
    tokens = location.replace(',', '').split()

    for state, aliases in states.items():
        state_lower = state.lower()
        if state_lower in location_lower:
            # "Virginia" is a substring of "West Virginia" and is tried first;
            # without this check every West Virginia tweet is credited to Virginia.
            shadowed = any(
                other != state
                and state_lower in other.lower()
                and other.lower() in location_lower
                for other in states
            )
            if not shadowed:
                return state

        for alias in aliases:
            if len(alias) == 2:
                # Two-letter codes must be a whole token so that e.g. "CA"
                # does not match inside a longer word.
                if alias in tokens:
                    return state
            elif alias in location or (len(alias) > 2 and alias.lower() in location_lower):
                return state

    return None


# Number of tweets from each state
state_counts = {state: 0 for state in us_states}

cleaned_locations = []

for _, tweet in tweets.iterrows():
    location = tweet['Geo']
    if not isinstance(location, str):  # missing locations are read as NaN
        continue
    matched_state = _find_state(location, us_states)
    if matched_state is None:
        continue
    state_counts[matched_state] += 1
    cleaned_locations.append({
        'Text': tweet['Text'],
        'Hashtags': tweet['Hashtags'],
        'Location': matched_state,
    })

# From here on `tweets` holds only the tweets that were matched to a state.
tweets = pd.DataFrame(cleaned_locations)

# NOTE(review): the values stored here are raw counts; nothing in this cell
# normalizes them by population despite the column name.
state_counts = pd.DataFrame(list(state_counts.items()), columns=['Name', 'Tweet Count (Normalized for State Population)'])

print("Number of tweets from US states:", len(tweets.index))

Number of tweets from US states: 4937


# Web Scraping

In [21]:
from bs4 import BeautifulSoup
from pandas import DataFrame as dataF
import requests
import time

# Scrape the per-state report card from statusofwomendata.org: the index page
# links to one page per state, and each state page has a summary table whose
# rows are read in a fixed order (one row per category list below).
url = "https://statusofwomendata.org/state-data/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
stateNames = []
employmentEarn = []
politicalPart = []
povertyOpp = []
reprodRights = []
healthWell = []
workFam = []

# A timeout keeps a stalled connection from hanging the notebook forever, and
# raise_for_status() turns an HTTP error page into an immediate, clear failure.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()
time.sleep(0.5)
soup = BeautifulSoup(page.text, "lxml")
statesList = soup.find("article", class_="post").find_all("p")

for state in statesList:
    link = state.find("a")
    if link is None or not link.get("href"):
        # Paragraph without a link: nothing to follow.
        continue

    statePage = requests.get(link.get("href"), headers=headers, timeout=30)
    statePage.raise_for_status()
    time.sleep(0.5)  # be polite to the server between page fetches
    stateSoup = BeautifulSoup(statePage.text, "lxml")
    name = stateSoup.find("div", class_="state").find("h1").text

    # Skip the first row of the summary table, then take the second cell of each
    # of the next six rows.
    reportCard = stateSoup.find("div", class_="cardSummary").find_all("tr")[1:]
    grades = [reportCard[i].find_all("td")[1].text for i in range(6)]
    if grades[1] == "-":
        # A dash in the Political Participation row is recorded as "0".
        grades[1] = "0"

    # Append only after the whole page parsed, so all lists stay the same length.
    stateNames.append(name)
    for column, grade in zip(
        (employmentEarn, politicalPart, povertyOpp, reprodRights, healthWell, workFam),
        grades,
    ):
        column.append(grade)



# Write state rankings and the number of tweets per state to state_rankings.csv (counts are not yet adjusted for population)

In [None]:
# Assemble one row per scraped state and fix the column order explicitly.
stateRankings = dataF(
    {
        "Name": stateNames,
        "Employment & Earnings": employmentEarn,
        "Political Participation": politicalPart,
        "Poverty & Opportunity": povertyOpp,
        "Reproductive Rights": reprodRights,
        "Health & Well-Being": healthWell,
        "Work & Family": workFam,
    }
)[
    [
        "Name",
        "Employment & Earnings",
        "Political Participation",
        "Poverty & Opportunity",
        "Reproductive Rights",
        "Health & Well-Being",
        "Work & Family",
    ]
]

# Override the Political Participation value for the District of Columbia.
# NOTE(review): the reason for the value 25 is not visible here -- confirm.
is_dc = stateRankings["Name"].eq("District Of Columbia")
stateRankings.loc[is_dc, "Political Participation"] = 25

# Attach the per-state tweet counts; merge() defaults to an inner join, so a
# state missing from either table is dropped from the output.
stateRankings = stateRankings.merge(state_counts, on="Name")
stateRankings.to_csv("state_rankings.csv", index=False)

# Population of each state, keyed by the same names as the tweet-count table.
# The opening brace must sit on the assignment line; a bare `populations =`
# followed by a newline is a SyntaxError.
# TODO: only the first eleven entries have real figures. The remaining states
# are set to None until their populations are filled in from the same source.
populations = {
    'Alabama': 4863300,
    'Alaska': 741894,
    'Arizona': 6931071,
    'Arkansas': 2988248,
    'California': 39506094,
    'Colorado': 5632271,
    'Connecticut': 3568174,
    'Delaware': 960054,
    'District Of Columbia': 691963,
    'Florida': 20979964,
    'Georgia': 10421344,
    'Hawaii': None,
    'Idaho': None,
    'Illinois': None,
    'Indiana': None,
    'Iowa': None,
    'Kansas': None,
    'Kentucky': None,
    'Louisiana': None,
    'Maine': None,
    'Maryland': None,
    'Massachusetts': None,
    'Michigan': None,
    'Minnesota': None,
    'Mississippi': None,
    'Missouri': None,
    'Montana': None,
    'Nebraska': None,
    'Nevada': None,
    'New Hampshire': None,
    'New Jersey': None,
    'New Mexico': None,
    'New York': None,
    'North Carolina': None,
    'North Dakota': None,
    'Ohio': None,
    'Oklahoma': None,
    'Oregon': None,
    'Pennsylvania': None,
    'Rhode Island': None,
    'South Carolina': None,
    'South Dakota': None,
    'Tennessee': None,
    'Texas': None,
    'Utah': None,
    'Vermont': None,
    'Virginia': None,
    'Washington': None,
    'West Virginia': None,
    'Wisconsin': None,
    'Wyoming': None,
}

