In [2]:
# standard library
import json
import os
import pickle
import re, string
from collections import Counter

# for natural language processing, which is used for the tweets
import nltk
from nltk import tokenize
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

# allows access to tweets with Python
import tweepy as tw
from tweepy import OAuthHandler
from tweepy import Cursor

In [3]:
# Twitter API credentials are read from environment variables so that secrets
# are never stored in (or committed with) the notebook. A variable that is not
# set falls back to '' -- the same placeholder value used previously.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

# authenticated tweepy client shared by the functions below
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True asks tweepy to wait for the rate limit to replenish
# rather than fail (see the tweepy API documentation)
api = tw.API(auth, wait_on_rate_limit=True)

# Maps each US state (plus the District of Columbia) to a "latitude,longitude"
# string. state_attitude() appends a radius to this string to build the
# geocode argument of the tweet search.
states = {
    "Alabama": "32.8066,-86.7911",
    "Alaska": "61.3707,-152.4044",
    "Arizona": "33.7297,-111.4312",
    "Arkansas": "34.9697,-92.3731",
    "California": "36.1162,-119.6815",
    "Colorado": "39.0598,-105.3111",
    "Connecticut": "41.5977,-72.7553",
    "Delaware": "39.3185,-75.5071",
    "District of Columbia": "38.8974,-77.0268",
    "Florida": "27.7664,-81.6867",
    "Georgia": "33.0406,-83.6430",
    "Hawaii": "21.094318,-157.498337",
    "Idaho": "44.2404,-114.4788",
    "Illinois": "40.3494,-88.9861",
    "Indiana": "39.8494,-86.2582",
    "Iowa": "42.0115,-93.2105",
    "Kansas": "38.5266,-96.7264",
    "Kentucky": "37.6681,-84.6700",
    "Louisiana": "31.1695,-91.8678",
    "Maine": "44.6939,-69.3819",
    "Maryland": "39.0639,-76.8021",
    "Massachusetts": "42.2301,-71.5301",
    "Michigan": "43.3266,-84.5360",
    "Minnesota": "45.6944,-93.9001",
    "Mississippi": "32.7416,-89.6786",
    "Missouri": "38.4560,-92.2883",
    "Montana": "46.9219,-110.4543",
    "Nebraska": "41.1253,-98.2680",
    "Nevada": "38.3135,-117.0553",
    "New Hampshire": "43.4524,-71.5638",
    "New Jersey": "40.2989,-74.5210",
    "New Mexico": "34.8405,-106.2484",
    "New York": "42.1657,-74.9480",
    "North Carolina": "35.6300,-79.8064",
    "North Dakota": "47.5289,-99.7840",
    "Ohio": "40.3887,-82.7649",
    "Oklahoma": "35.5653,-96.9289",
    "Oregon": "44.5720,-122.0709",
    "Pennsylvania": "40.5907,-77.2097",
    "Rhode Island": "41.6808,-71.5117",
    "South Carolina": "33.8568,-80.9450",
    "South Dakota": "44.2997,-99.4388",
    "Tennessee": "35.7478,-86.6923",
    "Texas": "31.0544,-97.5634",
    "Utah": "40.1500,-111.8624",
    "Vermont": "44.0458,-72.7106",
    "Virginia": "37.7693,-78.1699",
    "Washington": "47.4009,-121.4904",
    "West Virginia": "38.4912,-80.9544",
    "Wisconsin": "44.2684,-89.6165",
    "Wyoming": "42.7559,-107.3024",
}
        
def remove_noise(tweet_tokens, stop_words = ()):
    """Clean and lemmatize the tokens of a single tweet.

    Each token is POS-tagged, stripped of URLs and @mentions, lemmatized
    with WordNet, and lower-cased. Tokens that end up empty, that are found
    in ``string.punctuation``, or whose lower-cased form is in ``stop_words``
    are dropped.

    Parameters
    ----------
    tweet_tokens : list of str
        Tokens of one tweet (e.g. the output of ``TweetTokenizer.tokenize``).
    stop_words : container of str, optional
        Lower-case words to discard. Defaults to an empty tuple, i.e. no
        stop-word filtering.

    Returns
    -------
    list of str
        The cleaned, lower-cased tokens in their original order.
    """
    # Raw strings keep the backslashes in the pattern without relying on
    # invalid string escapes such as '\(' (a SyntaxWarning on recent Pythons).
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")

    # one lemmatizer is enough for the whole tweet
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = url_pattern.sub('', token)
        token = mention_pattern.sub('', token)

        # translate the tagger's tag into the POS code handed to the lemmatizer:
        # noun tags -> 'n', verb tags -> 'v', everything else -> 'a'
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
    
def mode(array):
    """Return the element that appears most often in ``array``.

    If several elements are tied for the highest count, the one that occurs
    first in ``array`` is returned.

    Parameters
    ----------
    array : iterable of hashable
        The elements to count.

    Returns
    -------
    object
        The most frequent element.

    Raises
    ------
    ValueError
        If ``array`` is empty.
    """
    counts = Counter(array)
    if not counts:
        raise ValueError("mode() arg is an empty sequence")
    # Counter iterates its keys in first-seen order and max() returns the
    # first maximal item, so ties resolve to the earliest element.
    return max(counts, key=counts.get)

def state_attitude(state):
    """Classify a state's attitude toward reopening during COVID-19.

    Searches for up to 100 English tweets (retweets filtered out) matching
    "#covid+reopening", posted since 2020-06-01 within 200 km of the
    coordinates stored for ``state`` in ``states``. Every tweet is tokenized,
    cleaned with ``remove_noise`` and classified by the classifier pickled in
    'tweets_text.sav'; the most common label is reported.

    Parameters
    ----------
    state : str
        A key of the ``states`` dictionary, e.g. "Alabama".

    Returns
    -------
    dict
        ``{state: label}``. The label is the classifier's most frequent
        prediction, or "Neutral on reopening" when the search returns no
        tweets. (Presumably the classifier's labels are against / neutral /
        lenient on reopening -- confirm against the training notebook.)

    Raises
    ------
    KeyError
        If ``state`` is not a key of ``states``.
    """
    search_term = "#covid+reopening -filter:retweets"

    # NOTE(review): unpickling can execute arbitrary code -- only load a
    # classifier file that comes from a trusted source.
    # The context manager closes the file handle once the classifier is loaded.
    with open('tweets_text.sav', 'rb') as classifier_file:
        classifier = pickle.load(classifier_file)
    attitude = {}

    # NOTE(review): `api.search` is the tweepy 3.x name (renamed to
    # `search_tweets` in tweepy 4) -- confirm against the installed version.
    tweets = tw.Cursor(api.search,
                       q=search_term,
                       geocode=states[state] + ",200km",
                       lang="en",
                       since='2020-06-01').items(100)

    all_tweets = [tweet.text for tweet in tweets]

    if len(all_tweets) > 0:
        tweet_tokenizer = TweetTokenizer()
        ans = []

        for custom_tweet in all_tweets:
            custom_tokens = remove_noise(tweet_tokenizer.tokenize(custom_tweet))
            # the classifier is fed a {token: True} feature dictionary
            ans.append(classifier.classify({token: True for token in custom_tokens}))

        attitude[state] = mode(ans)
    else:
        # nothing to classify: fall back to a neutral verdict
        attitude[state] = "Neutral on reopening"
    return attitude

In [4]:
# example: classify Alabama -- this runs a live tweet search through `api`
# and loads the pickled classifier from 'tweets_text.sav'
state_attitude("Alabama")

{'Alabama': 'Lenient on reopening'}