In [1]:
# Dependencies
import tweepy
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import time
import pytz

# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz

# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process

# Import and Initialize Sentiment Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Twitter API Keys
from config import (consumer_key,
                    consumer_secret,
                    access_token,
                    access_token_secret)
#Setup Tweepy API Authentication
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth, 
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify= True,
                 parser=tweepy.parsers.JSONParser())
#Format times
utc=pytz.UTC
startDate = datetime(2018, 1, 1, 0, 0, 0)
endDate = datetime(2018, 4, 10, 0, 0, 0)
startDate = utc.localize(startDate) 
endDate = utc.localize(endDate) 

In [6]:
output_list = pd.read_csv('/Users/rorr/PythonStuff/Team_5/Resources/LocalTV_Tweets.csv')
output_list.head(2)

Unnamed: 0.1,Unnamed: 0,created_at,account_location,text_tweet,name
0,0,2018-04-20 04:12:45+00:00,"Shreveport, LA",RT @TimONBC6: VIDEO: Mudbugs fall short in clo...,KMSSTV
1,1,2018-04-20 03:04:30+00:00,"Shreveport, LA",RT @TimONBC6: The Corpus Christi IceRays force...,KMSSTV


In [7]:
output_list['account_location'] = output_list['account_location'].str.lower()
#output_list['place_name_long'] = output_list['place_name_long'].str.lower()
#output_list['account_location'] = np.where((output_list['place_name_long'].isnull())==True, output_list['account_location'], output_list['place_name_long'])

output_list['location'] = output_list['account_location']
output_list['city'], output_list['state/country'] = output_list['account_location'].str.split(', ', 1).str

output_list['text_tweet'] = output_list['text_tweet'].str.lower()
output_list.head(2)

Unnamed: 0.1,Unnamed: 0,created_at,account_location,text_tweet,name,location,city,state/country
0,0,2018-04-20 04:12:45+00:00,"shreveport, la",rt @timonbc6: video: mudbugs fall short in clo...,KMSSTV,"shreveport, la",shreveport,la
1,1,2018-04-20 03:04:30+00:00,"shreveport, la",rt @timonbc6: the corpus christi icerays force...,KMSSTV,"shreveport, la",shreveport,la


In [8]:
cities = pd.read_csv('/Users/rorr/PythonStuff/Team_5/Resources/uscitiesv1.4.csv')
cities['state_name'] = cities['state_name'].str.lower()
cities['city'] = cities['city'].str.lower()
cities['state_id'] = cities['state_id'].str.lower()

cities.head(3)

Unnamed: 0,city,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,population_proper,density,source,incorporated,timezone,zips,id
0,prairie ridge,Prairie Ridge,wa,washington,53053,Pierce,47.1443,-122.1408,,,1349.8,polygon,False,America/Los_Angeles,98360 98391,1840037882
1,edison,Edison,wa,washington,53057,Skagit,48.5602,-122.4311,,,127.4,polygon,False,America/Los_Angeles,98232,1840017314
2,packwood,Packwood,wa,washington,53041,Lewis,46.6085,-121.6702,,,213.9,polygon,False,America/Los_Angeles,98361,1840025265


In [44]:
output_final=pd.merge(output_list, cities, left_on=['state/country' ,'city'], right_on=['state_name', 'city'],how='inner',indicator=True)

# if output_list['place_name_long'] is not None:
#     output_list['account_location'] = output_list['place_name_long']
#output_list[output_list.apply(lambda row: fuzz.token_sort_ratio(['account_location']), axis=1) > 80]
#output_list = output_list.drop_duplicates('text_tweet')
output_final.count()

Unnamed: 0           192
created_at           192
account_location     192
text_tweet           192
place_type             0
country_code           0
place_name_long        0
place_name_short       0
location             192
city                 192
state/country        192
city_ascii           192
state_id             192
state_name           192
county_fips          192
county_name          192
lat                  192
lng                  192
population           187
population_proper    187
density              192
source               192
incorporated         192
timezone             192
zips                 192
id                   192
_merge               192
dtype: int64

In [None]:
#output_list=pd.merge(TV,NPR, left_on=[‘Twitter Handle’], right_on=[‘Twitter Handle’],how=“outer”,indicator=True)

# save copy for easy edits
output_copy = output_list
# switch commenting out with above to easily test manipulations. 
#output_list = output_copy

output_list['tweet_text'] = output_list['tweet_text'].str.lower()

#output_list[output_list['tweet_text'].str.contains("immigrant" and 'immigration' and 'immigrants' and "foreigner" and "noncitizen" and "undocumented" and "non-citizen" and "permanent resident" and "sanctuary city", na=False)]

output_list.drop_duplicates('Twitter Handle')

print(output_list.count())

# Run Vader Analysis on each tweet
for (idx, row) in output_list.iterrows():
    text = (row.loc['text_tweet'])
    counter += 1
    results = analyzer.polarity_scores(text)
    compound = results["compound"]
    pos = results["pos"]
    neu = results["neu"]
    neg = results["neg"]
    tweet_times.append(tweet["created_at"])
    output_list.at[idx, 'compound'] = compound
    output_list.at[idx, 'pos'] = pos
    output_list.at[idx, 'neu'] = neu
    output_list.at[idx, 'neg'] = neg

In [None]:
#Read CSV and create dataframe
census_csv = "./Resources/census_immig.csv"
census_df = pd.read_csv(census_csv)

#Dataframe reduced to contain only required columns
census_df = census_df[["combined_fips", "Population"]]
#FIPS column changed to string datatype
census_df["combined_fips"] = census_df["combined_fips"].astype(str)

#Loop to keep leading zeros in FIPS codes
for (idx, row) in census_df.iterrows():
    fips_z = (row.loc['combined_fips'])
    census_df.at[idx, 'combined_fips'] = str(fips_z).zfill(5)
    
#Rename FIPS column to match with other dataframe
census_df = census_df.rename(columns={'combined_fips': 'FIPS'})

#Show dataframe
census_df.head()