In this worksheet, we only work with an amount of data that can be pulled from Twitter in a single 15 minute window.

Some of the code on this worksheet was later used to get more data for my chrojobs.  Keep this worksheet because it is the only place where I have code for searching Twitter for "vacation X."  This worksheet is also the only place where I pull some of the other data that was put in my MySQL server.  e.g. baby names.

In v2, we allow for 800 tweets per destination.  This will require two 15-minute Twitter API windows.  Also, maybe we'll not worry about location of the user (to allow for more users).

In [1]:
from twython import Twython
import pandas as pd
import re
import datetime
import matplotlib as plt
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

from mysql_login_info import sql_username, sql_password
from twitter_keys import APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET

In [61]:
# Twitter authentication

twitter = Twython(APP_KEY, APP_SECRET,
                  OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

In [2]:
# Import a list of vacation destinations

destinations=[]
df_destinations=pd.read_csv('vacation_destinations.txt', header=None, names=['Destination'])
destinations = df_destinations['Destination'].values.tolist()

40 destinations imported


['Berlin', 'Bora Bora', 'Budapest', 'Cancun', 'Chicago']

In [55]:
# Search Twitter for "vacation "X""
# see https://dev.twitter.com/rest/reference/get/search/tweets

# count=100 is the maximum allowed amount
# 180 queries of this type are allowed per 15 minutes

queries_per_destination = 8

# Search the first 20 destinations
destination_results={}
for destination in destinations[:20]:
    destination_results[destination]=[]
    destination_results[destination].append(twitter.search(q='vacation "'+destination+'"', lang='en', count=100, result_type='recent'))
    for i in range(1,queries_per_destination):
        if len(destination_results[destination][i-1]['statuses']) < 100:
            break
        max_id=min([status['id'] for status in destination_results[destination][i-1]['statuses']])-1
        destination_results[destination].append(twitter.search(q='vacation '+destination, lang='en', count=100, result_type='recent', max_id=max_id))

print datetime.datetime.today().__str__()

2015-09-29 21:12:34.147102


In [62]:
# Search the next 20 destinations.
# wait 15 minutes to do this.
for destination in destinations[20:]:
    destination_results[destination]=[]
    destination_results[destination].append(twitter.search(q='vacation "'+destination+'"', lang='en', count=100, result_type='recent'))
    for i in range(1,queries_per_destination):
        if len(destination_results[destination][i-1]['statuses']) < 100:
            break
        max_id=min([status['id'] for status in destination_results[destination][i-1]['statuses']])-1
        destination_results[destination].append(twitter.search(q='vacation '+destination, lang='en', count=100, result_type='recent', max_id=max_id))
        
print datetime.datetime.today().__str__()

2015-09-29 21:30:49.950681


In [63]:
# Organize the results in preparation for putting them in a
# pandas dataframe.
# Use regex to find the first name of each user

destinations_col = []
names_col = []
first_names_col = []
locations_col = []
texts_col = []
screen_names_col = []
ids_col = []
times_col = []
descriptions_col = []
statuses_cnt_col = []

for destination in destinations:
    for result in destination_results[destination]:
        for status in result['statuses']:
            destinations_col.append(destination)
            screen_names_col.append(status['user']['screen_name'])
            name = status['user']['name']
            names_col.append(name)
            match = re.search('^\w[a-z]*', name)
            if match:
                first_names_col.append(match.group())
            else:
                first_names_col.append(None)
            locations_col.append(status['user']['location'])
            texts_col.append(status['text'])
            ids_col.append(status['id']) # or use ['id_str']
            times_col.append(status['created_at'])
            descriptions_col.append(status['user']['description'])
            statuses_cnt_col.append(status['user']['statuses_count'])

In [64]:
# Put the results in a Pandas dataframe  
df_tweets = pd.DataFrame({
        'Destination': destinations_col,
        'Screen Name': screen_names_col,
        'Full Name': names_col,
        'First Name': first_names_col,
        'Home Location': locations_col, # Relabel this in later cells
        'Tweet': texts_col,
        'Tweet ID': ids_col,
        'Time': times_col,
        'Description': descriptions_col,
        'Status Count': statuses_cnt_col
    })

# Change the character encoding to utf-8.  Maybe use ascii instead?
for col in df_tweets.columns.values:
    if df_tweets[col].dtype=='object':
        df_tweets[col]=df_tweets[col].str.encode('utf-8', errors='ignore')

12210 rows


Unnamed: 0,Description,Destination,First Name,Full Name,Home Location,Screen Name,Status Count,Time,Tweet,Tweet ID
0,,Berlin,Voyagester,Voyagester,,voyagester,296565,Tue Sep 29 22:49:08 +0000 2015,Cheap flights to Berlin http://t.co/oCddnnuFE...,648992935397191681
1,,Berlin,Voyagester,Voyagester,,voyagester,296565,Tue Sep 29 19:49:06 +0000 2015,Easyjet flight from Rome to Berlin http://t.c...,648947629943033856


In [4]:
# connect to my local mySQL server and open the vacation database
engine = create_engine('mysql+mysqldb://'+sql_username+':'+sql_password+'@127.0.0.1:3306/vacation', echo=False)

In [67]:
# Save the data on my mySQL server

# copy the pandas dataframe data into the 'vacation_tweets' table on MySQL
df_tweets.to_sql('vacation_tweets', engine, if_exists='replace', index=False)

True


In [6]:
# Import a list of baby names and put it on my MySQL server

df_names = pd.read_csv('baby-names2.csv', usecols=['name'])
df_names = df_names.drop_duplicates(['name'])
print len(df_names.index), 'baby names names imported'

# Save the data on my mySQL server
df_names.to_sql('baby_names', engine, if_exists='replace', index=False)

6782 baby names names imported


In [9]:
# Import a list of cities and states in the US
# and put it on my MySQL server

cities = pd.read_csv('cities.csv', usecols=['city']).values.tolist()
df_states = pd.read_csv('state_table.csv', usecols=['name', 'abbreviation'])
states = df_states['name'].values.tolist()
states_abbrev = df_states['abbreviation'].tolist()
df_all = pd.DataFrame({'location':cities+states+states+states_abbrev})

# Save the data on my mySQL server
df_all.to_sql('home_locations', engine, if_exists='replace', index=False)

29623 cities and states in the US


In [9]:
# Summarize the results.

# Total number of tweets for each location
total = df_tweets.groupby(['Destination']).size()

# Filter out tweets with locations that aren't
# US states or cities
df_tweets_has_loc = df_tweets[df_tweets['Home Location']!='']
df_tweets_has_loc = df_tweets_has_loc[df_tweets_has_loc['Home Location'].str.contains(pattern)]
has_loc = df_tweets_has_loc.groupby(['Destination']).size()

# Filter out tweets with names not in the baby names list
has_name = df_tweets[df_tweets['First Name'].isin(df_names.index)].groupby(['Destination']).size()
df_tweets_has_name = df_tweets[df_tweets['First Name'].isin(df_names.index)]

# Now filter both locations and names
df_both = df_tweets_has_loc[df_tweets_has_loc['First Name'].isin(df_names.index)]
has_both = df_both.groupby(['Destination']).size()

df_summary = pd.DataFrame({
        'Total': total,
        'with Location': has_loc,
        'with Name': has_name,
        'with Both': has_both})
df_summary = df_summary[['Total','with Location','with Name','with Both']]

In [98]:
# Copy the pandas dataframe data into the 'vacation_tweets' table on mySQL
df_tweets_has_name.to_sql('vacation_tweets_w_name', engine, if_exists='replace', index=False)

True
