In [1]:
# Dependencies
# Every import for the notebook lives in this cell; the module-level handles
# created below (analyzer, client, db) are read by the functions in later cells.
import json
import time
import os
import requests
import twitter      # NOTE(review): twitter.Api / GetSearch match the python-twitter package - confirm the installed distribution
import numpy as np
import random

# Setting up Vader
# One shared analyzer instance; tweetGrab reads the 'compound' value of its polarity scores
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Setting up Mongo
# Connects to a MongoDB server on localhost:27017 and selects the tweets_db
# database; updateMongo inserts into its 'city' collection through `db`.
import pymongo
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.tweets_db

In [2]:
# Loading Keys
# '__file__' is a string literal here, so realpath() resolves it against the
# current working directory. The first dirname() strips that placeholder name
# and the remaining four climb four directory levels above the working
# directory, which is where api_keys.json is expected to live.
api_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath('__file__'))))))

# Read the credentials inside a context manager so the file handle is closed
# as soon as the JSON has been parsed.
with open(os.path.join(api_dir, "api_keys.json")) as key_file:
    data = json.load(key_file)

# Setting Keys
consumer_key = data['twitter_consumer_key']
consumer_secret = data['twitter_consumer_secret']
access_token = data['twitter_access_token']
access_token_secret = data['twitter_access_token_secret']

In [3]:
# Setting up Auth
# Single authenticated client, reused by every search call in the notebook
api = twitter.Api(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token_key=access_token,
    access_token_secret=access_token_secret,
)

In [4]:
# Tweet Gather function
def tweetGrab(city, city_dict, start_time):
    """Run one search around ``city`` and record every tweet not seen before.

    Each new tweet is scored with the shared Vader ``analyzer``, sent to Mongo
    through ``updateMongo`` and appended to the per-city lists held in
    ``city_dict[city]['data']``.

    Args:
        city: Key of the city to search for in ``city_dict``.
        city_dict: Dictionary built by ``fillDict``; mutated in place.
        start_time: Clock reading taken when the run started, used only for
            the progress print.

    Returns:
        An empty tuple (kept for compatibility with existing callers).
    """
    # time.clock() was removed in Python 3.8. Prefer it where it still exists so
    # elapsed times stay comparable with a start_time the caller took from
    # time.clock(); otherwise fall back to time.perf_counter().
    clock = getattr(time, 'clock', time.perf_counter)

    city_key = f'{city}'
    city_data = city_dict[city_key]['data']
    gps = city_dict[city_key]['gps']

    # Call to API - each new call should return new tweets and count against rate-limit
    # gps is stored as [lon, lat] while geocode is passed as [lat, lon, radius]
    tweets = api.GetSearch(geocode=[gps[1], gps[0], '20mi'], return_json=True, count=100)

    # Loop to parse tweet and append data needed
    for tweet in tweets['statuses']:

        # Skip tweets already recorded for this city
        if tweet['id'] in city_data['id']:
            continue

        user = tweet['user']

        # 'place' may be null (or lack a bounding box) on tweets without a tagged
        # place; store None rather than crashing the whole gather run.
        place = tweet.get('place') or {}
        box_coords = (place.get('bounding_box') or {}).get('coordinates')
        box = solveBox(box_coords) if box_coords else None

        # Exact coordinates are optional: unwrap the point when present,
        # otherwise keep whatever the API returned (typically None).
        point = tweet.get('coordinates')
        if isinstance(point, dict) and 'coordinates' in point:
            coords = point['coordinates']
        else:
            coords = point

        # Temp dict to pass into updateMongo instead of the whole data file;
        # updateMongo reads element [0] of each list, hence the one-item lists.
        temp_dict = {
            'tempText': [tweet['text']],
            'tempBox': [box],
            'tempUser': [user['screen_name']],
            'tempCreated_at': [tweet['created_at']],
            'tempFollowers_count': [user['followers_count']],
            'temp_id': [tweet['id']],
            'tempComp': [analyzer.polarity_scores(tweet['text'])['compound']],
            'tempCords': [coords],
            'tempImg': [user.get('profile_image_url', 'None')]
        }

        # Sending things over to Mongo for storage
        updateMongo(temp_dict, city)

        # Appending important data from temp dict
        city_data['text'].append(temp_dict['tempText'][0])
        city_data['bounding_box'].append(temp_dict['tempBox'][0])
        city_data['user'].append(temp_dict['tempUser'][0])
        city_data['created_at'].append(temp_dict['tempCreated_at'][0])
        city_data['followers_count'].append(temp_dict['tempFollowers_count'][0])
        city_data['comp_sent'].append(temp_dict['tempComp'][0])
        city_data['id'].append(temp_dict['temp_id'][0])
        city_data['coords'].append(temp_dict['tempCords'][0])
        city_data['profile_image_url'].append(temp_dict['tempImg'][0])

        # Printing status of calls and current time every fifth stored tweet
        gathered = len(city_data['id'])
        if gathered % 5 == 0:
            print(f'{city}: ' + str(gathered) + ' --- %s seconds' % round(clock() - start_time, 2))

    # Ending function
    return ()

In [5]:
# Calculating a rough coord based on bounding box
def solveBox(bounding_box):
    """Reduce a bounding box to a single randomly jittered point.

    Every corner in ``bounding_box[0]`` is read as ``[lon, lat]`` and nudged by
    a uniform random offset in [-0.005, 0.005] on each axis; the jittered
    corners are then averaged.

    Args:
        bounding_box: Nested list whose first element is the list of corners.

    Returns:
        ``[mean_lon, mean_lat]``, each rounded to 6 decimal places.
    """
    # One (lat, lon) pair per corner; the latitude offset is drawn first
    jittered = [
        (corner[1] + random.uniform(-.005, .005),
         corner[0] + random.uniform(-.005, .005))
        for corner in bounding_box[0]
    ]
    lats = [pair[0] for pair in jittered]
    lons = [pair[1] for pair in jittered]

    # Average per axis and hand back longitude first
    return [round(np.mean(lons), 6), round(np.mean(lats), 6)]

In [6]:
# Fills out the dictionary
def fillDict():
    """Build the empty per-city collection structure.

    Returns:
        A dict keyed by city name. Each value holds ``'gps'`` (the city's static
        coordinates, stored longitude first) and ``'data'``, a dict with one
        fresh empty list per tracked tweet field.
    """
    # Static coords for every tracked city
    city_gps = {
        'Dallas': [-96.796988, 32.776664],
        'St. Louis': [-90.199404, 38.627003],
        'Los Angeles': [-118.243685, 34.052234],
        'Atlanta': [-84.387982, 33.748995],
        'Chicago': [-87.629798, 41.878114],
        'Miami': [-80.191790, 25.761680],
        'New York': [-74.005973, 40.712775],
        'Kansas City': [-94.578567, 39.099727],
        'Seattle': [-122.332071, 47.606210],
        'Las Vegas': [-115.139830, 36.169941],
    }

    # Fields collected for each tweet; every city gets its own list per field
    fields = (
        'text',
        'coords',
        'bounding_box',
        'user',
        'profile_image_url',
        'created_at',
        'comp_sent',
        'followers_count',
        'id',
    )

    return {
        name: {'gps': gps, 'data': {field: [] for field in fields}}
        for name, gps in city_gps.items()
    }

In [7]:
# Function to compress limit check
def limit_check(check, city_dict, tweet_goal):
    """Decide whether the gathering loop should keep running.

    Args:
        check: ``'first_to_goal'`` keeps running until any city has collected
            ``tweet_goal`` tweets; ``'all_to_goal'`` keeps running while at
            least one city is still below ``tweet_goal``.
        city_dict: Dictionary built by ``fillDict``; only the length of each
            city's ``['data']['text']`` list is read.
        tweet_goal: Number of tweets a city must reach.

    Returns:
        ``True`` while more tweets are needed, ``False`` once the goal is met.

    Raises:
        ValueError: If ``check`` is not one of the two supported modes
            (previously this surfaced as an UnboundLocalError).
    """
    counts = [len(city_dict[f'{city}']['data']['text']) for city in city_dict]

    # Keep going only while no city has reached the goal
    if check == 'first_to_goal':
        return not any(count >= tweet_goal for count in counts)

    # Keep going while any city is still short of the goal
    if check == 'all_to_goal':
        return any(count < tweet_goal for count in counts)

    raise ValueError(
        f"Unknown limit type {check!r}; expected 'first_to_goal' or 'all_to_goal'"
    )

In [8]:
# Save output to a txt/json doc for easy read later
def saveOutput(data, tweet_goal, start_time, limit_type):
    """Write the collected data as indented JSON to two files in the working directory.

    ``data.txt`` is overwritten on every run; a second file, named after the
    limit type, tweet goal, elapsed runtime and start time, keeps a copy of
    this particular run.

    Args:
        data: JSON-serialisable object to save.
        tweet_goal: Tweet goal of the run (used in the file name only).
        start_time: Clock reading taken when the run started.
        limit_type: Limit mode of the run (used in the file name only).
    """
    # time.clock() was removed in Python 3.8. Prefer it where it still exists so
    # the runtime stays comparable with a start_time the caller took from
    # time.clock(); otherwise fall back to time.perf_counter().
    clock = getattr(time, 'clock', time.perf_counter)

    # Serialise once and write the same payload to both files
    payload = json.dumps(data, sort_keys=True, indent=2)
    run_file = f'{limit_type} - {tweet_goal} Tweets - ' + str(round(clock() - start_time, 2)) + f' Runtime - {start_time}.txt'

    for path in ('data.txt', run_file):
        with open(path, 'w') as outfile:
            outfile.write(payload)

In [9]:
# Print Final Output
def printOutput(city_dict, start_time):
    """Print how many tweets each city collected, followed by the total runtime.

    Args:
        city_dict: Dictionary built by ``fillDict``; only the length of each
            city's ``['data']['text']`` list is read.
        start_time: Clock reading taken when the run started.
    """
    # time.clock() was removed in Python 3.8. Prefer it where it still exists so
    # the runtime stays comparable with a start_time the caller took from
    # time.clock(); otherwise fall back to time.perf_counter().
    clock = getattr(time, 'clock', time.perf_counter)

    # Starting printing format
    print('-------------------------------------------')

    # Loop to print each list length
    for city in city_dict:
        print(f'{city}:' + str(len(city_dict[f'{city}']['data']['text'])))

    # Final Format Print with runtime
    print('-------------------------------------------')
    print('--- %s Runtime' % round(clock() - start_time, 2))

In [10]:
def updateMongo(temp_dict, city):
    """Insert one tweet into the ``city`` collection of the module-level ``db``.

    Args:
        temp_dict: Per-tweet dict whose values are lists; element ``[0]`` of
            each list is stored.
        city: City label saved alongside the tweet.
    """
    # Stored field name -> temp_dict key it is read from
    field_sources = (
        ('bounding_box', 'tempBox'),
        ('cords', 'tempCords'),
        ('comp_sent', 'tempComp'),
        ('created_at', 'tempCreated_at'),
        ('followers_count', 'tempFollowers_count'),
        ('profile_image_url', 'tempImg'),
        ('text', 'tempText'),
        ('user', 'tempUser'),
        ('tweet_id', 'temp_id'),
    )

    document = {field: temp_dict[source][0] for field, source in field_sources}
    document['city'] = city

    db.city.insert_one(document)

In [11]:
# Main Execution
def twitterize(tweet_goal=100, limit_type='first_to_goal', delay=6):
    """Gather tweets city by city until the chosen goal is met, then save and report.

    Args:
        tweet_goal: Number of tweets a city must reach.
        limit_type: ``'first_to_goal'`` stops once any city reaches the goal;
            ``'all_to_goal'`` stops once every city has reached it.
        delay: Seconds to sleep after each city's search.

    The defaults reproduce the values that used to be hard-coded, so calling
    ``twitterize()`` with no arguments behaves as before.
    """
    # time.clock() was removed in Python 3.8. Prefer it where it still exists so
    # start_time stays comparable with helpers that still read time.clock();
    # otherwise fall back to time.perf_counter().
    clock = getattr(time, 'clock', time.perf_counter)
    start_time = clock()

    # Finding the dict
    city_dict = fillDict()

    # Main loop - Checks what type of limit is set and runs until False is returned
    while limit_check(limit_type, city_dict, tweet_goal):

        # Rotates cities - Causes a delay so new tweets can be fed to the API
        for city in city_dict:
            tweetGrab(city, city_dict, start_time)
            time.sleep(delay)

    # Saving output to txt file
    saveOutput(city_dict, tweet_goal, start_time, limit_type)

    # Printing runtime
    printOutput(city_dict, start_time)

In [12]:
twitterize()

Dallas: 5 --- 0.63 seconds
Dallas: 10 --- 0.63 seconds
St. Louis: 5 --- 7.14 seconds
Chicago: 5 --- 26.19 seconds
Kansas City: 5 --- 45.67 seconds
Seattle: 5 --- 52.03 seconds
Las Vegas: 5 --- 58.37 seconds
Dallas: 15 --- 64.71 seconds
Dallas: 20 --- 64.71 seconds
Los Angeles: 5 --- 77.81 seconds
Atlanta: 5 --- 84.11 seconds
Chicago: 10 --- 90.41 seconds
Miami: 5 --- 96.7 seconds
New York: 5 --- 102.98 seconds
Kansas City: 10 --- 109.3 seconds
Seattle: 10 --- 115.58 seconds
Seattle: 15 --- 115.59 seconds
Las Vegas: 10 --- 121.89 seconds
Las Vegas: 15 --- 121.89 seconds
Dallas: 25 --- 128.3 seconds
Dallas: 30 --- 128.3 seconds
Dallas: 35 --- 128.3 seconds
St. Louis: 10 --- 134.59 seconds
St. Louis: 15 --- 134.59 seconds
Atlanta: 10 --- 147.24 seconds
Chicago: 15 --- 153.54 seconds
Miami: 10 --- 159.81 seconds
New York: 10 --- 166.13 seconds
Seattle: 20 --- 178.75 seconds
Las Vegas: 20 --- 185.08 seconds
Las Vegas: 25 --- 185.08 seconds
Las Vegas: 30 --- 185.09 seconds
Dallas: 40 --- 191

In [15]:
# Read back everything stored in the 'city' collection.
# NOTE(review): this rebinds `data`, which the key-loading cell uses for the parsed api_keys.json.
data = db.city.find()

# Iterate through each stored tweet in the collection
for tweet in data:
    # print(tweet)
    # Placeholder statement so the loop body is not empty while the print is commented out
    i =0

In [12]:
# Destructive: drops the whole tweets_db database, i.e. the database `db` refers to.
client.drop_database('tweets_db')

In [12]:
# Exploratory search using Dallas's coordinates from fillDict, to inspect the raw JSON response.
# Note the 100mi radius here, versus the 20mi radius used by tweetGrab.
tweets = api.GetSearch(geocode=[32.776664, -96.796988, '100mi'],return_json=True, count=100)
tweets

{'search_metadata': {'completed_in': 0.113,
  'count': 100,
  'max_id': 991088214810284032,
  'max_id_str': '991088214810284032',
  'query': '',
  'refresh_url': '?since_id=991088214810284032&q=&geocode=32.776664%2C-96.796988%2C100mi&result_type=mixed&include_entities=1',
  'since_id': 0,
  'since_id_str': '0'},
 'statuses': [{'contributors': None,
   'coordinates': None,
   'created_at': 'Mon Apr 30 22:53:38 +0000 2018',
   'entities': {'hashtags': [],
    'media': [{'display_url': 'pic.twitter.com/sOqs57y1dq',
      'expanded_url': 'https://twitter.com/godsperfectionz/status/991088214810284032/photo/1',
      'id': 991088201086349312,
      'id_str': '991088201086349312',
      'indices': [74, 97],
      'media_url': 'http://pbs.twimg.com/tweet_video_thumb/DcENd8bVMAAdwv0.jpg',
      'media_url_https': 'https://pbs.twimg.com/tweet_video_thumb/DcENd8bVMAAdwv0.jpg',
      'sizes': {'large': {'h': 196, 'resize': 'fit', 'w': 350},
       'medium': {'h': 196, 'resize': 'fit', 'w': 350},
 

In [16]:
# Show the first status of the exploratory search to inspect the per-tweet fields
tweets['statuses'][0]

{'contributors': None,
 'coordinates': None,
 'created_at': 'Mon Apr 30 22:53:38 +0000 2018',
 'entities': {'hashtags': [],
  'media': [{'display_url': 'pic.twitter.com/sOqs57y1dq',
    'expanded_url': 'https://twitter.com/godsperfectionz/status/991088214810284032/photo/1',
    'id': 991088201086349312,
    'id_str': '991088201086349312',
    'indices': [74, 97],
    'media_url': 'http://pbs.twimg.com/tweet_video_thumb/DcENd8bVMAAdwv0.jpg',
    'media_url_https': 'https://pbs.twimg.com/tweet_video_thumb/DcENd8bVMAAdwv0.jpg',
    'sizes': {'large': {'h': 196, 'resize': 'fit', 'w': 350},
     'medium': {'h': 196, 'resize': 'fit', 'w': 350},
     'small': {'h': 196, 'resize': 'fit', 'w': 350},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'type': 'photo',
    'url': 'https://t.co/sOqs57y1dq'}],
  'symbols': [],
  'urls': [],
  'user_mentions': [{'id': 756278730,
    'id_str': '756278730',
    'indices': [0, 12],
    'name': 'h a l i e',
    'screen_name': 'hellohails_'}]},
 '