# Collect Tweets by Term

# Dependencies & Keys

## Load Dependencies:
 - tweepy (python library to access Twitter API)
 - json (display results in json format)
 - numpy (saving large dimensional arrays)
 - pandas (data frames)
 - vaderSentiment (Sentiment Analysis)

## Keys
 - Create a Twitter developer account and create an app to create custom keys

In [4]:
pip install tweepy

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/44/a3/1218a3b5651dbcba1699101c84e5c84c36cbba360d9dbf29f2ff18482982/vaderSentiment-3.3.1-py2.py3-none-any.whl (125kB)
[K     |████████████████████████████████| 133kB 3.2MB/s eta 0:00:01
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Dependencies
import json
import os

import numpy as np
import pandas as pd
import tweepy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Sentiment analyzer instance from vaderSentiment
analyzer = SentimentIntensityAnalyzer()

## Keys
# Each credential is read from an environment variable when it is set, so real
# keys do not have to be saved inside the notebook.  When a variable is not
# set, the placeholder is used and must be replaced by hand.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "xxxxx_change_me")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "xxxxx_change_me")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "xxxxx_change_me")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "xxxxx_change_me")

# Setup Tweepy API Authentication
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# JSONParser makes API calls return plain dicts/lists (the search cell indexes
# the response with ["statuses"]) instead of Tweepy model objects.
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

# 1. Gather tweets

## Variable Inputs:

### search_term
-  Enter whatever custom search term you desire to search for...

### oldest_tweet
-  Enter the tweet id of the most recent tweet you want to include. It is passed to the search as max_id, so all tweets gathered while executing will have a tweet id <= oldest_tweet (the search works backwards in time from that tweet).
---------------------------------------------------------------------------------------------------------------------
## Cell Outputs:

### Print the number of tweets collected.
-  If you are not hitting any query limits, this code should collect 5,000 tweets.

### Print the total number of items in the TempDict list.
-  This value should match the total number of tweets collected (counter).

### Print the total number of unique tweet ids.
-  This value should also match the total number of tweets and number of items in the TempDict list.

In [4]:
#---------------------------------
# Execute Search
#---------------------------------

# Search for people tweeting with the #EarthDay hashtag
search_term = "#EarthDay"

# Highest tweet id to request; it is passed as max_id and lowered after every
# page so that each request continues with older tweets
oldest_tweet = 1252611119996420096

# Paging limits: 50 requests x 100 tweets per request = up to 5,000 tweets
num_requests = 50
tweets_per_request = 100

# unique_ids keeps the ids in collection order; seen_ids mirrors it as a set
# so the duplicate check is a constant-time lookup instead of a list scan
unique_ids = []
seen_ids = set()

# List holding the full response dict of every unique tweet
TempDict = []

# Counter to keep track of the number of tweets retrieved
counter = 0

for _ in range(num_requests):

    # Retrieve the most recent tweets whose id is <= oldest_tweet
    public_tweets = api.search(search_term,
                               count=tweets_per_request,
                               result_type="recent",
                               max_id=oldest_tweet)
    statuses = public_tweets["statuses"]

    # No tweets came back, so max_id would not move and every remaining
    # request would be identical -- stop paging
    if not statuses:
        break

    for tweet in statuses:
        tweet_id = tweet["id"]

        if tweet_id not in seen_ids:
            seen_ids.add(tweet_id)
            unique_ids.append(tweet_id)
            TempDict.append(tweet)

            # Increase counter by 1
            counter += 1

    # Continue below the lowest id on this page; subtract 1 so that tweet
    # isn't included in the new search
    oldest_tweet = min(status["id"] for status in statuses) - 1

print(f"There were a total of {counter} tweets captured")
print(f"There are a total of {len(TempDict)} objects in the dictionary")
print(f"There are a total of {len(unique_ids)} unique ids")


There were a total of 5000 tweets captured
There are a total of 5000 objects in the dictionary
There are a total of 5000 unique ids


In [5]:
#########################
# Verify Run
########################
# Summarises the state left behind by the search cell so a run can be checked
# at a glance before anything is saved.

# Number of Unique ID's Collected
print(f"Number of Unique IDs:            {len(unique_ids)}")

# Number of ID's Pulled in latest run
print(f"Number of ID's Pulled in Run:    {len(TempDict)}")

# Oldest Tweet Number (the max_id the next search request would use)
print(f"The current oldest tweet is:     {oldest_tweet}")

Number of Unique IDs:            5000
Number of ID's Pulled in Run:    5000
The current oldest tweet is:     1252585562344546303


# 2. Save Results

### Save json file
-  enter json file name in the open(...) call ("filename.json")

### Save csv file
-  enter csv file name in the to_csv(...) call ('filename.csv')

### Save id numbers
-  enter npy file name ('filename.npy')

### Print a single full tweet response
-  FirstEntries holds only the first 10 tweets (TempDict[0:10]), so you can change the integer in FirstEntries[0] to any number between 0 and 9. To inspect any other tweet, index TempDict directly with a number between 0 and one less than the number of tweets collected.
- i.e., if 5,000 tweets were collected, TempDict accepts any number between 0 - 4999, or you can pick a range of numbers (0:5). Each tweet response is very large, so you probably only want to print a single tweet response just to verify what has been collected.

In [6]:
#########################
### Save TempDict as JSON File
#########################
# Serialize once, then write inside a `with` block so the file is flushed and closed.
import json
json978 = json.dumps(TempDict)
with open("EarthDay.json", "w", encoding="utf-8") as f:
    f.write(json978)
print("The json file was saved successfully")

The json file was saved successfully


In [7]:
#########################
### Create CSV File of DataFrame
#########################
# One row per collected tweet, one column per top-level key of the response.
TempDF = pd.DataFrame(TempDict)
TempDF.to_csv(path_or_buf='EarthDay.csv')
print("The json file was successfully saved as a CSV")

The json file was successfully saved as a CSV


In [8]:
#########################
### Saving Unique ID Numbers to Numpy File
#########################

# Disabled alternative kept for reference:
# unique_ids.tofile('EarthDay_UniqueIds.dat')

# Convert the id list to an array and store it in NumPy's .npy format
id_array = np.asarray(unique_ids)
np.save('EarthDay_UniqueIDs.npy', id_array)

In [9]:
#########################
### Print First Full Tweet Response to Inspect
#########################

# Keep a small sample (at most the first ten tweets) for inspection
FirstEntries = TempDict[:10]

# Last expression of the cell, so the notebook displays this tweet
FirstEntries[0]

{'created_at': 'Tue Apr 21 14:52:18 +0000 2020',
 'id': 1252611119996420096,
 'id_str': '1252611119996420096',
 'text': 'The advice her mother gave her at 10 years old, is the same advice Jane Goodall shares with young people today… https://t.co/VVKPvNzX93',
 'truncated': True,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/VVKPvNzX93',
    'expanded_url': 'https://twitter.com/i/web/status/1252611119996420096',
    'display_url': 'twitter.com/i/web/status/1…',
    'indices': [112, 135]}]},
 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
 'source': '<a href="https://studio.twitter.com" rel="nofollow">Twitter Media Studio</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 17471979,
  'id_str': '17471979',
  'name': 'National Geographic',
  'screen_name': 'NatGeo',
  'location': '

In [None]:
#################################################################################
#################################################################################
####### NOT for running - these are notes and the legend
#################################################################################

#     tweet['created_at']                               # Time Stamp of when tweet was created
#     tweet['id']                                       # tweet id Object (number)
#     tweet['id_str']                                   # tweet id String format (number)
#     tweet['text']                                     # text of tweet
#     tweet['entities']['hashtags'][i]['text']             # text of the i-th hashtag (hashtags is a list)
#     tweet['entities']['user_mentions'][i]['screen_name'] # screen name of the i-th person mentioned (user_mentions is a list)
#     tweet['entities']['user_mentions'][i]['name']        # name of the i-th person mentioned
#     tweet['user']['id']                               # id (object) of account user
#     tweet['user']['name']                             # name of account user
#     tweet['user']['screen_name']                      # Screen name of person
#     tweet['user']['location']                         # string, user input of their location
#     tweet['user']['description']                      # description of the account user
#     tweet['user']['followers_count']                  # number of accounts following the user
#     tweet['user']['friends_count']                    # number of accounts the user is following
#     tweet['user']['verified']                         # is the account user 'verified'
#     tweet['geo']                                      # is geo null or on
#     tweet['coordinates']                              # coordinates or null
#     tweet['place']                                    # tweet place description or null
#     tweet['retweeted_status']['id']                   # Original tweet id number object
#     tweet['retweeted_status']['id_str']               # Original tweet id number string
#     tweet['retweet_count']                            # number of times an original tweet has been retweeted
#     tweet['retweeted_status']['favorite_count']       # number of times a tweet has been favorited