# Part 1 -- Setup & Collection

Pull Tweets from the Twitter API and collect **all Tweets from 30 tech thought leaders and news outlets**.

### Load the project's `lib` code:

In [7]:
# Print the current working directory.
!pwd

/home/jovyan/work/Portfolio/predicting_stock_market_trends_with_Twitter


In [8]:
from os import chdir

# Switch to the project root before importing the local `lib` package
# (presumably so that `lib` resolves from here — TODO confirm).
# NOTE(review): absolute, machine-specific path.
project_root = '/home/jovyan/work/Portfolio/predicting_stock_market_trends_with_Twitter/'
chdir(project_root)

# NOTE(review): the star import hides which names come from `lib`;
# `suppress_warnings` is presumably one of them — confirm.
from lib import *
suppress_warnings()

### Create database with PyMongo and install Tweepy:

In [3]:
!pip install pymongo tweepy

Collecting pymongo
  Downloading pymongo-3.4.0-cp35-cp35m-manylinux1_x86_64.whl (359kB)
[K    100% |████████████████████████████████| 368kB 1.2MB/s ta 0:00:01
[?25hCollecting tweepy
  Downloading tweepy-3.5.0-py2.py3-none-any.whl
Collecting requests-oauthlib>=0.4.1 (from tweepy)
  Downloading requests_oauthlib-0.8.0-py2.py3-none-any.whl
Collecting oauthlib>=0.6.2 (from requests-oauthlib>=0.4.1->tweepy)
  Downloading oauthlib-2.0.2.tar.gz (125kB)
[K    100% |████████████████████████████████| 133kB 3.3MB/s ta 0:00:01
[?25hBuilding wheels for collected packages: oauthlib
  Running setup.py bdist_wheel for oauthlib ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/84/98/7a/fba7268f61097bea6081cbe5480bc439b38975748ea7684fd5
Successfully built oauthlib
Installing collected packages: pymongo, oauthlib, requests-oauthlib, tweepy
Successfully installed oauthlib-2.0.2 pymongo-3.4.0 requests-oauthlib-0.8.0 tweepy-3.5.0
[33mYou are using pip version 8.1.2, however ver

In [10]:
import pymongo

# Address of the MongoDB server used by this notebook.
# NOTE(review): hard-coded IP — consider reading it from configuration instead.
MONGO_HOST = '35.163.253.8'
cli = pymongo.MongoClient(host=MONGO_HOST)

In [11]:
# Start from a clean slate: drop any existing 'twitter_db' database.
# NOTE: destructive — anything previously stored in 'twitter_db' is deleted every time this cell runs.
cli.drop_database('twitter_db')

In [12]:
# Grab a handle to the database. It is created lazily: nothing is instantiated
# on the server (and it will not appear in the list below) until data is put in it.
twitter_db = cli['twitter_db']
cli.database_names()

['admin', 'local']

In [13]:
# Reference to the collection that lives inside 'twitter_db'; the database list
# is displayed again right after taking the reference.
twitter_collection = cli['twitter_db']['twitter_collection']
cli.database_names()

['admin', 'local']

### Pull data from Twitter API:

In [1]:
import tweepy

ImportError: No module named 'tweepy'

In [2]:
from lib.twitter_keys import my_keys

ImportError: No module named 'lib'

In [16]:
# Credentials for the Twitter API, looked up in the `my_keys` mapping.
# Application (consumer) key pair:
consumer_key = my_keys['CONSUMER_KEY']
consumer_secret = my_keys['CONSUMER_SECRET']
# User access-token pair:
access_token = my_keys['ACCESS_TOKEN']
access_token_secret = my_keys['ACCESS_SECRET']

In [18]:
# Table of the Twitter accounts to collect from (one row per account).
# NOTE(review): absolute, machine-specific path.
users_csv_path = '/home/jovyan/work/Portfolio/predicting_stock_market_trends_with_Twitter/data/twitter_users_30.csv'
twitter_30_df = pd.read_csv(users_csv_path)

# Show three random rows as a quick sanity check.
twitter_30_df.sample(3)

Unnamed: 0,Name,Handle,Followers,Description,Type,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
23,Carl Icahn,Carl Icahn,332000,"Chairman of Icahn Enterprises L.P.; etc., etc....",Thought Leader,,,,
16,Business Insider Tech,SAI,1440000,The latest tech news from @BusinessInsider.,News Outlet,,,,
14,Forbes Tech News,ForbesTech,2600000,Tech news and insights from @Forbes.,News Outlet,,,,


In [19]:
# Turn the 'Handle' column of the users table into a plain Python list.
# NOTE(review): the listing below contains 'Carl Icahn' (with a space), which is
# presumably not a valid Twitter screen name — verify the CSV entry.
handle = list(twitter_30_df['Handle'])

In [20]:
handle

['BarackObama',
 'cnnbrk',
 'BillGates',
 'realDonaldTrump',
 'TheEconomist',
 'aplusk',
 'HillaryClinton',
 'TechCrunch',
 'elonmusk',
 'NewYorker',
 'mcuban',
 'jack',
 'MarketWatch',
 'CNBC',
 'ForbesTech',
 'sacca',
 'SAI',
 'paulg',
 'themotleyfool',
 'ReformedBroker',
 'StockTwits',
 'cnntech',
 'MONEY',
 'Carl Icahn',
 'sgblank',
 'investorslive',
 'markflowchatter',
 'MarkYusko',
 'FinancialTimes',
 'tim_cook']

**Problem:** Most code samples on the internet use the Access Token Auth method, which is limited to 180 requests per 15 minutes (18,000 tweets per 15 minutes). If you download 18K tweets before the 15 minutes are up, you won’t be able to get any more results until the 15-minute window expires and you search again.

**Solution:** Use Application-only Auth instead of the Access Token Auth. Application-only auth has higher limits - 450 requests per 15 minutes (45,000 tweets per 15 minutes), which is 2.5 times more than the Access Token limit.

The secret is to use the AppAuthHandler instead of the more commonly used OAuthHandler, which you find in lots of code samples. This sets up App-only Auth and gives you higher limits. As an added bonus, notice the wait_on_rate_limit & wait_on_rate_limit_notify flags set to True. These make the Tweepy API call automatically wait (sleep) when it hits the rate limit and continue once the window expires. This saves you from having to program this part manually, which, as you’ll shortly see, makes your program much simpler and more elegant.

In [21]:
# Replaces the 30-handle list built from the CSV with just two handles
# (presumably a small subset for a trial run — TODO confirm).
handle = ['aplusk', 'BarackObama']

In [49]:
handle

['aplusk', 'BarackObama']

In [26]:
import sys

# App-only auth: only the application's consumer key and secret are needed
# (no user access token), taken here from `my_keys`.
try:
    auth = tweepy.AppAuthHandler(my_keys['CONSUMER_KEY'], my_keys['CONSUMER_SECRET'])
except tweepy.TweepError as auth_error:
    # Authentication problems surface here as a TweepError. A `tweepy.API` object is
    # always truthy, so testing `not api` afterwards could never detect a failure.
    print("Can't Authenticate: " + str(auth_error))
    sys.exit(-1)

# Initialize tweepy. With both rate-limit flags set, calls sleep when the rate limit
# is hit (printing a notice) and resume once the window expires.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [31]:
import csv  # imported here so the cell does not rely on `csv` leaking in from elsewhere

searchQuery = '#python'  # this is what we're searching for
maxTweets = 10000000  # Some arbitrary large number
tweetsPerQry = 100  # this is the max the API permits
fName = 'tweets.txt'  # We'll store the tweets in a text file.

# If results from a specific ID onwards are required, set sinceId to that ID;
# otherwise leave it as None for no lower limit (go as far back as the API allows).
sinceId = None

# If only results below a specific ID are wanted, set max_id to that ID;
# any value <= 0 means "no upper limit": start from the most recent matching tweet.
max_id = -1

tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))

# newline='' is what the csv module expects for files it writes to;
# utf-8 keeps non-ASCII tweet text intact.
with open(fName, 'w', newline='', encoding='utf-8') as f:
    # One writer and one header row for the whole file (not one per tweet).
    # The column names are kept unchanged; note that the "id" column actually
    # holds the author's screen name.
    writer = csv.writer(f)
    writer.writerow(["id", "created_at", "text"])

    while tweetCount < maxTweets:
        # Build the query once instead of branching over every max_id/sinceId combination.
        search_kwargs = {'q': searchQuery, 'count': tweetsPerQry}
        if max_id > 0:
            # Page backwards: only ask for tweets older than the last one already seen.
            search_kwargs['max_id'] = str(max_id - 1)
        if sinceId:
            search_kwargs['since_id'] = sinceId

        try:
            new_tweets = api.search(**search_kwargs)
        except tweepy.TweepError as e:
            # Just exit if any error
            print("some error : " + str(e))
            break

        if not new_tweets:
            print("No more tweets found")
            break

        # Write the text as str: encoding to bytes would put a b'...' repr in the file.
        writer.writerows(
            [tweet.user.screen_name, tweet.created_at, tweet.text]
            for tweet in new_tweets
        )

        tweetCount += len(new_tweets)
        print("Downloaded {0} tweets".format(tweetCount))
        # The oldest tweet of this page becomes the upper bound for the next request.
        max_id = new_tweets[-1].id

print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))

Downloading max 10000000 tweets
Downloaded 100 tweets
Downloaded 200 tweets
Downloaded 300 tweets
Downloaded 400 tweets
Downloaded 500 tweets
Downloaded 600 tweets
Downloaded 700 tweets
Downloaded 800 tweets
Downloaded 900 tweets
Downloaded 1000 tweets
Downloaded 1100 tweets
Downloaded 1200 tweets
Downloaded 1300 tweets
Downloaded 1400 tweets
Downloaded 1500 tweets
Downloaded 1600 tweets
Downloaded 1700 tweets
Downloaded 1800 tweets
Downloaded 1900 tweets
Downloaded 2000 tweets
Downloaded 2100 tweets
Downloaded 2200 tweets
Downloaded 2300 tweets
Downloaded 2400 tweets
Downloaded 2500 tweets
Downloaded 2600 tweets
Downloaded 2700 tweets
Downloaded 2800 tweets
Downloaded 2900 tweets
Downloaded 3000 tweets
Downloaded 3100 tweets
Downloaded 3200 tweets
Downloaded 3300 tweets
Downloaded 3400 tweets
Downloaded 3500 tweets
Downloaded 3600 tweets
Downloaded 3700 tweets
Downloaded 3800 tweets
Downloaded 3900 tweets
Downloaded 4000 tweets
Downloaded 4100 tweets
Downloaded 4200 tweets
Downloaded 

In [46]:
# import csv

# maxTweets = 10000000 # Some arbitrary large number
# tweetsPerQry = 200  # this is the max the API permits
# fName = 'tweets.csv' # We'll store the tweets in a csv file.


# # If results from a specific ID onwards are reqd, set since_id to that ID.
# # else default to no lower limit, go as far back as API allows
# sinceId = None

# # If results only below a specific ID are, set max_id to that ID.
# # else default to no upper limit, start from the most recent tweet matching the search query.
# max_id = -1E10

# tweetCount = 0
# print("Downloading max {0} tweets".format(maxTweets))

# with open(fName, 'w') as f:
#     while tweetCount < maxTweets:
#         try:
#             for h in handle:
#                 if (max_id <= 0):
#                     if (not sinceId):
#                         new_tweets = api.user_timeline(screen_name = h, count=tweetsPerQry)
#                     else:
#                         new_tweets = api.user_timeline(screen_name = h, count=tweetsPerQry,
#                                             since_id=sinceId)
#                 else:

#                     if (not sinceId):
#                         new_tweets = api.user_timeline(screen_name = h, count=tweetsPerQry,
#                                             max_id=str(max_id - 1))
#                     else:
#                         new_tweets = api.user_timeline(screen_name = h, count=tweetsPerQry,
#                                             max_id=str(max_id - 1),
#                                             since_id=sinceId)
#                 if not new_tweets:
#                     print("No more tweets found")
#                     break
#                 for tweet in new_tweets:
#                     outtweets = [[tweet.user.screen_name, tweet.created_at, tweet.text.encode("utf-8")]]
#                     writer = csv.writer(f)
#                     writer.writerow(["id","created_at","text"])
#                     writer.writerows(outtweets)
              
#             tweetCount += len(new_tweets)
#             print("Downloaded {0} tweets".format(tweetCount))
#             max_id = new_tweets[-1].id  

#         except tweepy.TweepError as e:
#             # Just exit if any error
#             print("some error : " + str(e))
#             break

# print ("\n Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))

Downloading max 10000000 tweets
Downloaded 200 tweets
Downloaded 400 tweets
Downloaded 600 tweets
Downloaded 800 tweets
Downloaded 1000 tweets
Downloaded 1200 tweets
Downloaded 1400 tweets
Downloaded 1600 tweets
Downloaded 1800 tweets
Downloaded 2000 tweets
Downloaded 2200 tweets
Downloaded 2399 tweets
Downloaded 2599 tweets
Downloaded 2796 tweets
Downloaded 2996 tweets
Downloaded 3196 tweets
Downloaded 3233 tweets
No more tweets found
Downloaded 3233 tweets


IndexError: list index out of range

In [None]:
# Load the downloaded tweets and list the distinct values of the 'id' column.
# NOTE(review): 'tweets.csv' is only written by the user-timeline download cell,
# which is currently commented out; the active search cell writes 'tweets.txt'.
all_tweets = pd.read_csv('tweets.csv')
all_tweets['id'].unique()

In [None]:
# Delete tweets.csv from the current working directory.
!rm tweets.csv