In [1]:
import json
import os

import pandas as pd
import tweepy

In [2]:
def download_twitter(searchQuery, fName, maxTweets = float('inf'), tweetsPerQry = 100, api_client=None):
    """
    Download tweets with the Twitter search API and store them in a text file,
    one JSON document per line.

    :param searchQuery: the search query string, passed to the API as ``q``.
    :param fName: path of the text file to (over)write with the tweets.
    :param maxTweets: upper bound on the number of tweets to download.
           The default (infinity) keeps paging until the API returns nothing.
    :param tweetsPerQry: how many tweets to request per page; the original
           notes state that 100 is the max the API permits.
    :param api_client: object exposing ``search(**kwargs)``. When None, the
           notebook-level ``api`` object is used, as before.
    :return: None
    """
    # Resolve the client at call time so the function still works with the
    # notebook's global `api` when no client is passed explicitly.
    client = api if api_client is None else api_client

    # If results from a specific ID onwards are required, set sinceId to that ID;
    # otherwise there is no lower limit and we go as far back as the API allows.
    # The API returns the most recent tweets first, so sinceId is where paging
    # stops and max_id is where the next page starts.
    sinceId = None

    # If only results below a specific ID are wanted, set max_id to that ID;
    # otherwise start from the most recent tweet matching the query.
    max_id = -1

    tweetCount = 0
    print("Downloading max {0} tweets".format(maxTweets))
    with open(fName, 'w', encoding='utf-8') as f:
        while tweetCount < maxTweets:
            # Never ask for more than what is still missing, so the total
            # cannot overshoot maxTweets. (maxTweets may be float('inf'),
            # hence the comparison before the int conversion.)
            remaining = maxTweets - tweetCount
            if remaining >= tweetsPerQry:
                page_size = tweetsPerQry
            else:
                page_size = max(1, int(remaining))

            params = {
                'q': searchQuery,
                'count': page_size,
                'tweet_mode': 'extended',
                'lang': 'en',
            }
            if max_id > 0:
                # max_id is inclusive, so step below the last tweet already saved.
                params['max_id'] = str(max_id - 1)
            if sinceId:
                params['since_id'] = sinceId

            try:
                new_tweets = client.search(**params)
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet in new_tweets:
                    # tweet._json is the raw dict; store it as one JSON line.
                    f.write(json.dumps(tweet._json) + '\n')
                tweetCount += len(new_tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = new_tweets[-1].id
            except tweepy.TweepError as e:
                # Just exit if any error
                print("some error : " + str(e))
                break

    print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))


In [29]:
def get_full_text(data):
    """
    Return the full text of a tweet dict.

    When the dict carries a ``retweeted_status`` entry with a ``full_text``,
    that text is returned; otherwise the tweet's own ``full_text`` is used.

    :param data: tweet as a dict (e.g. one parsed JSON line).
    :return: the selected ``full_text`` value.
    :raises KeyError: if neither location provides ``full_text``.
    """
    try:
        # get full text for retweets
        return data['retweeted_status']['full_text']
    except (KeyError, TypeError):
        # KeyError: not a retweet (or no full_text on it);
        # TypeError: retweeted_status is present but not a mapping (e.g. None).
        # Anything else (KeyboardInterrupt, real bugs) is left to propagate.
        return data['full_text']


In [None]:
# Credentials are read from the environment so that secrets never live in the
# notebook or its version history. Set TWITTER_CONSUMER_KEY and
# TWITTER_CONSUMER_SECRET before running this cell; a missing variable raises
# KeyError naming it.
# NOTE(review): the key pair previously hardcoded in this cell should be
# treated as leaked and revoked.
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']

# Application-only auth; the client is configured to wait (and notify)
# when the rate limit is hit.
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

download_twitter(searchQuery='#BTC', fName='tweets_BTC.txt')

Downloading max inf tweets
Downloaded 98 tweets
Downloaded 198 tweets
Downloaded 288 tweets
Downloaded 380 tweets
Downloaded 477 tweets
Downloaded 563 tweets
Downloaded 650 tweets
Downloaded 742 tweets
Downloaded 824 tweets
Downloaded 909 tweets
Downloaded 997 tweets
Downloaded 1089 tweets
Downloaded 1171 tweets
Downloaded 1253 tweets
Downloaded 1338 tweets
Downloaded 1417 tweets
Downloaded 1493 tweets
Downloaded 1573 tweets
Downloaded 1649 tweets
Downloaded 1719 tweets
Downloaded 1779 tweets
Downloaded 1833 tweets
Downloaded 1914 tweets
Downloaded 1987 tweets
Downloaded 2062 tweets
Downloaded 2122 tweets
Downloaded 2213 tweets
Downloaded 2306 tweets
Downloaded 2402 tweets
Downloaded 2497 tweets
Downloaded 2586 tweets
Downloaded 2686 tweets
Downloaded 2773 tweets
Downloaded 2868 tweets
Downloaded 2968 tweets
Downloaded 3059 tweets
Downloaded 3159 tweets
Downloaded 3259 tweets
Downloaded 3347 tweets
Downloaded 3442 tweets
Downloaded 3542 tweets
Downloaded 3637 tweets
Downloaded 3737 twe

In [32]:
 #Data Cleaning

# The download step writes one JSON document per line, so parse each line with
# json.loads directly instead of routing it through the CSV reader (which
# applies CSV quoting rules to JSON text). Blank lines are skipped.
with open('tweets_BTC.txt', encoding='utf-8') as tweets_file:
    tweets_raw = [json.loads(line) for line in tweets_file if line.strip()]

# Column order of the exported CSV.
tweet_columns = [
    'id',
    'text',
    'date',
    'favorite_count',
    'in_reply_to_status_id',
    'screen_name',
    'followers_count',
]

# Named tweets_clean (not `all`) so the builtin all() is not shadowed.
# Built in one pass from plain records; passing `columns` keeps the header
# even when the input file is empty.
tweets_clean = pd.DataFrame(
    [
        {
            'id': tweet['id_str'],  # string format
            'text': get_full_text(tweet),
            'date': tweet['created_at'],
            'favorite_count': tweet['favorite_count'],
            # contains retweets with comment and replies, string format
            'in_reply_to_status_id': tweet['in_reply_to_status_id_str'],
            'screen_name': tweet['user']['screen_name'],
            'followers_count': tweet['user']['followers_count'],
        }
        for tweet in tweets_raw
    ],
    columns=tweet_columns,
)
tweets_clean.to_csv('tweets_BTC.csv', encoding='utf-8-sig', index=False)
