In [1]:
# import libraries
import pandas as pd, numpy as np
import tweepy
import random

from configparser import RawConfigParser

In [3]:
# we need to create a dataframe combining all the credibility-scored tweets that are broken down by topic.
# the data lives in the topic_tweets subfolder as one CSV per topic (topic_0.csv ... topic_223.csv);
# each file is expected to provide the columns
# tweet_id, user_id, create_time, topic_key and is_credible.

# read every topic file once and collect the resulting frames in a list.
# calling pd.concat inside the loop would re-copy all previously loaded rows
# on every iteration, so the frames are concatenated in a single call instead.
topic_frames = []
for i in range(224):
    topic_frames.append(pd.read_csv(f'../datasets/topic_tweets/topic_{i}.csv'))

# row labels from the individual files are kept as-is (no ignore_index),
# so index labels repeat across topics; downstream cells select rows by position.
df_tweets = pd.concat(topic_frames)

In [4]:
# class balance of the full dataset, as proportions rather than raw counts:
# roughly 66% of tweets are labelled credible (1) and 34% are rumours (0).
df_tweets['is_credible'].value_counts(normalize=True)

1    0.665757
0    0.334243
Name: is_credible, dtype: float64

In [5]:
# the full dataset is too large for us to work with given computational limitations and time constraints.
# feel free to continue the analysis using the full dataset in df_tweets,
# or otherwise you can follow these steps to work with a subset of the data.
# we use a dataset containing 1.5 million tweets.
# note: the sample is drawn from a seeded generator so that re-running the notebook
# selects the same rows every time. the original analysis used an unseeded sample,
# so the tweets selected here will differ from the ones used in that run.
SAMPLE_SIZE = 1_500_000
SAMPLE_SEED = 42

n_tweets = df_tweets.shape[0]

# sample row positions from 0 (inclusive) so the first row is eligible too,
# and cap the sample size at the number of available rows so that a smaller
# dataset yields all of its rows instead of raising ValueError.
random_indices = random.Random(SAMPLE_SEED).sample(
    range(n_tweets), min(SAMPLE_SIZE, n_tweets)
)

In [7]:
# build the working dataframe by picking the sampled row positions
# (all columns) out of the full dataset
df = df_tweets.iloc[random_indices, :]

# sanity check: print the (rows, columns) shape, then preview the first rows
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()

(1500000, 5)


Unnamed: 0,tweet_id,user_id,create_time,topic_key,is_credible
61074,555906559755452417,LoveDeedums,2015-01-16 01:57:06,birthday_king_martin-20150115_170445-20150115_...,1
56209,562882182310809600,pomaikai44,2015-02-04 07:55:44,plane_crash_taiwan-20150204_062513-20150204_07...,1
98170,552866326331797504,MAZANDARA,2015-01-07 16:36:18,#charliehebdo_paris_attack-20150107_072714-201...,1
8782,565678607461523457,Balshe79,2015-02-12 01:07:44,#chapelhillshooting_were_media-20150211_142604...,0
191070,557376485049511936,nataliapresli,2015-01-20 03:18:04,king_martin_luther-20150119_130240-20150119_13...,1


In [8]:
# finally, we give the tweet_ids in 'df' to the Twitter API using Tweepy, to pull the tweet text.
# first we need to authenticate our request by providing tokens.
# sign up for the Twitter API online, generate your own tokens and add them to config.ini
# under a [twitter] section with the keys read below.
config = RawConfigParser()

# RawConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. an empty result therefore means config.ini is missing, which
# would otherwise only surface as an unhelpful KeyError on the 'twitter' section.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini not found: create it with a [twitter] section containing "
        "bearer_token, api_key, api_key_secret, access_token and access_token_secret"
    )

twitter_config = config['twitter']

bear_token = twitter_config['bearer_token']

api_key = twitter_config['api_key']
api_key_secret = twitter_config['api_key_secret']

access_token = twitter_config['access_token']
access_token_secret = twitter_config['access_token_secret']

# initialize call to Twitter API v2, with authentication
client = tweepy.Client(
    bearer_token=bear_token,
    consumer_key=api_key,
    consumer_secret=api_key_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
    wait_on_rate_limit=True  # sleep instead of failing when the rate limit is hit (approx 10K tweets per 15 mins)
)

In [9]:
# twitter API can only return tweets in batches of 100
# define function to split list of tweet IDs into a nested format, ie, a list of 100-item lists

def id_grouper(tweet_ids, group_size=100):
    """Split tweet IDs into consecutive groups of at most ``group_size`` items.

    Parameters
    ----------
    tweet_ids : iterable
        The IDs to split, e.g. a pandas Series or a plain list.
    group_size : int, default 100
        Maximum number of IDs per group.

    Returns
    -------
    list of list
        The IDs in their original order. Every group holds ``group_size``
        items except possibly the last, which holds the remainder. An empty
        input gives an empty list.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    # materialise once so any iterable can be sliced by position
    ids = list(tweet_ids)

    # slicing never reads past the end, so a final partial group is returned
    # as-is instead of raising IndexError
    return [ids[start:start + group_size] for start in range(0, len(ids), group_size)]

In [6]:
# loop to make repeated API calls, one per group of tweet IDs.
# note: will pause once rate limit is exceeded.
# our dataset of 1.5M tweets took about 10 hours.
# about half of the tweet_ids have been removed from Twitter.
# as a result, text was fetched for about 50% of the 1.5 million we attempted.

# collected as [tweet_id, text] pairs
tweet_text = []

for group in id_grouper(df.tweet_id):

    tweets = client.get_tweets(group)

    # the response's data is None (not an empty list) when none of the requested
    # tweets could be returned; fall back to an empty list so such a batch is
    # skipped instead of aborting a multi-hour run with a TypeError.
    for tweet in tweets.data or []:
        tweet_text.append([tweet.id, tweet.text])

Rate limit exceeded. Sleeping for 802 seconds.
Rate limit exceeded. Sleeping for 802 seconds.
Rate limit exceeded. Sleeping for 806 seconds.
Rate limit exceeded. Sleeping for 806 seconds.
Rate limit exceeded. Sleeping for 806 seconds.
Rate limit exceeded. Sleeping for 805 seconds.
Rate limit exceeded. Sleeping for 804 seconds.
Rate limit exceeded. Sleeping for 806 seconds.
Rate limit exceeded. Sleeping for 808 seconds.
Rate limit exceeded. Sleeping for 807 seconds.
Rate limit exceeded. Sleeping for 807 seconds.
Rate limit exceeded. Sleeping for 806 seconds.
Rate limit exceeded. Sleeping for 806 seconds.
Rate limit exceeded. Sleeping for 808 seconds.
Rate limit exceeded. Sleeping for 807 seconds.
Rate limit exceeded. Sleeping for 807 seconds.
Rate limit exceeded. Sleeping for 807 seconds.
Rate limit exceeded. Sleeping for 806 seconds.
Rate limit exceeded. Sleeping for 807 seconds.
Rate limit exceeded. Sleeping for 807 seconds.
Rate limit exceeded. Sleeping for 807 seconds.
Rate limit ex

In [None]:
# many of the fetched tweets are retweets, and so contain duplicate content.
# after de-duplication the original run was left with 344,208 unique tweets.
# NOTE(review): drop_duplicates is called without a subset, so only rows whose
# tweet_id AND text both repeat are removed - confirm this is the intended rule.
df_tweet_texts = (
    pd.DataFrame(tweet_text, columns=['tweet_id','text'])
    .drop_duplicates()
)

# attach the fetched text to the credibility-scored tweets; rows containing any
# missing value (including tweets whose text could not be fetched) are discarded.
# NOTE(review): the join runs against the full df_tweets, not the sampled subset.
df = df_tweets.merge(df_tweet_texts, how='left', on='tweet_id').dropna()

# write the final training data to disk, without the index column
df.to_csv('../datasets/final_train_data.csv', index=False)