In [30]:
import pandas as pd
import json
import plotly.express as px
import plotly.graph_objects as go
import tweepy
import sys
from itertools import cycle

In [2]:
def load_credentials():
    """
    Build the credentials template, write it to ``key.json`` and hand it back
    :return: dictionary holding the API key, the API secret key and the bearer token
    """
    # placeholder values - replace them with the keys of your own Twitter developer app
    api_token = dict([
        ("API Key", "your API Key"),
        ("API Secret Key", "your API Secret Key"),
        ("Bearer token", "your Bearer token"),
    ])
    # serialise first, then write; mode 'w' overwrites any existing key.json
    serialised = json.dumps(api_token)
    with open('key.json', 'w') as handle:
        handle.write(serialised)
    return api_token

In [3]:
def oath1adance(api_key):
    """
    Perform the interactive OAuth 1.0a authentication ("dance") for a given key
    :param api_key: dictionary holding the 'API Key' and 'API Secret Key' supplied to you by twitter
    :return: the same dictionary, extended with 'access_token' and 'access_token_secret'
    :raises tweepy.TweepError: if the request token (authorization url) cannot be obtained
    """
    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(api_key['API Key'], api_key['API Secret Key'])
    try:
        # This will give a redirect link to access the application and will require a verification code
        redirect_url = auth.get_authorization_url()
    except tweepy.TweepError:
        print("Error! Failed to get request token.")
        # without a request token there is nothing the user could verify,
        # so stop here instead of prompting for a verifier that cannot work
        raise
    print(redirect_url)
    verifier = input("Verifier: ")
    # Get access token
    auth.get_access_token(verifier)
    # keep the tokens in the dictionary (they are not written to key.json here)
    api_key['access_token'] = auth.access_token
    api_key['access_token_secret'] = auth.access_token_secret
    return api_key

In [4]:
def get_auth(key):
    """
    Build a tweepy API object and check that twitter accepts the credentials
    :param key: key dict after performing the oath1adance; must hold 'API Key',
                'API Secret Key', 'access_token' and 'access_token_secret'
    :return: tweepy api object
    Exits via sys.exit('Exit') when the credentials cannot be verified.
    """
    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(key['API Key'], key['API Secret Key'])
    auth.set_access_token(key['access_token'], key['access_token_secret'])
    # Creation of the actual interface, using authentication
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    try:
        # verify_credentials may signal rejected credentials by returning a falsy
        # value instead of raising, so the return value is checked as well
        verified = api.verify_credentials()
    except Exception as error:
        # report the cause, but never print the key dict itself: it holds secrets
        print(f"Error during authentication: {error}")
        sys.exit('Exit')
    if not verified:
        print("Error during authentication: the credentials were rejected")
        sys.exit('Exit')
    return api

In [5]:
def get_tweets(candidate_list, language='he', max_tweets_per_candidate=10000, client=None):
    """
    Given a set of candidates, download tweets containing their full name and,
    once the full-name search is exhausted, tweets containing their last name
    :param candidate_list: a list of candidates to download
    :param language: the chosen language of the tweets
    :param max_tweets_per_candidate: maximum number of tweets to download per candidate
    :param client: object exposing a tweepy-style ``search`` method; when omitted the
                   notebook-level ``api`` object is used
    :return: (tweets, lens)
    tweets: a data frame containing all tweets
    lens: a dictionary containing the number of tweets received per candidate
    """
    # fall back to the notebook-level api object when no client is passed explicitly
    if client is None:
        client = api
    # the standard search endpoint returns at most 100 tweets per request
    batch_limit = 100
    # an empty list to store the tweets
    tweets = []
    # a new dictionary to store number of tweets per candidate
    lens = {}
    for candidate in candidate_list:
        # start with the full name, fall back to the last name afterwards
        search_last = False
        # last whitespace-separated word; also works for single-word names
        name_parts = candidate.split()
        last_name = name_parts[-1] if name_parts else candidate
        # id of the oldest tweet downloaded so far (None until the first batch arrives);
        # it is kept when switching to the last-name search, which therefore
        # continues with older tweets only
        last_id = None
        # initialize the dict with 0 tweets downloaded
        lens[candidate] = 0
        # enforce the budget per candidate, so one popular candidate
        # cannot use up the budget of the others
        while lens[candidate] < max_tweets_per_candidate:
            search_kwargs = {
                'q': last_name if search_last else candidate,
                'count': min(max_tweets_per_candidate - lens[candidate], batch_limit),
                'lang': language,
            }
            # only page backwards once there is a tweet to page from
            if last_id is not None:
                search_kwargs['max_id'] = str(last_id - 1)
            try:
                new_tweets = client.search(**search_kwargs)
            except tweepy.TweepError as e:
                print(e)
                break
            if not new_tweets:
                # nothing left for the last name either: finished with this candidate
                if search_last:
                    break
                # nothing (more) for the full name: try the last name
                search_last = True
                continue
            # add the downloaded tweets to the tweets list
            tweets.extend(new_tweets)
            # save the last id to resume from it
            last_id = new_tweets[-1].id
            # save the new length
            lens[candidate] += len(new_tweets)
        # print how many tweets downloaded per candidate
        print(f'Got {lens[candidate]} Tweets for {candidate}!!')
    # extract the raw information from the tweets
    tweets = [tweet._json for tweet in tweets]
    # return a data frame containing all tweets
    return pd.DataFrame(data=tweets), lens

In [5]:
# build the credentials dict, then run the interactive OAuth flow on it
# (oath1adance returns the dict it was given, extended with the access tokens)
key = load_credentials()
key = oath1adance(key)
# authenticated tweepy client; get_tweets reads this notebook-level name
api = get_auth(key)

In [2]:
# full names ("first last") of the candidates to search for; the same strings are
# used as search queries and as substrings when flagging mentions in tweet text,
# so they must be spelled the way people actually write them
candidate_list = ['יאיר לפיד', 'בני גנץ', 'בצלאל סמוטריץ', 'נפתלי בנט', 'מרב מיכאלי', 'גדעון סער',
              'בנימין נתניהו', 'ירון זליכה', 'ניצן הורוביץ', 'אריה דרעי', 'משה גפני',
              'אביגדור ליברמן', 'איימן עודה', 'מנסור עבאס']
# language code passed to the twitter search
language = 'he'
# download budget per candidate
max_tweets_per_candidate = 30000

In [6]:
# get all tweets and the number of tweets downloaded per candidate
df, total_tweets_per_candidate = get_tweets(candidate_list, language, max_tweets_per_candidate)
# save them for future use if you want
df.to_csv('candidates.csv', index=False)
# create a table containing each candidate name with its tweet count;
# the columns are named at construction, which also works for an empty dictionary
sdf = pd.DataFrame(list(total_tweets_per_candidate.items()), columns=['name', 'total_tweets'])
# save them for future use if you want
sdf.to_csv('candidates_final_counts.csv', index=False)


In [49]:
# NOTE(review): the download cell writes 'candidates.csv'; this cell reads a dated
# file instead - presumably a renamed copy of an earlier download, confirm it exists
df = pd.read_csv('candidates_06_04_2021.csv')
print(f'We have a total of {len(df)} tweets')
df = (
    # transform the tweet date strings to dates in one vectorised call
    df.assign(created_at=pd.to_datetime(df['created_at']))
    # set the date as an index
    .set_index('created_at')
    # drop all duplicated tweets that have the same text (the first occurrence is kept)
    .drop_duplicates(subset=['text'])
)
print(f'We have a total of {len(df)} tweets after dropping the duplicates')

We have a total of 40963 tweets
We have a total of 21388 tweets after dropping the duplicates


In [45]:
# flag, per candidate, the tweets whose text contains the candidate's full name
for candidate in candidate_list:
    # literal (non-regex) substring match; a tweet with missing text counts as "no mention"
    df[candidate] = df['text'].str.contains(candidate, regex=False, na=False)

In [46]:
# split the tweets by day
ddf = df.resample('D')
# sum the daily amount of tweets; only the candidate flag columns are aggregated,
# so the text and metadata columns are never summed.
# iloc[1:-1] drops the first and the last day - presumably because they are only
# partially covered by the download (TODO confirm)
sum_df = ddf[candidate_list].sum().iloc[1:-1]
# plot the raw daily amount per candidate
fig = px.bar(sum_df, color_discrete_sequence=px.colors.qualitative.Alphabet)
fig.write_html('bar_plot.html')
fig.show()

In [47]:
# endless colour supply, so candidates beyond the palette length reuse colours
palette = cycle(px.colors.qualitative.Alphabet)
# one bar trace per candidate, drawn side by side for every day
fig = go.Figure()
for candidate, colour in zip(candidate_list, palette):
    daily_counts = sum_df[candidate]
    trace = go.Bar(x=sum_df.index, y=daily_counts, name=candidate, marker_color=colour)
    fig.add_trace(trace)
fig.update_layout(barmode='group')
# NOTE(review): the file name says "stacked" although barmode is 'group' - name kept as is
fig.write_html('stacked_bar_plot.html')
fig.show()