In [116]:
import pandas as pd
import numpy as np
import seaborn as sns
import tweepy
from tweepy import OAuthHandler
import yaml
import os
import json
import searchtweets
import ast
import gensim
import re
%matplotlib inline

In [2]:
## open twitter authentication details
with open('config.yaml', 'r') as file:
    twitter_auth = yaml.load(file)

In [3]:
# Twitter auth details
consumer_key = twitter_auth['consumer_key']
consumer_secret= twitter_auth['consumer_secret']
access_token = twitter_auth['access_token']
access_token_secret = twitter_auth['access_token_secret']

In [4]:
# Using tweepy to connect to twitter (not useful anymore)
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
 
api = tweepy.API(auth)

In [5]:
# Decided to use twitters searchtweets api 
# setting my search arguements using the premium search feature
premium_search_args = searchtweets.load_credentials(".twitter_keys.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)

Grabbing bearer token from OAUTH


In [25]:
premium_search_args_femi = searchtweets.load_credentials(".twitter_keys_femi.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)

Grabbing bearer token from OAUTH


In [26]:
def retrieve_tweets(keyword="buhari", no_results=100, from_date="", to_date="", 
                    premium_search_args=premium_search_args_femi):
    """
    Get tweets matching a certain keyword during a certain date range and return them as a list.
    """
    rule = searchtweets.gen_rule_payload(keyword,
                        from_date=from_date,
                        to_date=to_date,
                        results_per_call=no_results)
    rs = searchtweets.ResultStream(rule_payload=rule,
                  max_results=no_results,
                  max_pages=no_results,
                  **premium_search_args)
    tweets = list(rs.stream())
    return tweets

In [27]:
def create_tweets_df(dates_list, keyword="buhari", from_date="", to_date="", no_results=100):
    "Create a dataframe from the tweets covering the dates in the dates_list"
    tweets = []
    for date in dates_list:
        tweets += [*retrieve_tweets(keyword, no_results, from_date=date+" 15:00", to_date=date+" 16:00")]
    tweets_df = pd.DataFrame(tweets)
    tweets_df.dropna(subset=['text'], inplace=True)
    tweets_df.created_at = pd.to_datetime(tweets_df.created_at)
    tweets_df['created_date'] = tweets_df.created_at.apply(lambda x : x.strftime("%b %Y"))
    tweets_df.sort_values(by='created_at', inplace=True)
    tweets_df['text_shortened'] = tweets_df.text.apply(lambda x: x[:25].lower())
    tweets_df.drop_duplicates(subset=['text_shortened', 'created_date'], inplace=True)
    tweets_df.drop(labels=['text_shortened'], axis=1, inplace=True)
    return tweets_df

<br>

<br>

In [12]:
tweets_df = pd.read_csv("./data/buhari_tweets.csv")
tweets_df.shape

(2786, 37)

In [None]:
# tweets_df.user = tweets_df.user.apply(lambda x: ast.literal_eval(x))
# tweets_df['username'] = tweets_df.user.apply(lambda x: x['name'])
# tweets_df['user_location'] = tweets_df.user.apply(lambda x: x['location'])

In [64]:
dates_list = ["2017-01-05", "2016-01-05",
              "2018-03-05", "2017-03-05", "2016-03-05",
              "2017-06-05", "2016-06-05",
              "2017-09-05", "2016-09-05",
              "2017-11-05", "2016-11-05", "2015-11-05"]

new_tweets = create_tweets_df(dates_list)
new_tweets.shape

(759, 37)

In [65]:
tweets_df = pd.concat([tweets_df, new_tweets])
tweets_df.reset_index(drop=True, inplace=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


In [66]:
tweets_df.shape

(6002, 37)

In [67]:
tweets_df.to_csv("./data/buhari_tweets.csv", index=False)

In [68]:
tweets_df_2 = tweets_df[['created_at', 'created_date', 'text', 'user']]

In [69]:
tweets_df_2.tail(10)

Unnamed: 0,created_at,created_date,text,user
5992,2018-03-05 15:59:35,Mar 2018,GEN. BUHARI (rtd.) SPEAKS ON FUEL SUBSIDY REMO...,"{'id': 58883208, 'id_str': '58883208', 'name':..."
5993,2018-03-05 15:59:43,Mar 2018,RT @TheOsasuShow: BREAKING: After much pressur...,"{'id': 947072341275762688, 'id_str': '94707234..."
5994,2018-03-05 15:59:44,Mar 2018,RT @jacksonpbn: Buhari is a useless President....,"{'id': 2336569835, 'id_str': '2336569835', 'na..."
5995,2018-03-05 15:59:44,Mar 2018,"Are we still on this matter, under Buhari will...","{'id': 151055829, 'id_str': '151055829', 'name..."
5996,2018-03-05 15:59:51,Mar 2018,RT @timehinlekan: The best political calculati...,"{'id': 234064106, 'id_str': '234064106', 'name..."
5997,2018-03-05 15:59:53,Mar 2018,RT @AnthonyEhilebo: I heard you people have no...,"{'id': 3302993783, 'id_str': '3302993783', 'na..."
5998,2018-03-05 15:59:56,Mar 2018,RT @MobilePunch: #Punchnewspapers #Opinion\n ...,"{'id': 1506814836, 'id_str': '1506814836', 'na..."
5999,2018-03-05 15:59:57,Mar 2018,RT @Asiwaju_limited: Retweet for puff puff lik...,"{'id': 86312012, 'id_str': '86312012', 'name':..."
6000,2018-03-05 15:59:57,Mar 2018,"Buhari Set To Visit Benue Taraba, Zamfara And ...","{'id': 4853245847, 'id_str': '4853245847', 'na..."
6001,2018-03-05 15:59:59,Mar 2018,CybokNews Now: (Buhari meets Liberian Presiden...,"{'id': 2990841970, 'id_str': '2990841970', 'na..."


In [70]:
by_date = tweets_df_2.groupby(by='created_date').count()

In [71]:
by_date

Unnamed: 0_level_0,created_at,text,user
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apr 2015,56,56,56
Apr 2016,41,41,41
Apr 2017,46,46,46
Aug 2015,42,42,42
Aug 2016,55,55,55
Aug 2017,42,42,42
Aug 2018,82,82,82
Feb 2015,60,60,60
Feb 2016,38,38,38
Feb 2017,52,52,52


In [72]:
tweets_df_2.to_csv("./data/buhari_tweets_short.csv", index=False)

In [74]:
tweets_df_2.head()

Unnamed: 0,created_at,created_date,text,user
0,2015-04-30 16:54:00,Apr 2015,RT @countfreemont: Before elections: I will st...,"{'id': 38511618, 'id_str': '38511618', 'name':..."
1,2015-04-30 16:55:00,Apr 2015,"If Buhari Makes N1Naira Equal to 1Dollar, I’ll...","{'id': 2272812642, 'id_str': '2272812642', 'na..."
2,2015-04-30 16:55:00,Apr 2015,Charly Boy Vows To Ride Bike From Nigeria To C...,"{'id': 3078228369, 'id_str': '3078228369', 'na..."
3,2015-04-30 16:55:00,Apr 2015,@RadioPaparazi.paparazi radio buhari i dey fee...,"{'id': 3081268377, 'id_str': '3081268377', 'na..."
4,2015-04-30 16:56:00,Apr 2015,Decline in revenues due to fall in oil prices ...,"{'id': 2724933587, 'id_str': '2724933587', 'na..."


In [80]:
tweets_to_analyse = tweets_df_2[(tweets_df_2.created_date.str.contains("Jan")) | 
                                (tweets_df_2.created_date.str.contains("Mar")) |
                                (tweets_df_2.created_date.str.contains("Jun")) |
                                (tweets_df_2.created_date.str.contains("Sep")) |
                                (tweets_df_2.created_date.str.contains("Nov")) &
                                (tweets_df_2.created_date != 'Jan 2015') &
                                (tweets_df_2.created_date != 'Mar 2015')]

tweets_to_analyse.created_at = pd.to_datetime(tweets_to_analyse.created_at)
tweets_to_analyse.sort_values(by='created_at', inplace=True)
tweets_to_analyse = delete_duplicates(tweets_to_analyse)
tweets_to_analyse.reset_index(inplace=True, drop=True)

In [84]:
tweets_to_analyse.created_date.unique()

array(['Jan 2015', 'Jan 2016', 'Jan 2017', 'Jan 2018', 'Jun 2015',
       'Jun 2016', 'Jun 2017', 'Jun 2018', 'Mar 2015', 'Mar 2016',
       'Mar 2017', 'Mar 2018', 'Nov 2015', 'Nov 2016', 'Nov 2017',
       'Sep 2015', 'Sep 2016', 'Sep 2017', 'Sep 2018', 'Nov 2018'],
      dtype=object)

In [85]:
tweets_to_analyse.shape

(5005, 4)

In [103]:
def delete_duplicates(df, str_len=25):
    df['text_shortened'] = df.text.apply(lambda x: x[:str_len].lower())
    df.drop_duplicates(subset=['text_shortened', 'created_date'], inplace=True)
    df.drop(labels=['text_shortened'], axis=1, inplace=True)
    return df

In [105]:
tweets_to_analyse.shape

(3368, 4)

In [123]:
def preprocess_tweet(tweet):
    #Preprocess the text in a single tweet
    #arguments: tweet = a single tweet in form of string 
    #convert the tweet to lower case
    tweet.lower()
    #convert all urls to sting "URL"
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #convert all @username to "AT_USER"
    tweet = re.sub('@[^\s]+','AT_USER', tweet)
    #correct all multiple white spaces to a single white space
    tweet = re.sub('[\s]+', ' ', tweet)
    #convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #convert RT to whote space
    tweet = re.sub(r'RT', '', tweet)
    return tweet.strip()

In [127]:
tweets_to_analyse.text = tweets_to_analyse.text.apply(preprocess_tweet)

### Train model on labelled twitter sentiment data
Next steps:
- Download Sentiment140 data
- preprocess tweet data
- Apply word embeddings to the text
- apply fasttext text classification
- Pickle model
- Apply model to buhari tweets text

In [138]:
def import_tweets(filename, header=None, encoding="ISO-8859-1"):
    #import data from csv file via pandas library
    tweet_dataset = pd.read_csv(filename, encoding=encoding, header=header)
    #the column names are based on sentiment140 dataset provided on kaggle
    tweet_dataset.columns = ['sentiment','id','date','flag','user','text']
    #delete 3 columns: flags,id,user, as they are not required for analysis
    for i in ['flag','id','user','date']: del tweet_dataset[i] # or tweet_dataset = tweet_dataset.drop(["id","user","date","user"], axis = 1)
    #in sentiment140 dataset, positive = 4, negative = 0; So we change positive to 1
    tweet_dataset.sentiment = tweet_dataset.sentiment.replace(4,1)
    return tweet_dataset

In [139]:
sentiment140 = import_tweets("./data/training.1600000.processed.noemoticon.csv")

In [143]:
sentiment140.sentiment.value_counts()

1    800000
0    800000
Name: sentiment, dtype: int64

In [145]:
sentiment140.text = sentiment140.text.apply(preprocess_tweet)