In [1]:
import pandas as pd
import numpy as np
import re
import tweepy
import time

import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

from nltk.stem.lancaster import LancasterStemmer
lstem = LancasterStemmer()

import emoji

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold # import KFold

## Load kaggle dataset

https://www.kaggle.com/edqian/twitter-climate-change-sentiment-dataset

In [2]:
# read the Kaggle climate-change sentiment export and remember how many rows it has
tweet_data = pd.read_csv("twitter_sentiment_data.csv")
n = tweet_data.shape[0]

print(n)
tweet_data.head()

43943


Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


## Get additional features and uncorrupted text using tweepy

Many of the tweets in the original dataset appear to have corrupted and unreadable text, so we decided to use the tweet IDs and Tweepy to retrieve the original tweets as well as the following additional features: <br>
 - retweet (T/F)
 - quoted (T/F)
 - quoted text
 - date/time
 - retweet count
 - favorite count
 - hashtags
 - follower count
 - verified (T/F)
 - location
 - geographic coordinates
 - language

In [None]:
# Add your own credentials here (keep them blank in any copy of the notebook you share)
twitter_keys = {'consumer_key':        '',
                'consumer_secret':     '',
                'access_token_key':    '',
                'access_token_secret': ''}

# Setup access to API
auth = tweepy.OAuthHandler(twitter_keys['consumer_key'], twitter_keys['consumer_secret'])
auth.set_access_token(twitter_keys['access_token_key'], twitter_keys['access_token_secret'])

# wait_on_rate_limit=True asks tweepy to wait for the rate-limit window to reset instead of
# raising an error; wait_on_rate_limit_notify only controls whether that wait is announced,
# so it needs wait_on_rate_limit to be switched on as well (tweepy 3.x keyword arguments)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [None]:
# tweepy's api.get_status is limited to 900 calls every 15 minutes
# so we wrote our code to pause before hitting the rate limit

RATE_LIMIT_PAUSE_SECONDS = 15*60 # length of one rate-limit window (15 minutes)

# one placeholder per row: 'n' / -999 mean "could not be retrieved"
text = ['n']*n # n is size of dataframe
is_retweet = ['n']*n
is_quoted = ['n']*n
quoted_text = ['n']*n
date_time = ['n']*n
retweets = [-999]*n
favorites = [-999]*n
hashtags = ['n']*n
followers = [-999]*n
verified = [-999]*n
location = ['n']*n
coordinate = [-999]*n
language = ['n']*n

stop_here = np.arange(850,n,850) # need to stop every 900 calls, stopping every 850 to be safe
pause_points = set(stop_here.tolist()) # set lookup instead of scanning the array on every row

for i in range(n):
    if i in pause_points: # stop before hitting rate limit
        print('Sleeping!') # announce the pause before blocking, not after
        time.sleep(RATE_LIMIT_PAUSE_SECONDS)

    tweet_id = tweet_data['tweetid'].iloc[i]

    try:
        tweet = api.get_status(tweet_id) # create status object, this is 1 API call
    except tweepy.RateLimitError:
        # a rate-limit error does not mean the tweet is gone: wait one window and retry once
        # so the row is not silently recorded as unavailable
        print('Rate limit hit, sleeping!')
        time.sleep(RATE_LIMIT_PAUSE_SECONDS)
        try:
            tweet = api.get_status(tweet_id)
        except tweepy.TweepError:
            continue # still failing, keep the placeholders for this row
    except tweepy.TweepError:
        # some tweets are no longer available, keep the placeholders for this row
        continue

    try:
        retweeted = hasattr(tweet, 'retweeted_status')
        # for a retweet the counts and author details are read from the tweet that was retweeted
        source = tweet.retweeted_status if retweeted else tweet

        text[i] = tweet.text # tweet body
        is_retweet[i] = retweeted # True for a retweet, False otherwise
        date_time[i] = tweet.created_at # time created
        retweets[i] = source.retweet_count # retweet count
        favorites[i] = source.favorite_count # favorite count
        hashtags[i] = source.entities['hashtags'] # hashtags
        followers[i] = source.user.followers_count # follower count
        verified[i] = source.user.verified # verified
        location[i] = source.user.location # location
        coordinate[i] = source.coordinates # coordinates
        language[i] = tweet.lang # language

        if retweeted or not tweet.is_quote_status:
            is_quoted[i] = False # not a quote
            quoted_text[i] = '' # not a quote
        else:
            is_quoted[i] = True # is a quote
            quoted = getattr(tweet, 'quoted_status', None)
            if quoted is not None:
                quoted_text[i] = quoted.text
            # else: the quoted tweet has been deleted, quoted_text keeps its 'n' placeholder

    except AttributeError:
        # best effort: a status missing an expected attribute keeps whatever was filled so far
        print('Incomplete status for tweet', tweet_id)

In [None]:
# attach every list collected from the API to the original dataframe as a new column

collected_features = {
    'text': text,
    'is_retweet': is_retweet,
    'is_quoted': is_quoted,
    'quoted_text': quoted_text,
    'date_time': date_time,
    'retweets': retweets,
    'favorites': favorites,
    'hashtags': hashtags,
    'followers': followers,
    'verified': verified,
    'location': location,
    'coordinate': coordinate,
    'language': language,
}

# dicts keep insertion order, so the columns are appended in the order listed above
for column_name, column_values in collected_features.items():
    tweet_data[column_name] = column_values

## Load augmented kaggle dataset

In [14]:
# the above code takes a while to run
# we each ran the code on one half of the data and saved the results
# here are the two halves combined into one dataset

aug1 = pd.read_csv('augmented_data1.csv')
aug2 = pd.read_csv('augmented_data2.csv')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat stacks the two
# halves the same way and, like append, keeps each half's own index labels
aug_data = pd.concat([aug1, aug2])

print(len(aug1))
print(len(aug2))
print(len(aug_data))
# rows whose text is still the 'n' placeholder are tweets that could not be retrieved
print('Proportion missing tweets: ',round(len(aug_data[aug_data['text']=='n'])/len(aug_data),2))

21972
21971
43943
Proportion missing tweets:  0.29


## Cleaning the features

Around 29% of the tweets were no longer accessible (deleted, account on private, reported, etc.), so we had to decide how to clean the original corrupted text and how to impute the additional features.

### date/time
For date/time we found that the tweet IDs were ordered by date, so we could impute a missing date/time by choosing the point halfway between the nearest known date/times before and after it. 

In [3]:
# 'n' is the placeholder for tweets we could not retrieve; swap it for a real missing value
# first so the string-to-timestamp conversion does not choke on it

raw_date_strings = aug_data['date_time'].replace('n',np.nan)
aug_data['date_time'] = pd.to_datetime(raw_date_strings)

In [4]:
# ordering rows by tweetid appears to order them by date/time as well;
# the leftover 'Unnamed: 0' column is discarded and the index renumbered from 0

aug_data = (
    aug_data
    .sort_values('tweetid')
    .drop(columns=['Unnamed: 0'])
    .reset_index(drop=True)
)

In [5]:
# find in-between date/times

def _impute_midpoint_times(times):
    """Fill missing timestamps from their nearest known neighbours.

    `times` is a datetime Series whose rows are already in chronological order.

    - A gap between two known times is filled with the point halfway between them
      (every row of the same gap gets the same value). The full difference is halved,
      so gaps longer than a day are handled correctly.
    - Missing values after the last known time repeat that last known time.
    - Missing values before the first known time take the first known time.
    - If no time is known at all, the values stay missing (NaT).

    Returns a new Series with the same index as `times`.
    """
    previous_known = times.ffill() # last known time at or before each row
    next_known = times.bfill() # first known time at or after each row
    midpoint = previous_known + (next_known - previous_known) / 2

    # midpoint is NaT wherever one neighbour is missing; fall back to whichever one exists
    filled = midpoint.where(midpoint.notna(), previous_known)
    filled = filled.where(filled.notna(), next_known)

    # rows that already had a time keep it unchanged
    return times.where(times.notna(), filled)

times = aug_data['date_time']
imputed_times = _impute_midpoint_times(times)

aug_data['date_time'] = imputed_times

### dropping data
We removed all duplicate tweets as well as languages not identified as English.

In [6]:
# some entries appear to be duplicates (retweets of the same tweet)
# we do not really need these, the only thing that differs between them is the time of the
# retweet, so we kept the earliest time and removed duplicates
# (rows were sorted by tweetid earlier, so the first row of each group is the earliest one)
# NOTE(review): pandas documents GroupBy.first() as taking the first non-null entry per column,
# so a duplicate group containing missing values may mix fields from different rows -- confirm
# NOTE(review): grouping presumably re-orders rows by message and renumbers the index -- confirm

aug_data = aug_data.groupby("message", as_index=False).first()

In [7]:
# remove languages that we know are not english
# ('n' means the language could not be retrieved, so those rows are kept)

# .copy() makes the filtered frame independent of the one it was sliced from, because the
# following cells write into it in place and would otherwise trigger SettingWithCopyWarning
aug_data = aug_data[aug_data['language'].isin(['en', 'n'])].copy()

### boolean features
If a tweet started with "RT" we filled is_retweet = True. We found that most of the tweets were not quoted, so we filled the missing ones as is_quoted = False. Finally, we assumed that verified Twitter users carried more bias, so we filled in the missing values as is_verified = False. We additionally mapped True = 1 and False = 0 to prepare for model fitting.

In [8]:
# helper function to convert a column of string True False and bool True False into all bool

def str_to_bool(x):
    """Convert a boolean or its 'True'/'False' string spelling to a Python bool.

    Parameters
    ----------
    x : bool, numpy.bool_ or str
        The value to convert.

    Returns
    -------
    bool or None
        True/False for booleans (Python or numpy) and for the exact strings
        'True'/'False'; None for anything else, e.g. a placeholder that was never filled in.
    """
    # isinstance (rather than an exact type comparison) also accepts numpy booleans
    if isinstance(x, (bool, np.bool_)):
        return bool(x)
    if x == 'True':
        return True
    if x == 'False':
        return False
    # unrecognised value: explicit None instead of silently falling off the end
    return None

In [9]:
# rows still holding the 'n' placeholder have no retweet flag;
# infer it from whether the original message starts with "RT"

rows_missing_flag = aug_data.index[aug_data['is_retweet'] == 'n']
for row_label in rows_missing_flag:
    original_message = aug_data.at[row_label, 'message']
    aug_data.at[row_label, 'is_retweet'] = bool(re.search(r'^RT', original_message))

# normalise to real booleans, then multiply by 1 to get 0/1
retweet_flags = aug_data['is_retweet'].apply(str_to_bool)
aug_data['is_retweet'] = retweet_flags*1

In [10]:
# rows still holding the 'n' placeholder have no quote flag; treat them as not quoted

quote_flag_missing = aug_data['is_quoted'] == 'n'
aug_data.loc[quote_flag_missing, 'is_quoted'] = False

# normalise to real booleans, then multiply by 1 to get 0/1
quote_flags = aug_data['is_quoted'].apply(str_to_bool)
aug_data['is_quoted'] = quote_flags*1

In [11]:
# rows still holding the '-999' placeholder have no verified flag; treat them as not verified

verified_flag_missing = aug_data['verified'] == '-999'
aug_data.loc[verified_flag_missing, 'verified'] = False

# normalise to real booleans, then multiply by 1 to get 0/1
verified_flags = aug_data['verified'].apply(str_to_bool)
aug_data['verified'] = verified_flags*1

### natural language processing

We did several things to clean the tweet text:
 - expand acronyms
 - convert emoticons to the textual emotion they convey (ex. :-) to happy)
 - convert emojis to the text
 - remove username, RT, and url
 - lowercase
 - count number of question and exclamation marks
 - remove punctuation
 - remove stop words
 - stem words

In [12]:
# lookup tables for common emoticons and acronyms:
# in both files column '1' holds the text to look for and column '0' its replacement

special_df = pd.read_csv("special.csv")
acronym_df = pd.read_csv("acronyms.csv")

special_dict = dict(zip(special_df['1'], special_df['0']))
acronym_dict = dict(zip(acronym_df['1'], acronym_df['0']))

In [13]:
# cleaning the tweet message
# not sure if separating hashtags out is beneficial so leaving them in as normal words for now

# Build each dictionary pattern once up front instead of once per tweet.
# Keys are escaped in both dictionaries so entries containing regex metacharacters
# are matched literally.
# NOTE(review): replacements run on lowercased text, so dictionary keys containing
# uppercase letters can never match -- confirm the CSV keys are lowercase
special_patterns = [(re.compile(re.escape(key)), value) for key, value in special_dict.items()]
acronym_patterns = [(re.compile(r'\b%s\b' % re.escape(key)), value) for key, value in acronym_dict.items()]

def _clean_tweet(txt):
    """Clean one tweet.

    Returns a tuple (tokens, question_mark_count, exclamation_mark_count) where tokens is
    the list of stemmed words left after removing usernames, a leading RT, urls,
    punctuation and stop words.
    """
    ct = re.sub(r'@[A-Z0-9a-z_:]+','',txt) # remove username
    ct = re.sub(r'^RT','',ct) # remove RT
    ct = re.sub('https?://[A-Za-z0-9./]+','',ct) # remove urls
    ct = emoji.demojize(ct) # change emojis into text
    ct = ct.lower() # make lowercase

    for pattern, value in special_patterns: # emoticons etc -> the text they stand for
        ct = pattern.sub(value, ct)
    for pattern, value in acronym_patterns: # acronyms -> expanded form, whole words only
        ct = pattern.sub(value, ct)

    # count the marks before punctuation is stripped out below
    q_mark = ct.count('?') # number of question marks
    e_mark = ct.count('!') # number of exclamation marks
    ct = re.sub("[']", "",ct) # contractions remove '
    ct = re.sub("[^a-zA-Z_]", " ",ct) # remove punctuation

    token = re.findall(r'\S+', ct) # tokenize
    token = [lstem.stem(word) for word in token if word not in stop_words] # stemming and removing stop words
    return token, q_mark, e_mark

tokens = []
question_marks = []
exclamation_marks = []
for retrieved_text, original_message in zip(aug_data['text'], aug_data['message']):
    # 'n' means the original tweet could not be retrieved, so fall back to the dataset's text;
    # for quoted tweets only the comment is used (not the quoted text)
    txt = original_message if retrieved_text == 'n' else retrieved_text

    token, q_mark, e_mark = _clean_tweet(txt)
    tokens.append(token)
    question_marks.append(q_mark)
    exclamation_marks.append(e_mark)

aug_data['tokens'] = tokens
# each count goes into the column named after it
aug_data['exclamation_mark_count'] = exclamation_marks
aug_data['question_mark_count'] = question_marks

### numerical features

We imputed missing numerical values with the median rather than the mean since there were many extreme values. This is a pretty big simplification, so if it seems as though these features give poor performance we will try other imputation methods such as KNN. 

In [14]:
# retweet, favorite, and follower counts use -999 as the "not retrieved" marker;
# swap it for the median of the values that were retrieved

observed_retweets = aug_data.loc[aug_data['retweets']!=-999, 'retweets']
retweets_med = np.median(observed_retweets)
aug_data['retweets'] = aug_data['retweets'].replace(-999,retweets_med)

observed_favorites = aug_data.loc[aug_data['favorites']!=-999, 'favorites']
favorites_med = np.median(observed_favorites)
aug_data['favorites'] = aug_data['favorites'].replace(-999,favorites_med)

observed_followers = aug_data.loc[aug_data['followers']!=-999, 'followers']
followers_med = np.median(observed_followers)
aug_data['followers'] = aug_data['followers'].replace(-999,followers_med)

### dropping data

We removed empty strings after cleaning as well as duplicates. We also decided to drop class 2 from our analysis. Based on our own judgement, it was ambiguous as to whether many of the class 2 tweets really linked to factual news about climate change. Furthermore, we removed the URLs in the tweets, which largely reduced the meaning of this category.

In [15]:
# keep only the rows whose token list still contains at least one token

has_tokens = aug_data['tokens'].map(len) > 0
aug_data = aug_data[has_tokens]

In [16]:
# clean text column: each row's tokens joined back into one space-separated string

aug_data['clean text'] = [' '.join(token_list) for token_list in aug_data['tokens']]

In [17]:
# cleaning can make different tweets identical, so deduplicate again on the cleaned text;
# rows are ordered by date/time first so the earliest row of each group comes first

ordered_by_time = aug_data.sort_values('date_time')
aug_data = ordered_by_time.groupby('clean text',as_index=False).first()

In [18]:
# removing category 2 from our analysis

not_category_2 = aug_data['sentiment'].ne(2)
aug_data = aug_data[not_category_2]

In [20]:
# helper function to specify whether features were imputed or not

def imputed(x):
    """Return True when x is the 'n' placeholder, otherwise False."""
    return bool(x == 'n')

In [21]:
# flag the rows whose text is still the placeholder, i.e. whose features had to be imputed

aug_data = aug_data.assign(imputed=aug_data['text'].apply(imputed))

In [22]:
# the row with index label 12652 has a formatting issue, so it is removed

aug_data = aug_data.drop(index=12652)

### saving cleaned data

In [23]:
# columns kept in the cleaned dataset, in output order
export_columns = [
    'clean text',
    'tweetid',
    'is_retweet',
    'is_quoted',
    'date_time',
    'retweets',
    'favorites',
    'followers',
    'verified',
    'location',
    'exclamation_mark_count',
    'question_mark_count',
    'imputed',
    'sentiment',
]

clean_data = aug_data[export_columns]

In [24]:
# store the tweet ids as strings rather than integers
clean_data = clean_data.assign(tweetid=clean_data['tweetid'].astype(str))

In [25]:
# display the dtype of every column in clean_data
clean_data.dtypes

clean text                        object
tweetid                           object
is_retweet                         int64
is_quoted                          int64
date_time                 datetime64[ns]
retweets                           int64
favorites                          int64
followers                          int64
verified                           int64
location                          object
exclamation_mark_count             int64
question_mark_count                int64
imputed                             bool
sentiment                          int64
dtype: object

In [27]:
# clean_data.to_csv('clean_data.csv')