In [1]:
# ! pip install flair
# ! pip install emoji

In [1]:
import pandas as pd

import re
import string
import emoji

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('average_perception_tagger')
# nltk.download('wordnet')
# nltk.download('words') # if its needed

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English 
nlp = English()

from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import numpy as np

In [2]:
tweet_data = pd.read_csv('data.csv')
tweet_data.head()

Unnamed: 0,tweet_id,created_at,tweet,like_count,quote_count,reply_count,retweet_count
0,1551734038204923904,2022-07-26 00:59:59+00:00,$2.7 billion for climate change (slashing carb...,15,1,0,6
1,1551734021591269377,2022-07-26 00:59:55+00:00,@nathaliejacoby1 Climate change. The rise in t...,2,0,0,0
2,1551734013815029761,2022-07-26 00:59:53+00:00,@JacobsVegasLife @LasVegasLocally This is a ch...,8,0,1,0
3,1551733993740980224,2022-07-26 00:59:48+00:00,Climate Change and Energy Minister Chris Bowen...,18,0,8,5
4,1551733979316887554,2022-07-26 00:59:45+00:00,"@Thebs15800518 At 5:30, @SecGranHolm tries to ...",0,0,0,0


In [3]:
tweet_data = tweet_data.drop_duplicates()
#remove any empty tweets
# tweet_data = tweet_data[tweet_data['text'] != '']
tweet_data.shape

(146069, 7)

In [4]:
tweet_data = tweet_data[tweet_data['tweet']!='tweet']

In [5]:
# tweet_data = tweet_data.rename(columns={'text':'tweet'})

### compare tokenisers

In [5]:
nltk.word_tokenize(tweet_data.tweet[0])

['$',
 '2.7',
 'billion',
 'for',
 'climate',
 'change',
 '(',
 'slashing',
 'carbon',
 'emissions',
 ')',
 ';',
 '$',
 '37',
 'billion',
 'for',
 'cops',
 '.',
 'Tell',
 'me',
 'again',
 'why',
 'republicans',
 'aren',
 '’',
 't',
 'in',
 'love',
 'with',
 'Joe',
 'Biden',
 '?',
 'I',
 'mean',
 ',',
 'at',
 'least',
 'when',
 'it',
 'comes',
 'to',
 'this',
 '?']

In [6]:
tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(tweet_data.tweet[0])

for token in tokens:
    print(token)

$2.7
billion
for
climate
change
(slashing
carbon
emissions);
$37
billion
for
cops.
Tell
me
again
why
republicans
aren’t
in
love
with
Joe
Biden?
I
mean,
at
least
when
it
comes
to
this?


### Clean the data

In [7]:
df_clean = tweet_data.copy()

# make all tweet text lower case 
df_clean['clean'] = df_clean.tweet.str.lower()

# remove any links from tweets 
df_clean.clean = df_clean.clean.apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))

# remove mentions of other twitter users
df_clean.clean = df_clean.clean.apply(lambda x: re.sub(r'@\S+', '', x))

# remove any html character reference 
df_clean.clean = df_clean.clean.apply(lambda x: re.sub(r'&\S+;', '', x))

# remove new line 
df_clean.clean = df_clean.clean.apply(lambda x: re.sub(r'\n', '', x))

# remove new line 
df_clean.clean = df_clean.clean.apply(lambda x: re.sub("\u2026", "", x))

# remove any emojis
df_clean.clean = df_clean.clean.apply(lambda x: emoji.replace_emoji(x, replace=''))

In [8]:
# extract hashtags from tweet before removing punctuation 
df_clean['hashtags2'] = df_clean.clean.apply(lambda x: [word for word in x.split(' ') if word.startswith('#')])

In [9]:
# removes most punctuation
df_clean.clean = df_clean.clean.apply(lambda x: re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', x))

# remove white space 
df_clean.clean = df_clean.clean.apply(lambda x: x.strip())
df_clean.clean = df_clean.clean.apply(lambda x: ' '.join(x.split()))

In [10]:
df_clean

Unnamed: 0,tweet_id,created_at,tweet,like_count,quote_count,reply_count,retweet_count,clean,hashtags2
0,1551734038204923904,2022-07-26 00:59:59+00:00,$2.7 billion for climate change (slashing carb...,15,1,0,6,2 7 billion for climate change slashing carbon...,[]
1,1551734021591269377,2022-07-26 00:59:55+00:00,@nathaliejacoby1 Climate change. The rise in t...,2,0,0,0,climate change the rise in temperature will be...,[]
2,1551734013815029761,2022-07-26 00:59:53+00:00,@JacobsVegasLife @LasVegasLocally This is a ch...,8,0,1,0,this is a chilling podcast about what could ha...,[]
3,1551733993740980224,2022-07-26 00:59:48+00:00,Climate Change and Energy Minister Chris Bowen...,18,0,8,5,climate change and energy minister chris bowen...,[]
4,1551733979316887554,2022-07-26 00:59:45+00:00,"@Thebs15800518 At 5:30, @SecGranHolm tries to ...",0,0,0,0,at 5 30 tries to hide the fact that biden bega...,"[#biden, #oil, #buildbackbetter]"
...,...,...,...,...,...,...,...,...,...
167717,1554044266657288192,2022-08-01 10:00:00+00:00,#JCiTTweets Research from #Austria on \ndistin...,0,0,0,0,jcittweets research from austria on distinct p...,"[#jcittweets, #austria, #skiing, #seasonality,..."
167725,1554044097794813952,2022-08-01 09:59:20+00:00,@ChrisPenknz What do you actually do? What's t...,12,0,0,0,what do you actually do what s the national pa...,[]
167729,1554044032938041345,2022-08-01 09:59:04+00:00,Elite greens keep deploying different versions...,136,0,11,87,elite greens keep deploying different versions...,[]
167792,1554042860424626176,2022-08-01 09:54:25+00:00,"in 1997, a similar tragedy befell the city of ...",11,0,1,2,in 1997 a similar tragedy befell the city of m...,[]


Compare tokenisers now data is clean

In [11]:
nltk.word_tokenize(df_clean['clean'][0])

['2',
 '7',
 'billion',
 'for',
 'climate',
 'change',
 'slashing',
 'carbon',
 'emissions',
 '37',
 'billion',
 'for',
 'cops',
 'tell',
 'me',
 'again',
 'why',
 'republicans',
 'aren',
 '’',
 't',
 'in',
 'love',
 'with',
 'joe',
 'biden',
 'i',
 'mean',
 'at',
 'least',
 'when',
 'it',
 'comes',
 'to',
 'this']

In [12]:
tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(df_clean['clean'][0])

for token in tokens:
    print(token)

2
7
billion
for
climate
change
slashing
carbon
emissions
37
billion
for
cops
tell
me
again
why
republicans
aren’t
in
love
with
joe
biden
i
mean
at
least
when
it
comes
to
this


In [None]:
from gensim.models.phrases import Phrases, Phraser

sent = [row.split() for row in df_clean['clean_spacy']]
phrases = Phrases(sent, min_count=1000, progress_per=100)
sentences = Phraser(phrases)[sent]

word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

sorted(word_freq, key=word_freq.get, reverse=True)[:30]

### Remove stop words and lemmatize

In [13]:
df_clean.clean.head()

0    2 7 billion for climate change slashing carbon...
1    climate change the rise in temperature will be...
2    this is a chilling podcast about what could ha...
3    climate change and energy minister chris bowen...
4    at 5 30 tries to hide the fact that biden bega...
Name: clean, dtype: object

The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form.
Lemmatisation is closely related to stemming. The difference is that a stemmer operates on a single word without knowledge of the context, and therefore cannot discriminate between words which have different meanings depending on part of speech. However, stemmers are typically easier to implement and run faster, and the reduced accuracy may not matter for some applications.

In [14]:
# %%timeit
df_clean['clean_nltk'] = df_clean.clean.apply(lambda x: ' '.join([word for word in x.split(' ') if word not in stopwords.words('english')]))

lemmatizer = WordNetLemmatizer()
df_clean.clean_nltk = df_clean.clean_nltk.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(x)]))

df_clean.clean_nltk.head()

0    2 7 billion climate change slashing carbon emi...
1    climate change rise temperature bad enough sec...
2    chilling podcast could happen salt lake city g...
3    climate change energy minister chris bowen hit...
4    5 30 try hide fact biden began signing legisla...
Name: clean_nltk, dtype: object

In [15]:
# %%timeit
nlp= spacy.load('en_core_web_sm')
df_clean['clean_spacy'] = df_clean.clean.apply(lambda x: ' '.join([token.lemma_ for token in nlp(x) if not token.is_stop]))
df_clean.clean_spacy.head()

0    2 7 billion climate change slash carbon emissi...
1    climate change rise temperature bad secondary ...
2    chill podcast happen salt lake city great salt...
3    climate change energy minister chris bowen hit...
4    5 30 try hide fact biden begin sign legislatio...
Name: clean_spacy, dtype: object

In [17]:
# lemmatizer = WordNetLemmatizer()
# df_clean.clean = df_clean.clean.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split(' ')]))
# df_clean.clean.head()

### Frequently used words

In [None]:
# removes most punctuation
# df_clean.clean = df_clean.clean.apply(lambda x: re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', x))

In [16]:
word_freq = defaultdict(int)
for sent in df_clean.clean_nltk.apply(lambda x: re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', x)):
    sent = sent.split(' ')
    for i in sent:
        word_freq[i] += 1
        
for word in sorted(word_freq, key=word_freq.get, reverse=True)[:20]:
    print(word, word_freq[word])

climate 142122
change 133902
’ 37963
s 16639
climatechange 13422
people 13016
u 11522
t 11043
like 9848
year 9378
world 8723
“ 7881
need 7747
” 7686
one 7557
global 7471
would 6800
get 6703
time 6688
it 6599


In [17]:
word_freq = defaultdict(int)
for sent in df_clean.clean_spacy.apply(lambda x: re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', x)):
    sent = sent.split(' ')
    for i in sent:
        word_freq[i] += 1
        
for word in sorted(word_freq, key=word_freq.get, reverse=True)[:20]:
    print(word, word_freq[word])

climate 142117
change 136112
 40259
s 25260
t 15320
climatechange 13437
people 13014
like 9911
year 9385
world 8723
need 8470
go 7591
global 7472
think 7451
cause 7317
know 6833
time 6512
fight 6190
bill 6134
real 6015


### Sentiment model using flair 

In [18]:
from flair.models import TextClassifier
from flair.data import Sentence
sia = TextClassifier.load('en-sentiment')

2022-08-03 14:56:51,978 loading file /Users/hannahbrown/.flair/models/sentiment-en-mix-distillbert_4.pt


In [19]:
def flair_prediction(x):
    sentence = Sentence(x)
    sia.predict(sentence)
    score = sentence.labels[0]
    if "POSITIVE" in str(score):
        return "pos"
    elif "NEGATIVE" in str(score):
        return "neg"
    else:
        return "neu"


In [20]:
df_clean = df_clean[(df_clean['clean_nltk']!='') & (df_clean['clean_spacy']!='')]

In [22]:
sent_spacy = df_clean["clean_spacy"].apply(flair_prediction)

In [None]:
sent_nltk = df_clean["clean_nltk"].apply(flair_prediction)

In [None]:
# for i, tweet in enumerate(df_clean['tweet'][:100][sent=='pos']): 
#     print(i, tweet)

In [None]:
df_clean['sentiment_spacy'] = sent_spacy
df_clean['sentiment_nltk'] = sent_nltk

In [None]:
df_clean['created_at'] = pd.to_datetime(df_clean['created_at'], format="%Y-%m-%d %H:%M:%S")
df_clean = df_clean.set_index('created_at')

In [None]:
width = 0.3
pos = df_clean[df_clean['sentiment_spacy']=='pos'].resample('D').agg('count')['tweet']
neg = df_clean[df_clean['sentiment_spacy']=='neg'].resample('D').agg('count')['tweet']
plt.bar(np.arange(len(neg)), neg, width=width,label='negative sentiment')
plt.bar(np.arange(len(pos))+width, pos, width=width, label='positive sentiment')
plt.legend()
plt.gcf().set_size_inches(20,5)

In [None]:
width = 0.3
pos = df_clean[df_clean['sentiment_nltk']=='pos'].resample('D').agg('count')['tweet']
neg = df_clean[df_clean['sentiment_nltk']=='neg'].resample('D').agg('count')['tweet']
plt.bar(np.arange(len(neg)), neg, width=width,label='negative sentiment')
plt.bar(np.arange(len(pos))+width, pos, width=width, label='positive sentiment')
plt.legend()
plt.gcf().set_size_inches(20,5)

In [None]:
different_sent = df_clean[df_clean['sentiment_spacy']!=df_clean['sentiment_nltk']]

In [None]:
# tweets that where identified as positive after spaCy preprocessing and negative after nltk preprocessing
for tweet in different_sent[different_sent['sentiment_spacy']=='pos'].tweet[:10]:
    print(tweet)

In [None]:
for tweet in different_sent[different_sent['sentiment_spacy']=='neg'].tweet[:10]:
    print(tweet)

In [33]:
# what are the top 5 most used hashtags on each day? Do these say much about what was happening on those days? 

In [34]:
# df_clean[df_clean['sentiment']=='neg'].resample('D').agg('count')['tweet']

In [35]:
# all_hashtags = sum([tag.split('#') for tag in df_clean[df_clean['sentiment']=='pos']['hashtags2'].sum()], [])
# all_hashtags = [tag for tag in all_hashtags if tag]  # removes empty strings from splitting hashtags

# top_hashtags = Counter(all_hashtags).most_common(10)
# top_tags = [x[0] for x in top_hashtags]
# top_tags_freq = [x[1] for x in top_hashtags]

# fig, ax = plt.subplots(figsize = (12,12))
# y_pos = np.arange(len(top_tags))
# ax.barh(y_pos ,list(top_tags_freq)[::-1], align='center', color='steelblue', edgecolor='black', linewidth=1)
# ax.set_yticks(y_pos)
# ax.set_yticklabels(list(top_tags)[::-1])
# ax.set_xlabel("Number of appearances")
# ax.set_title("Most used #hashtags in tweets with a positive sentiment", fontsize = 20)
# plt.tight_layout(pad=3)

In [36]:
# all_hashtags = sum([tag.split('#') for tag in df_clean[df_clean['sentiment']=='neg']['hashtags2'].sum()], [])
# all_hashtags = [tag for tag in all_hashtags if tag]  # removes empty strings from splitting hashtags

# top_hashtags = Counter(all_hashtags).most_common(10)
# top_tags = [x[0] for x in top_hashtags]
# top_tags_freq = [x[1] for x in top_hashtags]

# fig, ax = plt.subplots(figsize = (12,12))
# y_pos = np.arange(len(top_tags))
# ax.barh(y_pos ,list(top_tags_freq)[::-1], align='center', color='steelblue', edgecolor='black', linewidth=1)
# ax.set_yticks(y_pos)
# ax.set_yticklabels(list(top_tags)[::-1])
# ax.set_xlabel("Number of appereances")
# ax.set_title("Most used #hashtags in tweets with a negative sentiment ", fontsize = 20)
# plt.tight_layout(pad=3)