<a href="https://colab.research.google.com/github/james-hughes1/wdss-nlp-project/blob/main/NLP_Research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the Data

In [None]:
# Import libraries

import pandas as pd
import re
import datetime

In [None]:
# Add the .csv file to the workspace or this won't work

# Load at most the first 900,000 rows of the scrape, keeping only the listed
# columns. "Date" is converted to datetimes through `parse_dates` alone: the
# former `date_parser=pd.to_datetime` argument is deprecated in pandas 2.x and
# only repeated the conversion that `parse_dates` already performs.
tweets_df = pd.read_csv(
    'Twitter Scrape 2.csv',
    usecols=["ID", "Date", "Likes", "Replies", "Retweets", "User ID",
             "Username", "Followers", "Verified", "Content"],
    nrows=900000,
    parse_dates=["Date"],
    encoding="UTF-8",
)

In [None]:
# Display the loaded frame (the notebook renders the cell's last expression).
tweets_df

Unnamed: 0,ID,Date,Likes,Replies,Retweets,User ID,Username,Followers,Verified,Content
0,1448438445546770432,2021-10-14 00:00:10+00:00,1,0,0,760449739,CBE_MIIS,1545,0,Bold #climate decisions from Congress in thes...
1,1448438526098481157,2021-10-14 00:00:29+00:00,1,0,2,1288369706961973250,BetterFuturesAU,839,0,"In the lead-up to #COP26, the #BetterFuturesAU..."
2,1448438536332627971,2021-10-14 00:00:32+00:00,0,0,0,321986074,EINGazpromNews,1357,0,COP26 Warning: World's Listed Companies to Cau...
3,1448438610290782211,2021-10-14 00:00:50+00:00,4,0,2,18781150,FreshEnergy,10064,0,What role does Minnesota play at the UN Climat...
4,1448438763022082050,2021-10-14 00:01:26+00:00,0,0,0,55635996,Scotland4me,6308,0,Media release: Rail and road freight industrie...
...,...,...,...,...,...,...,...,...,...,...
899995,1459930326499741700,2021-11-14 17:04:48+00:00,0,0,0,3048999762,Scilla_Seventy,335,0,#COP26 failed
899996,1459930359727075333,2021-11-14 17:04:56+00:00,0,0,0,1159445850730455045,EcologyAfricaF1,1133,0,Greta Thunberg dismisses COP26 pact: 'The real...
899997,1459930376407855106,2021-11-14 17:05:00+00:00,1,1,0,50132801,padmorious,7813,0,What bollocks will #JohnsonTheCorruptPM feed u...
899998,1459930382468538369,2021-11-14 17:05:02+00:00,1,0,1,19870225,LindsayGrahamUK,8409,0,#COP26 in 2 minutes https://t.co/rBKpIS3h3T


In [28]:
# Preview the first five entries of the last column of the frame.
tweets_df.iloc[:, -1].head(5)

0    Bold #climate decisions from Congress  in thes...
1    In the lead-up to #COP26, the #BetterFuturesAU...
3    What role does Minnesota play at the UN Climat...
4    Media release: Rail and road freight industrie...
Name: Content, dtype: object

In [None]:
# Use regex to clean tweets.

# Upper bound on how many tweets (taken from the top of tweets_df) are cleaned.
n_tweets_to_clean = 100000

# Patterns are compiled once and written as raw strings, so `\S` and `\s` are
# passed to the regex engine rather than parsed as (invalid) string escapes.
url_or_newline_re = re.compile(r'\n|http\S+')
punctuation_re = re.compile(r'\s*[.,:;-]\s*')
disallowed_char_re = re.compile(r'[^a-z0-9 @#%$£]')


def clean_tweet(tweet_raw):
  """Return a lower-cased, simplified copy of a single tweet.

  Steps, in order:
    1. lower-case and strip surrounding whitespace;
    2. delete newlines and anything starting with 'http' up to the next
       whitespace;
    3. replace the literal text '&amp;' with 'and';
    4. collapse each of . , : ; - (plus adjacent whitespace) to one space;
    5. delete every character other than a-z, 0-9, space and @ # % $ £.

  Args:
    tweet_raw: the tweet text. Any non-string value (e.g. the NaN pandas uses
      for a missing cell) is treated as an empty tweet.

  Returns:
    The cleaned text as a str ('' for non-string input).
  """
  if not isinstance(tweet_raw, str):
    # Keep one output per input row instead of failing on `.lower()`.
    return ''
  tweet = url_or_newline_re.sub('', tweet_raw.lower().strip())
  tweet = tweet.replace('&amp;', 'and')
  tweet = punctuation_re.sub(' ', tweet)
  return disallowed_char_re.sub('', tweet)


# Slicing (rather than indexing row by row up to a fixed count) cannot raise
# an IndexError when the frame holds fewer than n_tweets_to_clean rows.
tweets_clean = [clean_tweet(tweet_raw)
                for tweet_raw in tweets_df.iloc[:n_tweets_to_clean, -1]]

In [29]:
# Preview the first five cleaned tweets.
tweets_clean[:5]

['bold #climate decisions from congress in these coming weeks can impact the lives of americans and put the u s in a stronger position to support the goals of the #parisagreement #cop26 #climateaction #climatecrisis',
 'in the lead up to #cop26 the #betterfuturesau local government sector working group is hosting a session on 18 october to discuss what cop26 means for local governments and how aus councils can get involved register for the event here #innovate4cities ',
 'what role does minnesota play at the un climate change conference cop26 what is the u s bringing to the table at the global climate summit get answers to these questions and more next week at our behind the scenes pre cop26 webinar see you there ',
 'media release rail and road freight industries pull together for cop26 trade event series of images available  contact alice@grm agencyan event raising awareness about the transition to net zero in the rail and road freight industries has just bee']

In [30]:
# Import libraries

import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

## Frequency analysis

In [None]:
# Preprocessing hyperparameters.
# vocab_size is passed to the tokenizer as `num_words`.
# NOTE(review): max_length is not used in the cells visible here; presumably
# it is the padded sequence length used later on - confirm.

vocab_size = 20000
max_length = 24

# Fit a tokenizer on the cleaned tweets. `filters=''` switches off the
# tokenizer's own character filtering, so tokens such as '#cop26' keep
# their '#'.

tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token="<OOV>",
    filters='',
)
tokenizer.fit_on_texts(tweets_clean)

# Convert every cleaned tweet into its sequence of integer token ids.

tweet_sequences = tokenizer.texts_to_sequences(tweets_clean)

In [45]:
# Inspect the (word, count) pairs collected by the fitted tokenizer.
dict(tokenizer.word_counts).items()



In [44]:
# List every word seen by the tokenizer, each wrapped in a 1-tuple.
[(word,) for word in dict(tokenizer.word_counts)]

[('bold',),
 ('#climate',),
 ('decisions',),
 ('from',),
 ('congress',),
 ('in',),
 ('these',),
 ('coming',),
 ('weeks',),
 ('can',),
 ('impact',),
 ('the',),
 ('lives',),
 ('of',),
 ('americans',),
 ('and',),
 ('put',),
 ('u',),
 ('s',),
 ('a',),
 ('stronger',),
 ('position',),
 ('to',),
 ('support',),
 ('goals',),
 ('#parisagreement',),
 ('#cop26',),
 ('#climateaction',),
 ('#climatecrisis',),
 ('lead',),
 ('up',),
 ('#betterfuturesau',),
 ('local',),
 ('government',),
 ('sector',),
 ('working',),
 ('group',),
 ('is',),
 ('hosting',),
 ('session',),
 ('on',),
 ('18',),
 ('october',),
 ('discuss',),
 ('what',),
 ('cop26',),
 ('means',),
 ('for',),
 ('governments',),
 ('how',),
 ('aus',),
 ('councils',),
 ('get',),
 ('involved',),
 ('register',),
 ('event',),
 ('here',),
 ('#innovate4cities',),
 ('worlds',),
 ('listed',),
 ('companies',),
 ('cause',),
 ('temperature',),
 ('rise',),
 ('3anddeg',),
 ('c',),
 ('role',),
 ('does',),
 ('minnesota',),
 ('play',),
 ('at',),
 ('un',),
 ('clima

In [55]:
# Tabulate the tokenizer's word counts, one row per word ...
word_counts_df = pd.DataFrame(data=dict(tokenizer.word_counts).items(),
                              columns=["Word", "Count"])
# ... then order the rows from most to least frequent.
freq_analysis_df = word_counts_df.sort_values(by="Count", ascending=False)

In [58]:
# Show the first 50 rows of the frequency table.
freq_analysis_df.head(50)

Unnamed: 0,Word,Count
11,the,111095
22,to,92990
15,and,62310
13,of,51878
26,#cop26,45634
19,a,39815
5,in,39443
47,for,33646
45,cop26,33594
37,is,32817
