In [83]:
# Import packages

import numpy as np
import pandas as pd
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk import RegexpParser
import re

In [84]:
# Reading and subsetting the data

df = pd.read_excel('C:/Users/User/Desktop/Python/Projects/Labour_Party_Conf_Tweets/MASTERLBRPTY.xlsx')

twt = pd.DataFrame(df["tweet"])

**What is a tokenizer? Just a way
to group language e.g words, sentences, paras**

**What is a corpora - body of text. In this case, tweets**

**lexicon - words and their means
Defining word and sentence tokenizer**

**a stopword is a word that is a filler / doesn't carry
much meaning on its own**

# # # Preprocessing

In [85]:
# *Preprocessing*

# Tokenize tweet rows

twt['word_token'] = twt['tweet'].apply(nltk.word_tokenize)

twt['sentence_token'] = twt['tweet'].apply(nltk.sent_tokenize)

# # Defining stopwords

stop = set(stopwords.words("english"))

twt['word_token_proc'] = twt['word_token'].apply(
    lambda x: [item for item in x if item not in stop])

twt.head(5)

Unnamed: 0,tweet,word_token,sentence_token,word_token_proc
0,"Looking at the escalating chaos, it is hard to...","[Looking, at, the, escalating, chaos, ,, it, i...","[Looking at the escalating chaos, it is hard t...","[Looking, escalating, chaos, ,, hard, disagree..."
1,Starmer faces wave of anger over Labour confer...,"[Starmer, faces, wave, of, anger, over, Labour...",[Starmer faces wave of anger over Labour confe...,"[Starmer, faces, wave, anger, Labour, conferen..."
2,More ominous for the Labour Party,"[More, ominous, for, the, Labour, Party]",[More ominous for the Labour Party],"[More, ominous, Labour, Party]"
3,Labour Party conference should be in Cardiff o...,"[Labour, Party, conference, should, be, in, Ca...",[Labour Party conference should be in Cardiff ...,"[Labour, Party, conference, Cardiff, Birmingha..."
4,@julian97681683 @FigerRoll Ah the typical snee...,"[@, julian97681683, @, FigerRoll, Ah, the, typ...",[@julian97681683 @FigerRoll Ah the typical sne...,"[@, julian97681683, @, FigerRoll, Ah, typical,..."


In [86]:
# Stemming (reducing words to their stems)

pstem = PorterStemmer()

twt['word_stems_nostop'] = twt['word_token_proc'].apply(lambda x : [pstem.stem(y) for y in x])

twt.head(5)

Unnamed: 0,tweet,word_token,sentence_token,word_token_proc,word_stems_nostop
0,"Looking at the escalating chaos, it is hard to...","[Looking, at, the, escalating, chaos, ,, it, i...","[Looking at the escalating chaos, it is hard t...","[Looking, escalating, chaos, ,, hard, disagree...","[look, escal, chao, ,, hard, disagre, deputi, ..."
1,Starmer faces wave of anger over Labour confer...,"[Starmer, faces, wave, of, anger, over, Labour...",[Starmer faces wave of anger over Labour confe...,"[Starmer, faces, wave, anger, Labour, conferen...","[starmer, face, wave, anger, labour, confer, c..."
2,More ominous for the Labour Party,"[More, ominous, for, the, Labour, Party]",[More ominous for the Labour Party],"[More, ominous, Labour, Party]","[more, omin, labour, parti]"
3,Labour Party conference should be in Cardiff o...,"[Labour, Party, conference, should, be, in, Ca...",[Labour Party conference should be in Cardiff ...,"[Labour, Party, conference, Cardiff, Birmingha...","[labour, parti, confer, cardiff, birmingham, i..."
4,@julian97681683 @FigerRoll Ah the typical snee...,"[@, julian97681683, @, FigerRoll, Ah, the, typ...",[@julian97681683 @FigerRoll Ah the typical sne...,"[@, julian97681683, @, FigerRoll, Ah, typical,...","[@, julian97681683, @, figerrol, ah, typic, sn..."


In [87]:
# Speech tagging / POS tagging

tagged_text = nltk.pos_tag_sents(twt['word_token'])
twt['POS_Tag'] = tagged_texts

twt.head(3)


Unnamed: 0,tweet,word_token,sentence_token,word_token_proc,word_stems_nostop,POS_Tag
0,"Looking at the escalating chaos, it is hard to...","[Looking, at, the, escalating, chaos, ,, it, i...","[Looking at the escalating chaos, it is hard t...","[Looking, escalating, chaos, ,, hard, disagree...","[look, escal, chao, ,, hard, disagre, deputi, ...","[(Looking, VBG), (at, IN), (the, DT), (escalat..."
1,Starmer faces wave of anger over Labour confer...,"[Starmer, faces, wave, of, anger, over, Labour...",[Starmer faces wave of anger over Labour confe...,"[Starmer, faces, wave, anger, Labour, conferen...","[starmer, face, wave, anger, labour, confer, c...","[(Starmer, NNP), (faces, VBZ), (wave, NN), (of..."
2,More ominous for the Labour Party,"[More, ominous, for, the, Labour, Party]",[More ominous for the Labour Party],"[More, ominous, Labour, Party]","[more, omin, labour, parti]","[(More, RBR), (ominous, JJ), (for, IN), (the, ..."


# POS TAGGING DICTIONARY


Abbreviation 	Meaning
CC 	coordinating conjunction
CD 	cardinal digit
DT 	determiner
EX 	existential there
FW 	foreign word
IN 	preposition/subordinating conjunction
JJ 	This NLTK POS Tag is an adjective (large)
JJR 	adjective, comparative (larger)
JJS 	adjective, superlative (largest)
LS 	list market
MD 	modal (could, will)
NN 	noun, singular (cat, tree)
NNS 	noun plural (desks)
NNP 	proper noun, singular (sarah)
NNPS 	proper noun, plural (indians or americans)
PDT 	predeterminer (all, both, half)
POS 	possessive ending (parent\ ‘s)
PRP 	personal pronoun (hers, herself, him,himself)
PRP$ 	possessive pronoun (her, his, mine, my, our )
RB 	adverb (occasionally, swiftly)
RBR 	adverb, comparative (greater)
RBS 	adverb, superlative (biggest)
RP 	particle (about)
TO 	infinite marker (to)
UH 	interjection (goodbye)
VB 	verb (ask)
VBG 	verb gerund (judging)
VBD 	verb past tense (pleaded)
VBN 	verb past participle (reunified)
VBP 	verb, present tense not 3rd person singular(wrap)
VBZ 	verb, present tense with 3rd person singular (bases)
WDT 	wh-determiner (that, what)
WP 	wh- pronoun (who)
WRB 	wh- adverb (how) 

In [92]:
# Chunking using regular expression rules

chunkrule = r"""{chunkrule: {<NN?>*<VB?>*} """ # Regular expression meaning group nouns and verbs if >1 together

chunkparser = nltk.RegexpParser(chunkrule)

twt['Chunked_POSTag'] = twt['POS_Tag'].apply(chunkparser.parse)
twt.head(5)


Unnamed: 0,tweet,word_token,sentence_token,word_token_proc,word_stems_nostop,POS_Tag,Chunked_POSTag
0,"Looking at the escalating chaos, it is hard to...","[Looking, at, the, escalating, chaos, ,, it, i...","[Looking at the escalating chaos, it is hard t...","[Looking, escalating, chaos, ,, hard, disagree...","[look, escal, chao, ,, hard, disagre, deputi, ...","[(Looking, VBG), (at, IN), (the, DT), (escalat...","[(Looking, VBG), (at, IN), (the, DT), (escalat..."
1,Starmer faces wave of anger over Labour confer...,"[Starmer, faces, wave, of, anger, over, Labour...",[Starmer faces wave of anger over Labour confe...,"[Starmer, faces, wave, anger, Labour, conferen...","[starmer, face, wave, anger, labour, confer, c...","[(Starmer, NNP), (faces, VBZ), (wave, NN), (of...","[(Starmer, NNP), (faces, VBZ), [(wave, NN)], (..."
2,More ominous for the Labour Party,"[More, ominous, for, the, Labour, Party]",[More ominous for the Labour Party],"[More, ominous, Labour, Party]","[more, omin, labour, parti]","[(More, RBR), (ominous, JJ), (for, IN), (the, ...","[(More, RBR), (ominous, JJ), (for, IN), (the, ..."
3,Labour Party conference should be in Cardiff o...,"[Labour, Party, conference, should, be, in, Ca...",[Labour Party conference should be in Cardiff ...,"[Labour, Party, conference, Cardiff, Birmingha...","[labour, parti, confer, cardiff, birmingham, i...","[(Labour, NNP), (Party, NNP), (conference, NN)...","[(Labour, NNP), (Party, NNP), [(conference, NN..."
4,@julian97681683 @FigerRoll Ah the typical snee...,"[@, julian97681683, @, FigerRoll, Ah, the, typ...",[@julian97681683 @FigerRoll Ah the typical sne...,"[@, julian97681683, @, FigerRoll, Ah, typical,...","[@, julian97681683, @, figerrol, ah, typic, sn...","[(@, JJ), (julian97681683, NN), (@, NNP), (Fig...","[(@, JJ), [(julian97681683, NN)], (@, NNP), (F..."
