# Natural Language Processing - Language Use

A notebook looking at word use differences between political parties.

Going to compare the word frequencies of the two main political party's tweets.

In [93]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import string
import re
import pickle

In [94]:
df = pd.read_csv('../data/processed_tweets.csv')

In [95]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [96]:
df.head()

Unnamed: 0,user_name,constituency,party,gender,tweet_id,permalink,text,date,retweets,favourites,replies,party_new
0,skinnock,Aberavon,Labour,Male,947017416047058944,https://twitter.com/SKinnock/status/9470174160...,Devastating resignation letter from Andrew Ado...,2017-12-30 08:12:01,17,43,11,Labour
1,skinnock,Aberavon,Labour,Male,944285195372548097,https://twitter.com/SKinnock/status/9442851953...,The gov need to act to allow more flexibility ...,2017-12-22 19:15:09,2,5,2,Labour
2,skinnock,Aberavon,Labour,Male,943841995390423040,https://twitter.com/SKinnock/status/9438419953...,Here are the fabled sector analyses. Damp squi...,2017-12-21 13:54:02,21,29,8,Labour
3,skinnock,Aberavon,Labour,Male,943595206225559552,https://twitter.com/SKinnock/status/9435952062...,"Fallon, Patel and now Green. Three strikes and...",2017-12-20 21:33:22,7,31,5,Labour
4,skinnock,Aberavon,Labour,Male,943552279189258245,https://twitter.com/SKinnock/status/9435522791...,Waiting to hear from @DavidGauke re my concern...,2017-12-20 18:42:48,3,1,1,Labour


In [97]:
df[['text']].isna()['text'].value_counts()

False    2716793
True        1210
Name: text, dtype: int64

In [98]:
# Remove null text tweets

df = df[pd.notna(df['text']) == True]

In [99]:
#Process text - this takes a while

stop_words = stopwords.words('english')

stop_words = stop_words

punc = list(string.punctuation) + ['...','``',"''","'m", "'s",'rt',"'ve", 'it', 'i']+stop_words

def process_text(row, stem=False):
    """
    Remove URLs, mentions, hashtags
    
    Remove stop words
    
    Remove punctuation
    
    """
    string = row['text'] 
    
    
    string = re.sub(r'(http|pic.).*$', '', string)
    string = re.sub(r'@\S*\b', '', string)
    string = re.sub(r'#', '', string)       

    string = string.lower()

    words = string.split()
    
    words = [word for word in words if word not in stop_words]
    
    string = ' '.join(words)
    
    words = word_tokenize(string)
    
    words = [word for word in words if word not in punc]
    
    if stem == True:
        
        words = [SnowballStemmer("english").stem(word) for word in words]
    
    return ' '.join(words)
    

In [100]:
test = {'text': "The quick brown fox doesn't jump over the lazy dogs! dog's" }

In [101]:
process_text(test)

'quick brown fox jump lazy dogs dog'

In [102]:
df['processed_text'] = df.apply(process_text, axis=1)

In [103]:
df.head()

Unnamed: 0,user_name,constituency,party,gender,tweet_id,permalink,text,date,retweets,favourites,replies,party_new,processed_text
0,skinnock,Aberavon,Labour,Male,947017416047058944,https://twitter.com/SKinnock/status/9470174160...,Devastating resignation letter from Andrew Ado...,2017-12-30 08:12:01,17,43,11,Labour,devastating resignation letter andrew adonis r...
1,skinnock,Aberavon,Labour,Male,944285195372548097,https://twitter.com/SKinnock/status/9442851953...,The gov need to act to allow more flexibility ...,2017-12-22 19:15:09,2,5,2,Labour,gov need act allow flexibility give bsps trust...
2,skinnock,Aberavon,Labour,Male,943841995390423040,https://twitter.com/SKinnock/status/9438419953...,Here are the fabled sector analyses. Damp squi...,2017-12-21 13:54:02,21,29,8,Labour,fabled sector analyses damp squib idea cld giv...
3,skinnock,Aberavon,Labour,Male,943595206225559552,https://twitter.com/SKinnock/status/9435952062...,"Fallon, Patel and now Green. Three strikes and...",2017-12-20 21:33:22,7,31,5,Labour,fallon patel green three strikes you’re theresa
4,skinnock,Aberavon,Labour,Male,943552279189258245,https://twitter.com/SKinnock/status/9435522791...,Waiting to hear from @DavidGauke re my concern...,2017-12-20 18:42:48,3,1,1,Labour,waiting hear concerns abt bsps transfer defaul...


In [104]:
# Split into two dataframes based on party

labour_df = df[df['party_new'] == 'Labour']

conservative_df = df[df['party_new'] == 'Conservative']

print ('Done')

Done


In [105]:
# Loop through the dataframe and make an ultra long list of all the words - Conservative

conservative_corpus = []

for index, row in conservative_df.iterrows():
    
    text = row['processed_text']
    
    conservative_corpus.extend(word_tokenize(text))

In [106]:
# Loop through the dataframe and make an ultra long list of all the words - Labour

labour_corpus = []

for index, row in labour_df.iterrows():
    
    text = row['processed_text']
    
    labour_corpus.extend(word_tokenize(text))

In [107]:
conservative_freq = nltk.FreqDist(conservative_corpus)
labour_freq = nltk.FreqDist(labour_corpus)

In [108]:
pickle.dump(conservative_freq, open('conservative_corpus.p', 'wb' ))
pickle.dump(labour_freq, open('labour_corpus.p', 'wb' ))

In [109]:
# Most common Conservative words

conservative_freq.most_common(50)

[('great', 56017),
 ('today', 48220),
 ('good', 42594),
 ('new', 33392),
 ('thanks', 32653),
 ('see', 30371),
 ('people', 24703),
 ('thank', 23589),
 ('uk', 22369),
 ('local', 22190),
 ('day', 21859),
 ('labour', 21549),
 ('work', 20660),
 ('support', 20597),
 ('well', 20508),
 ('meeting', 20024),
 ('get', 18908),
 ('news', 18608),
 ('would', 18271),
 ('morning', 18169),
 ('one', 17873),
 ('time', 17414),
 ('last', 16899),
 ('eu', 16839),
 ('mp', 15060),
 ('need', 14451),
 ('like', 13974),
 ('many', 13947),
 ('year', 13935),
 ('please', 13900),
 ('help', 13777),
 ('back', 13640),
 ('looking', 13482),
 ('done', 13465),
 ('forward', 13253),
 ('debate', 13031),
 ('vote', 12967),
 ('week', 12799),
 ('parliament', 12143),
 ('really', 11899),
 ('think', 11853),
 ('much', 11783),
 ('team', 11557),
 ('email', 11499),
 ('know', 11493),
 ('business', 11419),
 ('first', 11140),
 ('visit', 11053),
 ('best', 11025),
 ('meet', 10917)]

In [110]:
labour_freq.most_common(50)

[('great', 81864),
 ('thanks', 80816),
 ('labour', 73910),
 ('today', 71024),
 ('good', 67293),
 ('people', 56127),
 ('see', 47245),
 ('get', 44510),
 ('new', 42177),
 ('well', 40316),
 ('support', 38808),
 ('one', 37791),
 ('time', 36486),
 ('day', 35541),
 ('think', 35344),
 ('thank', 34985),
 ('need', 34104),
 ('like', 33975),
 ('tory', 33774),
 ('work', 33611),
 ('would', 33215),
 ('know', 31931),
 ('govt', 30270),
 ('yes', 28917),
 ('tories', 28745),
 ('really', 27788),
 ('vote', 27226),
 ('back', 27027),
 ('last', 26943),
 ('local', 26199),
 ('us', 26153),
 ('debate', 25797),
 ('campaign', 25339),
 ('hope', 25245),
 ('many', 24961),
 ('nhs', 24852),
 ('done', 24483),
 ('government', 24337),
 ('mp', 24243),
 ('uk', 24038),
 ("'ll", 23893),
 ('help', 23682),
 ('much', 23617),
 ("n't", 23356),
 ('party', 23068),
 ('please', 22493),
 ('make', 22267),
 ('morning', 21658),
 ('meeting', 21590),
 ('right', 21505)]