# Natural Language Processing - Language Use

A notebook looking at word use differences between political parties.

This notebook compares the word frequencies in tweets from the two main political parties.

In [66]:
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

In [7]:
# Load the tweet dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/processed_tweets.csv')

In [9]:
# Drop the unnamed column that read_csv picked up (presumably a saved index
# from an earlier export - verify). Rebinding instead of inplace=True, together
# with errors='ignore', keeps this cell safe to re-run: the original raised a
# KeyError the second time because the column was already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [12]:
# Preview the first rows to check the load and the available columns.
df.head()

Unnamed: 0,user_name,constituency,party,gender,tweet_id,permalink,text,date,retweets,favourites,replies,party_new
0,skinnock,Aberavon,Labour,Male,947017416047058944,https://twitter.com/SKinnock/status/9470174160...,Devastating resignation letter from Andrew Ado...,2017-12-30 08:12:01,17,43,11,Labour
1,skinnock,Aberavon,Labour,Male,944285195372548097,https://twitter.com/SKinnock/status/9442851953...,The gov need to act to allow more flexibility ...,2017-12-22 19:15:09,2,5,2,Labour
2,skinnock,Aberavon,Labour,Male,943841995390423040,https://twitter.com/SKinnock/status/9438419953...,Here are the fabled sector analyses. Damp squi...,2017-12-21 13:54:02,21,29,8,Labour
3,skinnock,Aberavon,Labour,Male,943595206225559552,https://twitter.com/SKinnock/status/9435952062...,"Fallon, Patel and now Green. Three strikes and...",2017-12-20 21:33:22,7,31,5,Labour
4,skinnock,Aberavon,Labour,Male,943552279189258245,https://twitter.com/SKinnock/status/9435522791...,Waiting to hear from @DavidGauke re my concern...,2017-12-20 18:42:48,3,1,1,Labour


In [95]:
# How many tweets have a missing 'text' value (True) versus a present one (False)?
df['text'].isna().value_counts()

False    2716793
True        1210
Name: text, dtype: int64

In [97]:
# Keep only the tweets that actually have text

df = df[df['text'].notna()]

In [101]:
# Process text - this takes a while on the full dataset

stop_words = stopwords.words('english')

punc = string.punctuation

# Set versions of the two lookups above. Membership is tested once per token
# for every tweet, so constant-time lookups matter. The original list/str names
# are kept unchanged in case other cells reference them.
_STOP_WORD_SET = frozenset(stop_words)
_PUNC_CHARS = frozenset(punc)

# Links are removed as individual tokens. The dots are escaped so ordinary
# words such as "picked" or "epic" no longer trigger a match, and text that
# follows a link is kept. One optional whitespace character is tolerated after
# the scheme because some tweet scrapers emit "http:// host/path" - presumably
# relevant for this dataset; verify against the raw text.
_URL_RE = re.compile(r'https?://\s?\S*|pic\.twitter\.com/\S+', flags=re.IGNORECASE)
_MENTION_RE = re.compile(r'@\S*\b')

# Built once and reused; the stemmer does not need to be recreated per word.
_STEMMER = SnowballStemmer("english")


def process_text(row, stem=False):
    """
    Clean one tweet and return it as a single space-joined string of tokens.

    Steps, in order:
      1. Remove URLs (http/https links and pic.twitter.com links).
      2. Remove @mentions and strip the '#' from hashtags (the tag word stays).
      3. Lower-case the text.
      4. Remove English stop words.
      5. Tokenise with NLTK and drop tokens made up only of punctuation.
      6. Optionally stem each remaining token with the Snowball stemmer.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'text' (a DataFrame row or a plain dict) whose
        'text' value is the tweet as a str.
    stem : bool, default False
        When truthy, stem every token before joining.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    text = row['text']

    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = text.replace('#', '')

    text = text.lower()

    # Stop words are removed on whitespace-split words *before* tokenising,
    # because the tokenizer splits contractions ("doesn't" -> "does", "n't")
    # and the pieces would no longer match the stop word list.
    words = [word for word in text.split() if word not in _STOP_WORD_SET]

    words = word_tokenize(' '.join(words))

    # Drop tokens consisting solely of punctuation characters. The previous
    # `word not in punc` was a substring test against the punctuation string,
    # which let multi-character tokens such as "..." or "''" through.
    words = [word for word in words
             if not all(char in _PUNC_CHARS for char in word)]

    if stem:
        words = [_STEMMER.stem(word) for word in words]

    return ' '.join(words)
    

In [105]:
# Hand-written example "row": process_text only reads the 'text' key, so a dict is enough.
test = {'text': "The quick brown fox doesn't jump over the lazy dogs!" }

In [106]:
# Sanity check: stop words, the contraction and the punctuation should all be gone.
process_text(test)

'quick brown fox jump lazy dogs'

In [107]:
# process_text only reads row['text'], so map over that column directly.
# df.apply(axis=1) builds a full Series object for every row, which is far
# slower on a frame this size; the resulting column is the same.
df['processed_text'] = df['text'].map(lambda text: process_text({'text': text}))

In [108]:
# Preview the frame again to inspect the new 'processed_text' column.
df.head()

Unnamed: 0,user_name,constituency,party,gender,tweet_id,permalink,text,date,retweets,favourites,replies,party_new,processed_text
0,skinnock,Aberavon,Labour,Male,947017416047058944,https://twitter.com/SKinnock/status/9470174160...,Devastating resignation letter from Andrew Ado...,2017-12-30 08:12:01,17,43,11,Labour,devastating resignation letter andrew adonis r...
1,skinnock,Aberavon,Labour,Male,944285195372548097,https://twitter.com/SKinnock/status/9442851953...,The gov need to act to allow more flexibility ...,2017-12-22 19:15:09,2,5,2,Labour,gov need act allow flexibility give bsps trust...
2,skinnock,Aberavon,Labour,Male,943841995390423040,https://twitter.com/SKinnock/status/9438419953...,Here are the fabled sector analyses. Damp squi...,2017-12-21 13:54:02,21,29,8,Labour,fabled sector analyses damp squib idea cld giv...
3,skinnock,Aberavon,Labour,Male,943595206225559552,https://twitter.com/SKinnock/status/9435952062...,"Fallon, Patel and now Green. Three strikes and...",2017-12-20 21:33:22,7,31,5,Labour,fallon patel green three strikes you’re out th...
4,skinnock,Aberavon,Labour,Male,943552279189258245,https://twitter.com/SKinnock/status/9435522791...,Waiting to hear from @DavidGauke re my concern...,2017-12-20 18:42:48,3,1,1,Labour,waiting hear concerns abt bsps transfer defaul...


In [109]:
# One dataframe per party, selected on the 'party_new' column

labour_df = df.loc[df['party_new'].eq('Labour')]

conservative_df = df.loc[df['party_new'].eq('Conservative')]

In [110]:
# Build one long list of every word in the Conservative tweets.
#
# This reads the cleaned 'processed_text' column rather than the raw 'text':
# the raw tweets still contain stop words, links and punctuation, which would
# dominate the word frequencies computed from this list.

conservative_corpus = []

for text in conservative_df['processed_text']:

    # processed_text is a space-joined token string, so split() recovers the
    # tokens. extend() appends in place; `corpus = corpus + tokens` copied the
    # whole list on every tweet.
    conservative_corpus.extend(text.split())

In [111]:
# Build one long list of every word in the Labour tweets.
#
# This reads the cleaned 'processed_text' column rather than the raw 'text':
# the raw tweets still contain stop words, links and punctuation, which would
# dominate the word frequencies computed from this list.

labour_corpus = []

for text in labour_df['processed_text']:

    # processed_text is a space-joined token string, so split() recovers the
    # tokens. extend() appends in place; `corpus = corpus + tokens` copied the
    # whole list on every tweet.
    labour_corpus.extend(text.split())

In [None]:
# Count how often each token occurs in each party's corpus.
# NOTE: this needs the top-level `nltk` package to be imported, not just its submodules.
conservative_freq = nltk.FreqDist(conservative_corpus)
labour_freq = nltk.FreqDist(labour_corpus)