In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [5]:
# Load the tweets dataset. This is a cut-down version of the full tweet
# dataset, produced by the operations in the file "eda-rea-v-liv-2018".
# NOTE(review): the preview of this frame shows "Unnamed: 0" and
# "Unnamed: 0.1" columns, which look like index columns written out by
# earlier saves - confirm, and consider dropping them where the CSV is written.
en_tweets_df = pd.read_csv(
    filepath_or_buffer='en_tweets_df.csv',
    lineterminator='\n',
)

In [11]:
# Preview the first five rows to sanity-check the loaded columns
en_tweets_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,text,lang,retweeted_status,created_at,user,tweet_text,is_retweet,created_at_hour_minute
0,0,1.000366e+18,MATCH-DAY\n\nReal Madrid vs Liverpool\n\n#UCLF...,en,,2018-05-26 13:18:30+00:00,"{'id': 2846595478, 'id_str': '2846595478', 'na...",MATCH-DAY\n\nReal Madrid vs Liverpool\n\n#UCLF...,False,2018-05-26 13:18:00+00:00
1,4,1.000366e+18,RT @ECG_Unofficial: We will like to categorica...,en,{'created_at': 'Sat May 26 11:14:44 +0000 2018...,2018-05-26 13:18:31+00:00,"{'id': 902735000445095938, 'id_str': '90273500...",RT @ECG_Unofficial: We will like to categorica...,True,2018-05-26 13:18:00+00:00
2,5,1.000366e+18,Real Madrid.... LETS GO!!!!!!!!!!!!!!!!!!!!!!!...,en,,2018-05-26 13:18:31+00:00,"{'id': 240672622, 'id_str': '240672622', 'name...",Real Madrid.... LETS GO!!!!!!!!!!!!!!!!!!!!!!!...,False,2018-05-26 13:18:00+00:00
3,7,1.000366e+18,RT @YNWA_Claire: Please? 🙏🏼\n\n#UCLFinal https...,en,{'created_at': 'Sat May 26 13:13:45 +0000 2018...,2018-05-26 13:18:31+00:00,"{'id': 277019564, 'id_str': '277019564', 'name...",RT @YNWA_Claire: Please? 🙏🏼\n\n#UCLFinal https...,True,2018-05-26 13:18:00+00:00
4,8,1.000366e+18,RT @ChampionsLeague: Two European giants go he...,en,{'created_at': 'Sat May 26 13:00:01 +0000 2018...,2018-05-26 13:18:31+00:00,"{'id': 957644286350315521, 'id_str': '95764428...",RT @ChampionsLeague: Two European giants go he...,True,2018-05-26 13:18:00+00:00


In [73]:
# Import NLTK libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on.
# word_tokenize needs the Punkt sentence-tokenizer data: older NLTK releases
# load it from 'punkt', newer releases load it from 'punkt_tab'. Requesting
# both keeps the notebook runnable on either; nltk.download skips packages
# that are already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop word lists used to filter tokens. Kept last so the value displayed
# for this cell is still the result of the stopwords download.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/jamie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/jamie/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Tokenization <a class="anchor" id="tokenization"></a>

### Word Tokenization <a class="anchor" id="word-tokenization"></a>

In [81]:
# Free the sample DataFrame if it exists. The guard keeps this cell safe to
# re-run and safe on a fresh kernel: SAMPLE_en_tweets_df is not created by any
# earlier cell in this notebook, so an unconditional `del` raises NameError.
if 'SAMPLE_en_tweets_df' in globals():
    del SAMPLE_en_tweets_df

In [89]:
# Build the stop word list that the tweet-cleaning function filters against.

# Standard English stop words shipped with NLTK
english_stopwords = stopwords.words('english')

# Context-specific stop words (refer to the "Most Common Words" section below
# for how these were identified):
#   - rt: short for retweet; it provides no insight, and a column already
#     exists to identify retweets
#   - http, https: the start of web links; these provide little value as
#     "words". Future work: they could be used to build a feature along the
#     lines of "Contains Web Link?"
#   - uclfinal, championsleague, championsleaguefinal: "hashtag"/topical
#     words; the original tweet dataset contained only tweets with the
#     uclfinal hashtag, so these words do not add value to the analysis
custom_stopwords = [
    'rt',
    'http',
    'https',
    'uclfinal',
    'championsleague',
    'championsleaguefinal',
]

# Single combined list: NLTK's English stop words followed by the custom ones
stop_words = [*english_stopwords, *custom_stopwords]

def CleanTokenizedWords(tweet):
    """Tokenize a tweet into lowercase, purely alphabetic, non-stop-word tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example a NaN standing in
        for a missing value) is treated as containing no words.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that, after lowercasing and
        stripping surrounding whitespace, consist only of letters and are not
        in ``stop_words``. Token order and duplicates are preserved.
    """
    # Missing / non-text values have no words; return early instead of
    # handing them to the tokenizer.
    if not isinstance(tweet, str):
        return []

    # ``stop_words`` is a list; a set gives constant-time membership checks
    # for every token in the tweet.
    stop_word_lookup = set(stop_words)

    normalized_tokens = (token.lower().strip() for token in word_tokenize(tweet))

    # ``isalpha`` rejects any token containing a non-letter character, so
    # punctuation such as parentheses never reaches the result and needs no
    # separate replacement step.
    return [
        token
        for token in normalized_tokens
        if token.isalpha() and token not in stop_word_lookup
    ]

In [None]:
# Tokenize the words in every tweet.
# Series.apply passes each tweet_text value straight to CleanTokenizedWords,
# instead of building a full row object per tweet (DataFrame.apply with
# axis=1) only to read a single column from it.
en_tweets_df['tokenized_words'] = en_tweets_df['tweet_text'].apply(CleanTokenizedWords)

### Most Common Words <a class="anchor" id="most-common-words"></a>

In [87]:
# Why look at the most common words? Three key benefits:
#   1. They point to further words that CleanTokenizedWords could exclude
#   2. They give further insight into the data
#   3. They surface key words that could be used to generate features
#
# NOTE(review): the saved output of this cell still lists 'uclfinal', 'https'
# and 'rt', which are in custom_stopwords - it looks like it was produced
# before those stop words were applied. Re-run after tokenizing to confirm.

# One list of tokens per tweet
words_list = en_tweets_df['tokenized_words'].tolist()

# Merge the per-tweet lists into a single flat list of words
flattened_words_list = []
for tweet_words in words_list:
    flattened_words_list.extend(tweet_words)

# Count the lowercased words and show the 100 most frequent
fdist = FreqDist(word.lower() for word in flattened_words_list)
fdist.most_common(100)

[('uclfinal', 457),
 ('https', 276),
 ('rt', 251),
 ('lfc', 121),
 ('liverpool', 103),
 ('come', 89),
 ('reds', 80),
 ('madrid', 70),
 ('real', 65),
 ('championsleague', 58),
 ('time', 43),
 ('live', 38),
 ('go', 35),
 ('halamadrid', 32),
 ('ynwa', 30),
 ('final', 29),
 ('game', 28),
 ('realmadrid', 25),
 ('allez', 23),
 ('vs', 22),
 ('let', 22),
 ('kick', 21),
 ('turn', 20),
 ('stream', 19),
 ('championsleaguefinal', 19),
 ('anfield', 19),
 ('believers', 19),
 ('rocking', 18),
 ('sound', 18),
 ('watch', 17),
 ('win', 17),
 ('kiev', 17),
 ('underway', 17),
 ('v', 16),
 ('tonight', 16),
 ('ronaldo', 16),
 ('dua', 16),
 ('gt', 16),
 ('salah', 15),
 ('league', 15),
 ('ucl', 14),
 ('lipa', 14),
 ('day', 13),
 ('football', 13),
 ('champions', 13),
 ('mobil', 13),
 ('team', 12),
 ('walk', 12),
 ('rmaliv', 12),
 ('good', 10),
 ('like', 9),
 ('never', 9),
 ('alone', 9),
 ('one', 9),
 ('uefa', 9),
 ('cristiano', 9),
 ('last', 8),
 ('performing', 8),
 ('mo', 8),
 ('first', 7),
 ('biggest', 7),
 

In [None]:
###############################################################
###############################################################
###############################################################
## SCRAP ######################################################
###############################################################

In [49]:
from collections import Counter

# Bag of words for the first tweet: count how often each token occurs,
# then print the (token, count) pairs from most to least frequent
first_tweet_tokens = en_tweets_df['tokenized_words'][0]
bow = Counter(first_tweet_tokens)
print(bow.most_common())

[(':', 3), ('#', 2), ('https', 2), ('MATCH-DAY', 1), ('Real', 1), ('Madrid', 1), ('vs', 1), ('Liverpool', 1), ('UCLFinal', 1), ('LFC', 1), ('Free', 1), ('Live', 1), ('Stream', 1), ('HD', 1), ('Here', 1), ('//t.co/PHAepWsA6o', 1), ('//t.co/T6mWNz14lb', 1)]


In [50]:
# Display the Counter itself (token -> count) built from the first tweet's tokens
bow

Counter({'MATCH-DAY': 1,
         'Real': 1,
         'Madrid': 1,
         'vs': 1,
         'Liverpool': 1,
         '#': 2,
         'UCLFinal': 1,
         'LFC': 1,
         'Free': 1,
         'Live': 1,
         'Stream': 1,
         'HD': 1,
         'Here': 1,
         ':': 3,
         'https': 2,
         '//t.co/PHAepWsA6o': 1,
         '//t.co/T6mWNz14lb': 1})