## Import Libraries

In [175]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import string
from nltk import word_tokenize, FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
import pickle

## Import Data

In [176]:
# Load the raw, unlabelled tweets from the Excel workbook (path is relative to the working directory).
df = pd.read_excel('unlabelled_data.xlsx')

In [177]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,Mobile Network,Tweet Created At,Tweet Text
0,0,@VodafoneUK,2019-12-04 08:05:14,@VodafoneUK Plus £2.28 package &amp; posting !...
1,1,@VodafoneUK,2019-12-04 08:04:05,I have repeatedly asked how to get a refund so...
2,2,@VodafoneUK,2019-12-04 08:01:19,"I have supplied visa details twice, I have bee..."
3,3,@VodafoneUK,2019-12-04 07:57:42,@VodafoneIN promised yesterday I’d receive no ...
4,4,@VodafoneUK,2019-12-04 07:57:16,@VodafoneUK you send texts about rewards - thi...


In [178]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0          0
Mobile Network      0
Tweet Created At    0
Tweet Text          1
dtype: int64

In [179]:
# Drop every row that has a missing value in any column. This mutates df in place,
# so the original row count is only recoverable by re-running the load cell.
df.dropna(inplace=True)

In [180]:
# Re-check the per-column missing-value counts after dropping rows.
df.isna().sum()

Unnamed: 0          0
Mobile Network      0
Tweet Created At    0
Tweet Text          0
dtype: int64

## Create List of Stopwords

In [181]:
# Tokens to discard during cleaning, in this order:
#   1. NLTK's English stopwords,
#   2. every individual character of string.punctuation (unpacking a str yields its characters),
#   3. a handful of extra leftovers such as quote fragments, an ellipsis and 'amp'.
# NOTE(review): "/n" may have been intended as the newline escape "\n" — confirm before changing.
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "/n", "''", '""', '...', '``', "'", '’', 'amp',
]

In [182]:
# Display the complete stopword list for inspection.
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Create Function to Tokenize, Remove Stopwords, Make Lower Case

In [183]:
# Patterns are compiled once here rather than on every call, and written as raw
# strings so that "\w" / "\S" are regex escapes instead of invalid string escapes.
_MENTION_RE = re.compile(r'\w*@\w*')
# Any word containing "http"; when "://" follows, the rest of the URL is consumed too,
# so links to hosts other than t.co no longer leave "://host/path" fragments behind.
_URL_RE = re.compile(r'\w*http\w*(?:://\S*)?')
# Scheme-less t.co short links; the dot is escaped so it only matches a literal ".".
_TCO_RE = re.compile(r'\w*//t\.co/\w*')


def process_tweet(tweet):
    """Clean a single tweet and return its lower-cased, stopword-free tokens.

    Steps, in order:
      1. Remove "@username" mentions (any run of word characters around an "@").
      2. Remove links: words containing "http" together with the remainder of
         the URL, and bare "//t.co/..." short links.
      3. Tokenize the remaining text with ``nltk.word_tokenize``.
      4. Lower-case every token and drop those present in the module-level
         ``stopwords_list``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Lower-cased tokens that are not in ``stopwords_list``, in their
        original order.
    """
    # Strip mentions first, then links, matching the original processing order.
    for pattern in (_MENTION_RE, _URL_RE, _TCO_RE):
        tweet = pattern.sub('', tweet)

    # Lower-case each token exactly once, then filter against the stopword list.
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(tweet))
    return [token for token in lowered_tokens if token not in stopwords_list]

## Tokenize All Tweets

In [184]:
# Run every tweet through process_tweet, keeping the results in row order.
tokenized_tweets = [process_tweet(text) for text in df['Tweet Text']]
        

In [185]:
# Number of tweets that were tokenized.
len(tokenized_tweets)

16144

In [186]:
# Spot-check the cleaned tokens of a single tweet (the one at position 2).
tokenized_tweets[2]

['supplied',
 'visa',
 'details',
 'twice',
 'subjected',
 'horrendously',
 'rude',
 'staff',
 'instore',
 'vodafone',
 'stealing',
 'money',
 'removing',
 'services',
 'paid',
 'tourists',
 'use',
 'vodafone']

## Lemmatization

In [187]:
# One WordNetLemmatizer instance, created once so it can be reused for every token.
lemmatizer = WordNetLemmatizer()

In [188]:

# Lemmatize each token of each tweet (lemmatize is called without a part-of-speech
# argument), preserving the one-list-per-tweet structure of tokenized_tweets.
lemmatized_tweets = [
    [lemmatizer.lemmatize(token) for token in tokens]
    for tokens in tokenized_tweets
]
        
        

## Save Tokenized/Lemmatized Tweets to Modified Dataframe

In [189]:
# Store the lemmatized token lists in a new 'Cleaned Tweets' column (one list per row).
df['Cleaned Tweets'] = lemmatized_tweets

In [190]:
# Preview the frame again to confirm the 'Cleaned Tweets' column was added.
df.head()

Unnamed: 0.1,Unnamed: 0,Mobile Network,Tweet Created At,Tweet Text,Cleaned Tweets
0,0,@VodafoneUK,2019-12-04 08:05:14,@VodafoneUK Plus £2.28 package &amp; posting !...,"[plus, £2.28, package, posting]"
1,1,@VodafoneUK,2019-12-04 08:04:05,I have repeatedly asked how to get a refund so...,"[repeatedly, asked, get, refund, use, another,..."
2,2,@VodafoneUK,2019-12-04 08:01:19,"I have supplied visa details twice, I have bee...","[supplied, visa, detail, twice, subjected, hor..."
3,3,@VodafoneUK,2019-12-04 07:57:42,@VodafoneIN promised yesterday I’d receive no ...,"[promised, yesterday, receive, call, would, ge..."
4,4,@VodafoneUK,2019-12-04 07:57:16,@VodafoneUK you send texts about rewards - thi...,"[send, text, reward, morning, lindt, take, app..."


In [191]:
## Save using Pickle
# Writes the whole DataFrame, including the 'Cleaned Tweets' column, to a file named
# 'cleaned_tweets' (no extension) at a path relative to the working directory.
df.to_pickle('cleaned_tweets')