
## Data Transformation (and futher cleaning)
#### Written by: Isobel Jones
#### Date: 14th of June

1. Read in cleaned data
2. Convert data in a tuple
3. Preprocess tweets
    - Lemmitise
    - Normalise
    - Remove Stop words
    - Remove hashtags
    - Remove RT and cc
    - Remove URLs
    - Remove mentions (GDPR compliant, this information also doesn’t add value to build sentiment analysis model.)
    - Remove punctuation
    - Tokenize text
    - Convert text to lower case
4. Write preprocessed tuples to serialized pickle file

In [1]:
import os
import re
import sys
import csv
import nltk
import emoji
import string
import pickle
import timeit
import tokenizer
import collections
from spellchecker import SpellChecker
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/isobeljones/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/isobeljones/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### 1. Read in cleaned data

In [4]:
##tweetxamp needs to be each individual tweet
# for x in range(0,len(Royalmaillist_df)-1):
#     for tag in listnames:
#        Royalmaillist_df.Tweet[x] = Royalmaillist_df.Tweet[x].replace(tag, gocept.pseudonymize.name(tag,'secret'))
# print (Royalmaillist_df.Tweet[1])

/Users/isobeljones/DataScience/automating-social-media


In [5]:
# load data from a file and append it to the rawData
def loadData(path):
    with open(path) as f:
        reader = csv.reader(f, delimiter=',')
        next(reader)
        for line in reader:
            (Id, Tweet, Label) = parseTweet(line)
            rawData.append((Id, Tweet, Label))
            preprocessedData.append((Id, preProcess(Tweet), Label))

In [3]:
def correctspelling(listofwords):
    correctspellinglist=[]
    spell = SpellChecker() 
    for word in listofwords:
        # Get the one `most likely` answer
        if word in spell.unknown(listofwords):
            correctspellinglist.append(spell.correction(word))
        else:
            correctspellinglist.append(word)
    return correctspellinglist
misspelled = ['something', 'is', 'hapenning', 'here']
correctspelling(misspelled)

['something', 'is', 'happening', 'here']

In [6]:
# Convert line from input file into an id/tweet/label tuple

def parseTweet(tweetLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    
    #Get the first column of data and convert to datatype int
    id = int(tweetLine[0])
    
    #Get the second column of data and convert to datatype string
    text = ''.join(tweetLine[1:2])
    
    #Get the third column of data and convert to datatype string
    label = ''.join(tweetLine[2:3])

    return (id , text, label)

In [7]:
stopwordsmodified = ['i',
# List of step words that I have identified through deep dive of end results                     
 'get','when','day', 'royal','get','doe','dm','say','u','amp','next','still','pm',
 'me','item','how','not','hi','still','got','th','letter','wa','may','po','need',
 'st','today','hear','check','could','update','even','sent','much','already','please',
 'help','people','saying','go','x','one','since',
 'one','somenone','way','local','raised','hour','get','yet','back','said','want','week'

# List of of generics out of the box stop words                    
 'put','know','would','think','ok','ago','getting','think','back'
 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", 
 "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 
 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 
 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 
 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 
 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 
  'why',  'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 
 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y',
 'ain', 'aren', "aren't", 'couldn', "couldn't",  'doesn', "doesn't", 'hadn', "hadn't", 
  'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 
 'won', "won't", 'wouldn', "wouldn't"]

In [8]:
# Text Preprocessing and feature vectorization
stopwordscached = stopwords.words('english')

# Input: a string of one review

def preProcess(tweet):
    
    #Clean tweets
    tweet = re.sub('http\S+\s*', '', tweet)  # remove URLs
    tweet = re.sub('RT|cc', '', tweet)  # remove RT and cc
    
    #Add spaces inbetween punctuation
    tweet = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", tweet)
    tweet = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", tweet)
    
    # normalisation
    tweet = re.sub(r"(\S)\1\1+",r"\1\1\1", tweet)
    #tweet = re.sub('#\S+', '', tweet)  # remove hashtags
    #tweet = re.sub('@\S+', '', tweet)  # remove mentions (Make GDPR Compliant)
    tweet = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), '', tweet)  # remove punctuations
    tweet = re.sub(r"[a-zA-Z]{1,2}\d+[a-zA-Z]{1,4}",r"tracking_number", tweet) # replace tracking numbers witht the string 'tracking_number'
    tweet = re.sub("\d+", "", tweet) # Remove numbers (but do not remove numbers in words)
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet) #Word Lengthening, Remove redundant letters e.g heyyyyyyaaaa to heyyaa
    tweet = re.sub('[rR]oyal[mM]ail', '', tweet)  # remove royalmail
    
    
    #Stemming
    snowball_stemmer = SnowballStemmer("english")
    tweet = snowball_stemmer.stem(tweet)
    
    #Split text and make into tokens
    #tweet = re.split("\s+",tweet)
    
    #Use a specific Twitter Tokenizer
    from nltk.tokenize import TweetTokenizer
    T = TweetTokenizer()
    tweet = T.tokenize(tweet)
    
    #All iterables '' that are false (None) will be removed.
    tweet = filter(None, tweet)
    
    #lemitisation
    wnl = WordNetLemmatizer()
    tweet =[wnl.lemmatize(i) for i in tweet]
    

    #All word to lowercase
    tweet = [t.lower() for t in tweet]
    
    #Remove stopwords
    tweet = [word for word in tweet if word not in stopwordsmodified]

    
    # Spell correct words
    #tweet = correctspelling(tweet) # Accuracy is less when correcting incorrect words
    
    return tweet

In [9]:
rawData = []          # the filtered data from the dataset file (should be 20315 samples)
preprocessedData = [] # the preprocessed reviews ( to see how the preprocessing is doing)

TweetPath = cwd + '/data/cleanedTweets.csv'

loadData(TweetPath) # Run the Data Transformation on the cleaned tweets

In [10]:
def countNumberOfUniqueWordsRAW(data):
    totalTokens=[]
    for a in rawData:
        totalTokens.append(a[1])
    totalWordsRawData = sum(collections.Counter(' '.join(totalTokens).split()).values())
    print('Number of unique tokens without transformation', totalWordsRawData)
    
def countNumberOfUniqueWordsPROCESSED(data):
    totalTokens=[]
    for a in data:
        totalTokens = totalTokens + a[1]
    print('Number of unique tokens with transformation', len(set(totalTokens)))
    
def countUniqueTokens(rawData,preprocessedData):
    countNumberOfUniqueWordsRAW(rawData)
    countNumberOfUniqueWordsPROCESSED(preprocessedData)
    
countUniqueTokens(rawData,preprocessedData)

Number of unique tokens without transformation 450794
Number of unique tokens with transformation 20169


In [11]:
file_name = cwd +'/data/GDPRcompliantTweets.dump'

# open the file for writing
fileObject = open(file_name,'wb')

# this writes the object preprocessedData to the
# file named 'GDPRcompliantTweets.csv'
pickle.dump(preprocessedData,fileObject)

# here we close the fileObject
fileObject.close()

In [12]:
tweet = 'The tracking number is XH8977647697GB'
tweet = re.sub(r"[a-zA-Z]{1,2}\d+[a-zA-Z]{1,4}",r"tracking_number", tweet) # replace tracking numbers witht the string 'tracking_number'
tweet

'The tracking number is tracking_number'

Check data still looks okay (emoji's are still there, spellings look normal)