In [1]:
#importing libraries
import nltk
import pandas as pd
#from emoticons import EmoticonDetector
import re as regex
import collections
import numpy as np
import plotly
from plotly import graph_objs
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from time import time
import gensim

#plotly configuration
plotly.offline.init_notebook_mode(connected=True)


detected Windows; aliasing chunkize to chunkize_serial



In [2]:
class TwitterData_Initialize():
    data = []
    processed_Traindata = []
    processed_Testdata = []
    wordlist = []
    
    data_model = None
    data_labels = None
    
    def initialize(self, csv_file, from_cached=None):
        if from_cached is not None:
            self.data_model = pd.read_csv(from_cached)
            return
        
        self.data = pd.read_csv(csv_file, usecols=[0,1,5,10,15])
        train, test = train_test_split(self.data, test_size=0.2)
        self.processed_Traindata = train
        self.processed_Testdata = test
        self.wordlist = []
        self.data_model = None
        self.data_labels = None

In [3]:
data = TwitterData_Initialize()
data.initialize("../data/Tweets.csv")
data.processed_Traindata

Unnamed: 0,tweet_id,airline_sentiment,airline,text,topic_label
11944,5.702890e+17,negative,American,@AmericanAir - Please find my bag!! In Singapo...,4
5570,5.688890e+17,neutral,Southwest,@SouthwestAir please....can I have the last ti...,5
10657,5.690000e+17,negative,US Airways,"@USAirways It says to call. Before connecting,...",8
6302,5.680720e+17,neutral,Southwest,"@SouthwestAir Ahhhh! Sorry, just followed.",8
10722,5.689400e+17,negative,US Airways,@USAirways / @AmericanAir are incompetent.,0
10898,5.687930e+17,neutral,US Airways,@USAirways Thank you for valuing my feedback. ...,0
6908,5.700410e+17,neutral,Delta,@JetBlue Are all of your flights out of Charle...,8
8172,5.686360e+17,negative,Delta,@JetBlue I'm sitting on the plane. Too many pr...,8
1118,5.699220e+17,negative,United,@united there are a lot of unhappy cold people...,2
4545,5.700420e+17,positive,Southwest,@SouthwestAir crew on flight 206 is awesome! T...,0


In [4]:
#Data Preprocessing
class DataPreprocessing:
    def iterate(self):
        for preprocessingMethod in [self.removeUrls,
                                   self.removeUsernames,
                                   self.removeElongatedWords,
                                   self.removeNa,
                                   self.replaceSlangWords,
                                   self.removeSpecialChars,
                                   self.removeNumbers]:
            yield preprocessingMethod
    
    @staticmethod
    def removeByRegex(tweets, regExp):
        tweets.loc[:, "text"].replace(regExp, "", inplace=True)
        return tweets
    
    def removeUrls(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"http.?://[^\s]+[\s]?"))
    
    def removeNa(self, tweets):
        return tweets[tweets["text"] != ""]
    
    def removeSpecialChars(self, tweets):
        for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$"
                                                                    "@", "%", "^", "*", "(", ")", "{", "}",
                                                                    "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                    "!", "?", ".", "'",
                                                                    "--", "---", "#"]):
            tweets.loc[:, "text"].replace(remove, "", inplace=True)
        return tweets
    
    def removeUsernames(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"@[^\s]+[\s]?"))
    
    def removeElongatedWords(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"(.)\1+', r'\1\1"))
    
    def removeNumbers(self, tweets):
        #print(tweets)
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))
    
    def replaceSlangWords(self, tweets):
        with open('../data/slang.txt') as file:
            slang_map = dict(map(str.strip, line.partition('\t')[::2])
            for line in file if line.strip())
            #print(tweets["text"])
            #print("-----------------------------------------END")
            for index,word in tweets['text'].iteritems():
                #print(index)
                for i in word.split():
                    isUpperCase = i.isupper()
                    i = i.lower()
                    if i in slang_map.keys():
                        word = word.replace(i, slang_map[i])
                        tweets.loc[(index),"text"] = word
                if isUpperCase:
                    i = i.upper()
        #print(tweets.loc[:,"text"])
        return tweets

In [5]:
#Cleaning the Training Data
class CleanTrainingData(TwitterData_Initialize):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
        #self.processed_Testdata = previous.processed_Testdata
    
    def cleaningData(self, cleaner):
        train = self.processed_Traindata
        #test = self.processed_Testdata
        
        for cleanerMethod in cleaner.iterate():
            train = cleanerMethod(train)
            #test = cleanerMethod(test)
        self.processed_Traindata = train
        #self.processed_Testdata = test

In [6]:
data = CleanTrainingData(data)
data.cleaningData(DataPreprocessing())
data.processed_Traindata.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline,text,topic_label
11944,5.70289e+17,negative,American,Please find my bag In Singapore for three day...,4
5570,5.68889e+17,neutral,Southwest,pleasecan I have the last tickets for me and m...,5
10657,5.69e+17,negative,US Airways,It says to call Before connecting get song dan...,8
6302,5.68072e+17,neutral,Southwest,Ahhhh Sorry just followed,8
10722,5.6894e+17,negative,US Airways,are incompetent,0


In [355]:
#Tokenizing and Stemming the data
class TokenizationStemming(CleanTrainingData):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
        #self.processed_TestData = previous.processed_TestData
    
    def stem(self, stemmer = nltk.PorterStemmer()):
        def stemJoin(row):
            row["text"] = list(map(lambda str: stemmer.stem(str.lower()), row["text"]))
            return row
    
        self.processed_Traindata = self.processed_Traindata.apply(stemJoin, axis=1)
    
    def tokenize(self, tokenizer = nltk.word_tokenize):
        def tokenizeRow(row):
            row["text"] = tokenizer(row["text"])
            row["tokenizedText"] = [] + row["text"]
            return row
        
        self.processed_Traindata = self.processed_Traindata.apply(tokenizeRow, axis=1)

In [356]:
data = TokenizationStemming(data)
data.tokenize()
data.stem()
data.processed_Traindata.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,text,topic_label,tokenizedText
2946,5.68819e+17,negative,United,"[i, wa, not, look, for, the, fare, to, be, ret...",1,"[I, was, not, looking, for, the, fare, to, be,..."
4225,5.67767e+17,negative,United,"[it, and, no, power, outlet, at, the, seat, on...",8,"[its, and, no, power, outlets, at, the, seats,..."
3895,5.68038e+17,positive,United,"[sorri, to, hear, outsourc, plan, bois, is, be...",7,"[sorry, to, hear, outsourcing, plan, Boise, is..."
14373,5.69625e+17,negative,American,"[that, unaccept, they, should, allow, me, to, ...",1,"[thats, unacceptable, They, should, allow, me,..."
9961,5.696e+17,negative,US Airways,"[it, pretti, ridicul, that, at, phx, sky, harb...",4,"[Its, pretty, ridiculous, that, at, PHX, sky, ..."


In [357]:
#Building Wordlist
#Un-filtered version without removing stopwords
words = collections.Counter()
for idx in data.processed_Traindata.index:
    words.update(data.processed_Traindata.loc[idx, "text"])

words.most_common(5)

[('to', 6898), ('the', 4837), ('i', 4340), ('flight', 3822), ('a', 3557)]

In [358]:
#Removing stopwords
stopwords = nltk.corpus.stopwords.words("english")
whitelist = ["n't", "not"]
for idx, stop_word in enumerate(stopwords):
    if stop_word not in whitelist:
        del words[stop_word]

words.most_common(5)

[('flight', 3822), ('thank', 1367), ('wa', 1287), ('get', 1287), ('not', 1284)]

In [359]:
#Generating the final wordlist
class WordList(TokenizationStemming):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
    
    whitelist = ["n't", "not"]
    wordlist = []
    
    def buildWordlist(self, min_occurrences=3, max_occurences=3000, stopwords=nltk.corpus.stopwords.words("english"),
                     whitelist=None):
        self.wordlist = []
        whitelist = self.whitelist if whitelist is None else whitelist
        import os
        if os.path.isfile('../data/wordlist.csv'):
            word_df = pd.read_csv('../data/wordlist.csv', encoding = "ISO-8859-1")
            word_df = word_df[word_df["occurrences"] > min_occurrences]
            self.wordlist = list(word_df.loc[:, "word"])
            return
        words = collections.Counter()
        for idx in self.processed_Traindata.index:
            words.update(self.processed_Traindata.loc[idx, "text"])
        
        for idx, stop_word in enumerate(stopwords):
            if stop_word not in whitelist:
                del words[stop_word]
        
        word_df = pd.DataFrame(data={"word" : [k for k, v in words.most_common() if min_occurrences < v < max_occurences],
                                    "occurrences": [v for k, v in words.most_common() if min_occurrences < v < max_occurences]},
                              columns = ["word", "occurrences"])
        
        word_df.to_csv("../data/wordlist.csv", index_label="idx", encoding = "utf8")
        self.wordlist = [k for k, v in words.most_common() if min_occurrences < v < max_occurences]
    

In [360]:
data = WordList(data)
data.buildWordlist()

In [361]:
#Transforming into Bag-of-Words
class BagOfWords(WordList):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
        self.wordlist = previous.wordlist
    
    def buildDataModel(self):
        labelColumn = ["label"]
        columns = labelColumn + list(
            map(lambda w: w + "_bow", self.wordlist))
        labels = []
        rows = []
        
        for idx in self.processed_Traindata.index:
            currentRow = []
            currentLabel = self.processed_Traindata.loc[idx, "airline_sentiment"]
            labels.append(currentLabel)
            currentRow.append(currentLabel)
            
            tokens = set(self.processed_Traindata.loc[idx, "text"])
            for _, word in enumerate(self.wordlist):
                currentRow.append(1 if word in tokens else 0)
            
            rows.append(currentRow)
        
        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        
        return self.data_model, self.data_labels

In [381]:
data = BagOfWords(data)
bow, labels = data.buildDataModel()
bow.head(5)

Unnamed: 0,label,hour_bow,cancel_bow,help_bow,servic_bow,delay_bow,time_bow,custom_bow,bag_bow,call_bow,...,gorgeou_bow,woohoo_bow,thousand_bow,understat_bow,furiou_bow,manual_bow,smell_bow,ber_bow,charleston_bow,nrt_bow
0,negative,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,neutral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,neutral,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,positive,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,negative,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [363]:
import random
seed = 666
random.seed(seed)

In [364]:
#Utility function to train the classifier and show F1, Precision, recall and Accuracy values

def test_classifier(X_train, y_train, X_test, y_test, classifier):
    log("")
    log("==================================================")
    classifier_name = str(type(classifier).__name__)
    log("Testing " + classifier_name)
    now = time()
    list_of_labels = sorted(list(set(y_train)))
    model = classifier.fit(X_train, y_train)
    log("Learning time {0}s".format(time() - now))
    now = time()
    predictions = model.predict(X_test)
    log("Predicting time {0}s".format(time() - now))
    
    precision = precision_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    recall = recall_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    log("==================Results=======================")
    log("            Negative     Neutral    Positive")
    log("F1         " + str(f1))
    log("Precision  " + str(precision))
    log("Recall     " + str(recall))
    log("Accuracy   " + str(accuracy))
    log("================================================")
    
    return precision, recall, accuracy, f1

def log(x):
    print(x)
    

In [365]:
#Classifier : BagOfWords + NaiveBayes
from sklearn.naive_bayes import BernoulliNB

X_train, X_test, y_train, y_test = train_test_split(bow.iloc[:, 1:], bow.iloc[:, 0],
                                                   train_size = 0.7, stratify=bow.iloc[:, 0],
                                                   random_state = seed)

precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, BernoulliNB())


From version 0.21, test_size will always complement train_size unless both are specified.




Testing BernoulliNB
Learning time 0.8397293090820312s
Predicting time 0.2986288070678711s
            Negative     Neutral    Positive
F1         [0.8519774  0.57836066 0.58070501]
Precision  [0.84340045 0.57198444 0.61614173]
Recall     [0.86073059 0.58488064 0.54912281]
Accuracy   0.750996015936255


In [366]:
#NaiveBayes with 8 fold cross-validation

def cv(classifier, X_train, y_train):
    log("===============================================")
    classifier_name = str(type(classifier).__name__)
    now = time()
    log("Crossvalidating " + classifier_name + "...")
    accuracy = [cross_val_score(classifier, X_train, y_train, cv=8, n_jobs=-1)]
    log("Crosvalidation completed in {0}s".format(time() - now))
    log("Accuracy: " + str(accuracy[0]))
    log("Average accuracy: " + str(np.array(accuracy[0]).mean()))
    log("===============================================")
    return accuracy

In [367]:
nb_acc = cv(BernoulliNB(), bow.iloc[:,1:], bow.iloc[:,0])

Crossvalidating BernoulliNB...
Crosvalidation completed in 21.083032608032227s
Accuracy: [0.77883959 0.74539249 0.76382253 0.74726776 0.74795082 0.76281613
 0.75393028 0.75529733]
Average accuracy: 0.7569146165589327


In [368]:
# Addtion of extra features:

# Number of Uppercase - tend to express postive/negative emotions by using uppercase words
# Number of !         - exclamation marks are likely to increase strength of opinion
# Number of ?         - might distinguish neutral tweets - seeking information
# Number of positive  - positive emoji will most likely occur in positive tweets
# emoticons
# Number of negative  - Inverse to the one above
# emoticons
# Number of ...       - commonly used in commenting something
# Number of quotations- same as above
# Number of mentions  - Lots of mentions on positive tweets, to share something good/bad
# Number of urls      - similar to number of mentions

In [369]:
#Detecting Emoticons
class EmoticonDetector:
    emoticons = {}
    
    def __init__(self, emoticon_file="../data/emoticons.txt"):
        from pathlib import Path
        content = Path(emoticon_file).read_text()
        positive = True
        for line in content.split("\n"):
            if "positive" in line.lower():
                positive = True
                continue
            elif "negative" in line.lower():
                positive = False
                continue
            
            self.emoticons[line] = positive
    
    def is_positive(self, emoticon):
        if emoticon in self.emoticons:
            return self.emoticons[emoticon]
        return False
    
    def is_emoticon(self, to_check):
        return to_check in self.emoticons

In [370]:
class ExtraFeatures(WordList):
    def __init__(self):
        pass
    
    def build_data_model(self):
        extra_columns = [col for col in self.processed_Traindata.columns if col.startswith("number_of")]
        label_column = ["label"]
        columns = label_column + extra_columns + list(
                map(lambda w: w + "_bow", self.wordlist))
        
        labels = []
        rows = []
        for idx in self.processed_Traindata.index:
            current_row = []
            current_label = self.processed_Traindata.loc[idx, "airline_sentiment"]
            labels.append(current_label)
            current_row.append(current_label)
        
            for _,col in enumerate(extra_columns):
                current_row.append(self.processed_Traindata.loc[idx, col])
        
        #adding bad-of-words
            tokens = set(self.processed_Traindata.loc[idx, "text"])
            for _,word in enumerate(self.wordlist):
                current_row.append(1 if word in tokens else 0)
        
            rows.append(current_row)
        
        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        
        return self.data_model, self.data_labels
    
    def add_column(self, column_name, column_content):
        self.processed_Traindata.loc[:, column_name] = pd.Series(column_content, index=self.processed_Traindata.index)

    def build_features(self):
        def count_by_lambda(expression, word_array):
            return len(list(filter(expression, word_array)))
        
        def count_occurences(character, word_array):
            counter = 0
            for j, word in enumerate(word_array):
                for char in word:
                    if char == character:
                        counter += 1
            return counter
        
        def count_by_regex(regex, plain_text):
            return len(regex.findall(plain_text))
        
        self.add_column("splitted_text", map(lambda txt: txt.split(" "), self.processed_Traindata["text"]))
        
        #Number of uppercase words
        uppercase = list(map(lambda txt: count_by_lambda(lambda word: word == word.upper(), txt),
                                                        self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_uppercase", uppercase)
        
        #number of !
        exclamations = list(map(lambda txt: count_occurences("!", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_exclamation", exclamations)
        
        #number of ?
        questions = list(map(lambda txt: count_occurences("?", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_question", questions)
        
        #number of ...
        ellipsis = list(map(lambda txt: count_by_regex(regex.compile(r"\.\s?\.\s?\."), txt),
                           self.processed_Traindata["text"]))
        self.add_column("number_of_ellipsis", ellipsis)
        
        #number of hashtags
        hashtags = list(map(lambda txt: count_occurences("#", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_hashtags", hashtags)
        
        #number of mentions
        mentions = list(map(lambda txt: count_occurences("@", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_mentions", mentions)
        
        #number of quotes
        quotes = list(map(lambda plain_text: int(count_occurences("'", [plain_text.strip("'").strip('"')]) / 2 +
                                                 count_occurences('"', [plain_text.strip("'").strip('"')]) / 2),
                          self.processed_Traindata["text"]))
        self.add_column("number_of_quotes", quotes)
        
        #number of urls
        urls = list(map(lambda txt: count_by_regex(regex.compile(r"http.?://[^\s]+[\s]?"), txt),
                             self.processed_Traindata["text"]))
        self.add_column("number_of_urls", urls)
        
        #number of positive emoticons
        ed = EmoticonDetector()
        positive_emo = list(
            map(lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and ed.is_positive(word), txt), 
                   self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_positive_emo", positive_emo)
        
        #number of negative emoticons
        negative_emo = list(
            map(lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and not ed.is_positive(word), txt), 
                   self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_negative_emo", negative_emo)
        
    

In [371]:
data = ExtraFeatures()
data.initialize("../data/Tweets.csv")
data.build_features()
data.cleaningData(DataPreprocessing())
data.tokenize()
data.stem()
data.buildWordlist()
data_model, labels = data.build_data_model()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [372]:
#Extended Features + Random Forest
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(data_model.iloc[:, 1:], data_model.iloc[:, 0],
                                                    train_size=0.7, stratify=data_model.iloc[:, 0],
                                                    random_state=seed)
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, RandomForestClassifier(random_state=seed,n_estimators=403,n_jobs=-1))


From version 0.21, test_size will always complement train_size unless both are specified.




Testing RandomForestClassifier
Learning time 20.246517419815063s
Predicting time 0.4646904468536377s
            Negative     Neutral    Positive
F1         [0.84963691 0.53918495 0.62803738]
Precision  [0.80558931 0.6405959  0.66141732]
Recall     [0.89877994 0.46549391 0.59786477]
Accuracy   0.7595332953898691


In [373]:
#Crosvalidation
rf_acc = cv(RandomForestClassifier(n_estimators=403,n_jobs=-1, random_state=seed),data_model.iloc[:, 1:], data_model.iloc[:, 0])

Crossvalidating RandomForestClassifier...
Crosvalidation completed in 235.91511368751526s
Accuracy: [0.7774744  0.77663934 0.76639344 0.7670765  0.76571038 0.73019126
 0.75546448 0.7518797 ]
Average accuracy: 0.7613536889768202


In [374]:
from sklearn.naive_bayes import BernoulliNB

X_train, X_test, y_train, y_test = train_test_split(bow.iloc[:, 1:], bow.iloc[:, 0],
                                                   train_size = 0.7, stratify=bow.iloc[:, 0],
                                                   random_state = seed)

precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, BernoulliNB())


From version 0.21, test_size will always complement train_size unless both are specified.




Testing BernoulliNB
Learning time 1.0232014656066895s
Predicting time 0.3078019618988037s
            Negative     Neutral    Positive
F1         [0.8519774  0.57836066 0.58070501]
Precision  [0.84340045 0.57198444 0.61614173]
Recall     [0.86073059 0.58488064 0.54912281]
Accuracy   0.750996015936255
