In [80]:
#importing libraries
import nltk
import pandas as pd
#from emoticons import EmoticonDetector
import re as regex
import collections
import numpy as np
import plotly
from plotly import graph_objs
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from time import time
import gensim

#plotly configuration
plotly.offline.init_notebook_mode(connected=True)

In [81]:
class TwitterData_Initialize():
    data = []
    processed_Traindata = []
    processed_Testdata = []
    wordlist = []
    
    data_model = None
    data_labels = None
    
    def initialize(self, csv_file, from_cached=None):
        if from_cached is not None:
            self.data_model = pd.read_csv(from_cached)
            return
        
        self.data = pd.read_csv(csv_file, usecols=[0,1,5,10,15])
        train, test = train_test_split(self.data, test_size=0.2)
        self.processed_Traindata = train
        self.processed_Testdata = test
        self.wordlist = []
        self.data_model = None
        self.data_labels = None

In [82]:
data = TwitterData_Initialize()
data.initialize("Tweets.csv")
data.processed_Traindata.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,text,topic_label
9925,5.69613e+17,positive,US Airways,@USAirways despite mechanical issues and many ...,8
13713,5.69739e+17,negative,American,@AmericanAir stuck in airplane both on way out...,7
12015,5.70269e+17,negative,American,@AmericanAir any idea on what the wait time is...,8
7781,5.6925e+17,positive,Delta,@JetBlue you can't beat jetblue in space's mat...,9
10769,5.68895e+17,negative,US Airways,@USAirways I'm trying to Request Missed Mileag...,1


In [83]:
#Plotting the training data
df = data.processed_Traindata
negative = len(df[df["airline_sentiment"] == "negative"])
positive = len(df[df["airline_sentiment"] == "positive"])
neutral = len(df[df["airline_sentiment"] == "neutral"])

dist = [
    graph_objs.Bar(
        x = ["positive", "negative", "neutral"],
        y = [positive, negative, neutral],
    )]
plotly.offline.iplot({"data":dist, "layout":graph_objs.Layout(title="Sentiment type distribution in training set")})

In [84]:
#Data Preprocessing
class DataPreprocessing:
    def iterate(self):
        for preprocessingMethod in [self.removeUrls,
                                   self.removeUsernames,
                                   self.removeElongatedWords,
                                   self.removeNa,
                                   self.removeSpecialChars,
                                   self.removeNumbers]:
            yield preprocessingMethod
    
    @staticmethod
    def removeByRegex(tweets, regExp):
        tweets.loc[:, "text"].replace(regExp, "", inplace=True)
        return tweets
    
    def removeUrls(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"http.?://[^\s]+[\s]?"))
    
    def removeNa(self, tweets):
        return tweets[tweets["text"] != ""]
    
    def removeSpecialChars(self, tweets):
        for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$"
                                                                    "@", "%", "^", "*", "(", ")", "{", "}",
                                                                    "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                    "!", "?", ".", "'",
                                                                    "--", "---", "#"]):
            tweets.loc[:, "text"].replace(remove, "", inplace=True)
        return tweets
    
    def removeUsernames(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"@[^\s]+[\s]?"))
    
    def removeElongatedWords(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"(.)\1+', r'\1\1"))
    
    def removeNumbers(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))

In [85]:
#Cleaning the Training Data
class CleanTrainingData(TwitterData_Initialize):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
        #self.processed_Testdata = previous.processed_Testdata
    
    def cleaningData(self, cleaner):
        train = self.processed_Traindata
        #test = self.processed_Testdata
        
        for cleanerMethod in cleaner.iterate():
            train = cleanerMethod(train)
            #test = cleanerMethod(test)
        self.processed_Traindata = train
        #self.processed_Testdata = test

In [86]:
data = CleanTrainingData(data)
data.cleaningData(DataPreprocessing())
data.processed_Traindata.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,text,topic_label
9925,5.69613e+17,positive,US Airways,despite mechanical issues and many delays foll...,8
13713,5.69739e+17,negative,American,stuck in airplane both on way out of PHL and a...,7
12015,5.70269e+17,negative,American,any idea on what the wait time is for refunds ...,8
7781,5.6925e+17,positive,Delta,you cant beat jetblue in spaces matter,9
10769,5.68895e+17,negative,US Airways,Im trying to Request Missed Mileage and it kee...,1


In [87]:
#Tokenizing and Stemming the data
class TokenizationStemming(CleanTrainingData):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
        #self.processed_TestData = previous.processed_TestData
    
    def stem(self, stemmer = nltk.PorterStemmer()):
        def stemJoin(row):
            row["text"] = list(map(lambda str: stemmer.stem(str.lower()), row["text"]))
            return row
    
        self.processed_Traindata = self.processed_Traindata.apply(stemJoin, axis=1)
    
    def tokenize(self, tokenizer = nltk.word_tokenize):
        def tokenizeRow(row):
            row["text"] = tokenizer(row["text"])
            row["tokenizedText"] = [] + row["text"]
            return row
        
        self.processed_Traindata = self.processed_Traindata.apply(tokenizeRow, axis=1)

In [88]:
data = TokenizationStemming(data)
data.tokenize()
data.stem()
data.processed_Traindata.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,text,topic_label,tokenizedText
9925,5.69613e+17,positive,US Airways,"[despit, mechan, issu, and, mani, delay, follo...",8,"[despite, mechanical, issues, and, many, delay..."
13713,5.69739e+17,negative,American,"[stuck, in, airplan, both, on, way, out, of, p...",7,"[stuck, in, airplane, both, on, way, out, of, ..."
12015,5.70269e+17,negative,American,"[ani, idea, on, what, the, wait, time, is, for...",8,"[any, idea, on, what, the, wait, time, is, for..."
7781,5.6925e+17,positive,Delta,"[you, cant, beat, jetblu, in, space, matter]",9,"[you, cant, beat, jetblue, in, spaces, matter]"
10769,5.68895e+17,negative,US Airways,"[im, tri, to, request, miss, mileag, and, it, ...",1,"[Im, trying, to, Request, Missed, Mileage, and..."


In [10]:
#Building Wordlist
#Un-filtered version without removing stopwords
words = collections.Counter()
for idx in data.processed_Traindata.index:
    words.update(data.processed_Traindata.loc[idx, "text"])

words.most_common(5)

[('to', 6902), ('the', 4772), ('i', 4346), ('flight', 3829), ('a', 3588)]

In [11]:
#Removing stopwords
stopwords = nltk.corpus.stopwords.words("english")
whitelist = ["n't", "not"]
for idx, stop_word in enumerate(stopwords):
    if stop_word not in whitelist:
        del words[stop_word]

words.most_common(5)

[('flight', 3829), ('thank', 1342), ('not', 1325), ('get', 1318), ('wa', 1296)]

In [12]:
#Generating the final wordlist
class WordList(TokenizationStemming):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
    
    whitelist = ["n't", "not"]
    wordlist = []
    
    def buildWordlist(self, min_occurrences=3, max_occurences=1000, stopwords=nltk.corpus.stopwords.words("english"),
                     whitelist=None):
        self.wordlist = []
        whitelist = self.whitelist if whitelist is None else whitelist
        import os
        if os.path.isfile('wordlist.csv'):
            word_df = pd.read_csv('wordlist.csv', encoding = "ISO-8859-1")
            word_df = word_df[word_df["occurrences"] > min_occurrences]
            self.wordlist = list(word_df.loc[:, "word"])
            return
        words = collections.Counter()
        for idx in self.processed_Traindata.index:
            words.update(self.processed_Traindata.loc[idx, "text"])
        
        for idx, stop_word in enumerate(stopwords):
            if stop_word not in whitelist:
                del words[stop_word]
        
        word_df = pd.DataFrame(data={"word" : [k for k, v in words.most_common() if min_occurrences < v < max_occurences],
                                    "occurrences": [v for k, v in words.most_common() if min_occurrences < v < max_occurences]},
                              columns = ["word", "occurrences"])
        
        word_df.to_csv("wordlist.csv", index_label="idx", encoding = "utf8")
        self.wordlist = [k for k, v in words.most_common() if min_occurrences < v < max_occurences]
    

In [13]:
data = WordList(data)
data.buildWordlist()

In [14]:
words = pd.read_csv("wordlist.csv", encoding = "ISO-8859-1")
x_words = list(words.loc[0:10, "word"])
x_words.reverse()
y_occ = list(words.loc[0:10, "occurrences"])
y_occ.reverse()

dist = [
    graph_objs.Bar(
        x = y_occ,
        y = x_words,
        orientation = "h"
    )]
plotly.offline.iplot({"data":dist, "layout":graph_objs.Layout(title="Top Words in the built Wordlist")})

In [15]:
#Transforming into Bag-of-Words
class BagOfWords(WordList):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
        self.wordlist = previous.wordlist
    
    def buildDataModel(self):
        labelColumn = ["label"]
        columns = labelColumn + list(
            map(lambda w: w + "_bow", self.wordlist))
        labels = []
        rows = []
        
        for idx in self.processed_Traindata.index:
            currentRow = []
            currentLabel = self.processed_Traindata.loc[idx, "airline_sentiment"]
            labels.append(currentLabel)
            currentRow.append(currentLabel)
            
            tokens = set(self.processed_Traindata.loc[idx, "text"])
            for _, word in enumerate(self.wordlist):
                currentRow.append(1 if word in tokens else 0)
            
            rows.append(currentRow)
        
        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        
        return self.data_model, self.data_labels

In [16]:
data = BagOfWords(data)
bow, labels = data.buildDataModel()
bow.head(5)

Unnamed: 0,label,hour_bow,cancel_bow,help_bow,servic_bow,delay_bow,time_bow,custom_bow,bag_bow,call_bow,...,gorgeou_bow,woohoo_bow,thousand_bow,understat_bow,furiou_bow,manual_bow,smell_bow,ber_bow,charleston_bow,nrt_bow
0,negative,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,negative,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,negative,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,negative,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,negative,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
grouped = bow.groupby(["label"]).sum()
words_to_visualize = []
sentiments = ["positive", "neutral", "negative"]

#get the most 7 common words for every sentiment
for sentiment in sentiments:
    words = grouped.loc[sentiment,:]
    words.sort_values(inplace=True, ascending=False)
    for w in words.index[:7]:
        if w not in words_to_visualize:
            words_to_visualize.append(w)

#visualizing the words
plot_data = []
for sentiment in sentiments:
    plot_data.append(graph_objs.Bar(
            x = [w.split("_")[0] for w in words_to_visualize],
            y = [grouped.loc[sentiment,w] for w in words_to_visualize],
            name = sentiment
    ))

plotly.offline.iplot({
    "data": plot_data,
    "layout": graph_objs.Layout(title="'Most Common words across sentiments")
})

In [18]:
import random
seed = 666
random.seed(seed)

In [19]:
#Utility function to train the classifier and show F1, Precision, recall and Accuracy values

def test_classifier(X_train, y_train, X_test, y_test, classifier):
    log("")
    log("==================================================")
    classifier_name = str(type(classifier).__name__)
    log("Testing " + classifier_name)
    now = time()
    list_of_labels = sorted(list(set(y_train)))
    model = classifier.fit(X_train, y_train)
    log("Learning time {0}s".format(time() - now))
    now = time()
    predictions = model.predict(X_test)
    log("Predicting time {0}s".format(time() - now))
    
    precision = precision_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    recall = recall_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    log("==================Results=======================")
    log("            Negative     Neutral    Positive")
    log("F1         " + str(f1))
    log("Precision  " + str(precision))
    log("Recall     " + str(recall))
    log("Accuracy   " + str(accuracy))
    log("================================================")
    
    return precision, recall, accuracy, f1

def log(x):
    print(x)
    

In [20]:
#Classifier : BagOfWords + NaiveBayes
from sklearn.naive_bayes import BernoulliNB

X_train, X_test, y_train, y_test = train_test_split(bow.iloc[:, 1:], bow.iloc[:, 0],
                                                   train_size = 0.7, stratify=bow.iloc[:, 0],
                                                   random_state = seed)

precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, BernoulliNB())


From version 0.21, test_size will always complement train_size unless both are specified.




Testing BernoulliNB
Learning time 0.735954999923706s
Predicting time 0.2744331359863281s
            Negative     Neutral    Positive
F1         [0.85962955 0.57930108 0.60245515]
Precision  [0.8458498  0.5731383  0.65773196]
Recall     [0.8738657  0.58559783 0.55574913]
Accuracy   0.7615253272623791


In [21]:
#NaiveBayes with 8 fold cross-validation

def cv(classifier, X_train, y_train):
    log("===============================================")
    classifier_name = str(type(classifier).__name__)
    now = time()
    log("Crossvalidating " + classifier_name + "...")
    accuracy = [cross_val_score(classifier, X_train, y_train, cv=8, n_jobs=-1)]
    log("Crosvalidation completed in {0}s".format(time() - now))
    log("Accuracy: " + str(accuracy[0]))
    log("Average accuracy: " + str(np.array(accuracy[0]).mean()))
    log("===============================================")
    return accuracy

In [22]:
nb_acc = cv(BernoulliNB(), bow.iloc[:,1:], bow.iloc[:,0])

Crossvalidating BernoulliNB...
Crosvalidation completed in 10.928978681564331s
Accuracy: [0.7665529  0.75767918 0.77269625 0.75767918 0.75734792 0.74641148
 0.74914559 0.75803144]
Average accuracy: 0.7581929925651858


In [23]:
# Addtion of extra features:

# Number of Uppercase - tend to express postive/negative emotions by using uppercase words
# Number of !         - exclamation marks are likely to increase strength of opinion
# Number of ?         - might distinguish neutral tweets - seeking information
# Number of positive  - positive emoji will most likely occur in positive tweets
# emoticons
# Number of negative  - Inverse to the one above
# emoticons
# Number of ...       - commonly used in commenting something
# Number of quotations- same as above
# Number of mentions  - Lots of mentions on positive tweets, to share something good/bad
# Number of urls      - similar to number of mentions

In [24]:
#Detecting Emoticons
class EmoticonDetector:
    emoticons = {}
    
    def __init__(self, emoticon_file=".\\emoticons.txt"):
        from pathlib import Path
        content = Path(emoticon_file).read_text()
        positive = True
        for line in content.split("\n"):
            if "positive" in line.lower():
                positive = True
                continue
            elif "negative" in line.lower():
                positive = False
                continue
            
            self.emoticons[line] = positive
    
    def is_positive(self, emoticon):
        if emoticon in self.emoticons:
            return self.emoticons[emoticon]
        return False
    
    def is_emoticon(self, to_check):
        return to_check in self.emoticons

In [25]:
class ExtraFeatures(WordList):
    def __init__(self):
        pass
    
    def build_data_model(self):
        extra_columns = [col for col in self.processed_Traindata.columns if col.startswith("number_of")]
        label_column = ["label"]
        columns = label_column + extra_columns + list(
                map(lambda w: w + "_bow", self.wordlist))
        
        labels = []
        rows = []
        for idx in self.processed_Traindata.index:
            current_row = []
            current_label = self.processed_Traindata.loc[idx, "airline_sentiment"]
            labels.append(current_label)
            current_row.append(current_label)
        
            for _,col in enumerate(extra_columns):
                current_row.append(self.processed_Traindata.loc[idx, col])
        
        #adding bad-of-words
            tokens = set(self.processed_Traindata.loc[idx, "text"])
            for _,word in enumerate(self.wordlist):
                current_row.append(1 if word in tokens else 0)
        
            rows.append(current_row)
        
        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        
        return self.data_model, self.data_labels
    
    def add_column(self, column_name, column_content):
        self.processed_Traindata.loc[:, column_name] = pd.Series(column_content, index=self.processed_Traindata.index)

    def build_features(self):
        def count_by_lambda(expression, word_array):
            return len(list(filter(expression, word_array)))
        
        def count_occurences(character, word_array):
            counter = 0
            for j, word in enumerate(word_array):
                for char in word:
                    if char == character:
                        counter += 1
            return counter
        
        def count_by_regex(regex, plain_text):
            return len(regex.findall(plain_text))
        
        self.add_column("splitted_text", map(lambda txt: txt.split(" "), self.processed_Traindata["text"]))
        
        #Number of uppercase words
        uppercase = list(map(lambda txt: count_by_lambda(lambda word: word == word.upper(), txt),
                                                        self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_uppercase", uppercase)
        
        #number of !
        exclamations = list(map(lambda txt: count_occurences("!", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_exclamation", exclamations)
        
        #number of ?
        questions = list(map(lambda txt: count_occurences("?", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_question", questions)
        
        #number of ...
        ellipsis = list(map(lambda txt: count_by_regex(regex.compile(r"\.\s?\.\s?\."), txt),
                           self.processed_Traindata["text"]))
        self.add_column("number_of_ellipsis", ellipsis)
        
        #number of hashtags
        hashtags = list(map(lambda txt: count_occurences("#", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_hashtags", hashtags)
        
        #number of mentions
        mentions = list(map(lambda txt: count_occurences("@", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_mentions", mentions)
        
        #number of quotes
        quotes = list(map(lambda plain_text: int(count_occurences("'", [plain_text.strip("'").strip('"')]) / 2 +
                                                 count_occurences('"', [plain_text.strip("'").strip('"')]) / 2),
                          self.processed_Traindata["text"]))
        self.add_column("number_of_quotes", quotes)
        
        #number of urls
        urls = list(map(lambda txt: count_by_regex(regex.compile(r"http.?://[^\s]+[\s]?"), txt),
                             self.processed_Traindata["text"]))
        self.add_column("number_of_urls", urls)
        
        #number of positive emoticons
        ed = EmoticonDetector()
        positive_emo = list(
            map(lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and ed.is_positive(word), txt), 
                   self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_positive_emo", positive_emo)
        
        #number of negative emoticons
        negative_emo = list(
            map(lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and not ed.is_positive(word), txt), 
                   self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_negative_emo", negative_emo)
        
    

In [26]:
data = ExtraFeatures()
data.initialize("Tweets.csv")
data.build_features()
data.cleaningData(DataPreprocessing())
data.tokenize()
data.stem()
data.buildWordlist()
data_model, labels = data.build_data_model()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [27]:
sentiments = ["positive", "neutral", "negative"]
plots_data_ef = []

for what in map(lambda o: "number_of_"+o,["positive_emo","negative_emo","exclamation","hashtags","question"]):
    ef_grouped = data_model[data_model[what]>=1].groupby(["label"]).count()
    plots_data_ef.append({"data":[graph_objs.Bar(
            x = sentiments,
            y = [ef_grouped.loc[s,:][0] for s in sentiments],
    )], "title":"How feature \""+what+"\" separates the tweets"})
    
for plot_data_ef in plots_data_ef:
    plotly.offline.iplot({
            "data":plot_data_ef["data"],
            "layout":graph_objs.Layout(title=plot_data_ef["title"])
    })

In [28]:
#Extended Features + Random Forest
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(data_model.iloc[:, 1:], data_model.iloc[:, 0],
                                                    train_size=0.7, stratify=data_model.iloc[:, 0],
                                                    random_state=seed)
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, RandomForestClassifier(random_state=seed,n_estimators=403,n_jobs=-1))


From version 0.21, test_size will always complement train_size unless both are specified.




Testing RandomForestClassifier
Learning time 19.51445460319519s
Predicting time 0.40056657791137695s
            Negative     Neutral    Positive
F1         [0.85174044 0.5046729  0.6293578 ]
Precision  [0.80964052 0.6        0.65209125]
Recall     [0.89845875 0.43548387 0.60815603]
Accuracy   0.7538417757541264


In [29]:
#Crosvalidation
rf_acc = cv(RandomForestClassifier(n_estimators=403,n_jobs=-1, random_state=seed),data_model.iloc[:, 1:], data_model.iloc[:, 0])

Crossvalidating RandomForestClassifier...
Crosvalidation completed in 221.5852508544922s
Accuracy: [0.75290102 0.74931694 0.7670765  0.76434426 0.75546448 0.7636612
 0.76502732 0.76486671]
Average accuracy: 0.760332305813554


In [30]:
from sklearn.naive_bayes import BernoulliNB

X_train, X_test, y_train, y_test = train_test_split(bow.iloc[:, 1:], bow.iloc[:, 0],
                                                   train_size = 0.7, stratify=bow.iloc[:, 0],
                                                   random_state = seed)

precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, BernoulliNB())


From version 0.21, test_size will always complement train_size unless both are specified.




Testing BernoulliNB
Learning time 0.9169924259185791s
Predicting time 0.36939239501953125s
            Negative     Neutral    Positive
F1         [0.85962955 0.57930108 0.60245515]
Precision  [0.8458498  0.5731383  0.65773196]
Recall     [0.8738657  0.58559783 0.55574913]
Accuracy   0.7615253272623791


In [31]:
df = data.processed_Traindata
print(df.loc[df['topic_label'] == 2])

           tweet_id airline_sentiment         airline  \
5704   5.687940e+17           neutral       Southwest   
8341   5.684710e+17           neutral           Delta   
982    5.699760e+17          negative          United   
9058   5.702630e+17          negative      US Airways   
13171  5.699210e+17          negative        American   
1271   5.698590e+17          negative          United   
4467   5.702280e+17           neutral       Southwest   
3472   5.684130e+17          negative          United   
2881   5.688510e+17          negative          United   
13614  5.697990e+17          negative        American   
8461   5.682400e+17           neutral           Delta   
4380   5.702810e+17          negative       Southwest   
12608  5.700880e+17          negative        American   
10983  5.686390e+17          negative      US Airways   
8001   5.688900e+17           neutral           Delta   
13285  5.698960e+17          negative        American   
12978  5.699700e+17          ne

In [32]:
class Word2VecProvider(object):
    
    word2vec = None
    dimensions = 0
    
    def load(self, path_to_word2vec):
        self.word2vec = gensim.models.KeyedVectors.load_word2vec_format(path_to_word2vec, binary=False)
        self.word2vec.init_sims(replace=True)
        self.dimensions = self.word2vec.vector_size
    
    def get_vector(self, word):
        if word not in self.word2vec.vocab:
            return None

        return self.word2vec.syn0norm[self.word2vec.vocab[word].index]
    
    def get_similarity(self, word1, word2):
        if word1 not in self.word2vec.vocab or word2 not in self.word2vec.vocab:
            return None

        return self.word2vec.similarity(word1, word2)

In [33]:
word2vec = Word2VecProvider()

# REPLACE PATH TO THE FILE
word2vec.load("glove.twitter.27B.200d.txt")

In [34]:
class TwitterData(ExtraFeatures):
    
    def build_final_model(self, word2vec_provider, stopwords=nltk.corpus.stopwords.words("english")):
        whitelist = self.whitelist
        stopwords = list(filter(lambda sw: sw not in whitelist, stopwords))
        extra_columns = [col for col in self.processed_Traindata.columns if col.startswith("number_of")]
        similarity_columns = ["bad_similarity", "good_similarity", "information_similarity"]
        label_column = ["label"]
        columns = label_column + ["original_id"] + extra_columns + similarity_columns + list(
            map(lambda i: "word2vec_{0}".format(i), range(0, word2vec_provider.dimensions))) + list(
            map(lambda w: w + "_bow",self.wordlist))
        labels = []
        rows = []
        for idx in self.processed_Traindata.index:
            current_row = []
            current_label = self.processed_Traindata.loc[idx, "airline_sentiment"]
            labels.append(current_label)
            current_row.append(current_label)
            
            current_row.append(self.processed_Traindata.loc[idx, "tweet_id"])
            
            for _,col in enumerate(extra_columns):
                current_row.append(self.processed_Traindata.loc[idx, col])
            
            #average similarities with words
            tokens = self.processed_Traindata.loc[idx, "tokenizedText"]
            for main_word in map(lambda w: w.split("_")[0], similarity_columns):
                current_similarities = [abs(sim) for sim in
                                        map(lambda word: word2vec_provider.get_similarity(main_word, word.lower()), tokens) if
                                        sim is not None]
                if len(current_similarities) <= 1:
                    current_row.append(0 if len(current_similarities) == 0 else current_similarities[0])
                    continue
                max_sim = max(current_similarities)
                min_sim = min(current_similarities)
                current_similarities = [((sim - min_sim) / (max_sim - min_sim)) for sim in
                                        current_similarities]  # normalize to <0;1>
                current_row.append(np.array(current_similarities).mean())
            # add word2vec vector
            tokens = self.processed_Traindata.loc[idx, "tokenizedText"]
            current_word2vec = []
            for _, word in enumerate(tokens):
                vec = word2vec_provider.get_vector(word.lower())
                if vec is not None:
                    current_word2vec.append(vec)
            if len(current_word2vec) == 0:
                averaged_word2vec = [0] * 100
            elif len(current_word2vec) > 0:
                averaged_word2vec = list(np.array(current_word2vec).mean(axis=0))
            current_row += averaged_word2vec
            
            # add bag-of-words
            tokens = set(self.processed_Traindata.loc[idx, "text"])
            for _, word in enumerate(self.wordlist):
                current_row.append(1 if word in tokens else 0)

            rows.append(current_row)
        
        self.data_model = pd.DataFrame(rows, columns=columns).fillna(0)
        self.data_labels = pd.Series(labels)
        return self.data_model, self.data_labels     

In [35]:
td = TwitterData()
td.initialize("Tweets.csv")
td.build_features()
td.cleaningData(DataPreprocessing())
td.tokenize()
td.stem()
td.buildWordlist()
td.build_final_model(word2vec)

td.data_model.head(5)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Call to deprecated `syn0norm` (Attribute will be removed in 4.0.0, use self.wv.vectors_norm instead).


invalid value encountered in double_scalars



Unnamed: 0,label,original_id,number_of_uppercase,number_of_exclamation,number_of_question,number_of_ellipsis,number_of_hashtags,number_of_mentions,number_of_quotes,number_of_urls,...,gorgeou_bow,woohoo_bow,thousand_bow,understat_bow,furiou_bow,manual_bow,smell_bow,ber_bow,charleston_bow,nrt_bow
0,negative,5.69363e+17,0,0,0,0,2,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,negative,5.69498e+17,2,0,1,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,neutral,5.68109e+17,2,0,0,0,1,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,neutral,5.68283e+17,1,0,0,0,0,1,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,positive,5.6966e+17,0,1,1,0,0,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
data_model = td.data_model
data_model.drop("original_id",axis=1,inplace=True)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(data_model.iloc[:, 1:], data_model.iloc[:, 0],
                                                    train_size=0.7, stratify=data_model.iloc[:, 0],
                                                    random_state=seed)
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, RandomForestClassifier(n_estimators=403,n_jobs=-1, random_state=seed))
rf_acc = cv(RandomForestClassifier(n_estimators=403,n_jobs=-1,random_state=seed),data_model.iloc[:, 1:], data_model.iloc[:, 0])


From version 0.21, test_size will always complement train_size unless both are specified.




Testing RandomForestClassifier
Learning time 11.059098482131958s
Predicting time 0.36649298667907715s
            Negative     Neutral    Positive
F1         [0.83622201 0.41398866 0.54850299]
Precision  [0.73101805 0.70873786 0.85447761]
Recall     [0.97679709 0.29238985 0.40388007]
Accuracy   0.7384746727376209
Crossvalidating RandomForestClassifier...
Crosvalidation completed in 138.20311641693115s
Accuracy: [0.74283765 0.75699659 0.74931694 0.74385246 0.74180328 0.73205742
 0.74914559 0.734108  ]
Average accuracy: 0.743764740361236
