In [234]:
#importing libraries
import nltk
import pandas as pd
#from emoticons import EmoticonDetector
import re as regex
import collections
import numpy as np
import plotly
from plotly import graph_objs
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from time import time
import gensim
from nltk.corpus import words

#plotly configuration
plotly.offline.init_notebook_mode(connected=True)

In [235]:
class TwitterData_Initialize():
    data = []
    processed_Traindata = []
    processed_Testdata = []
    wordlist = []
    
    data_model = None
    data_labels = None
    
    def initialize(self, csv_file, from_cached=None):
        if from_cached is not None:
            self.data_model = pd.read_csv(from_cached)
            return
        
        self.data = pd.read_csv(csv_file, usecols=[0,1,5,10,15])
        train, test = train_test_split(self.data, test_size=0.99)
        self.processed_Traindata = train
        self.processed_Testdata = test
        self.wordlist = []
        self.data_model = None
        self.data_labels = None

In [236]:
data = TwitterData_Initialize()
data.initialize("../data/Tweets.csv")
data.processed_Traindata

Unnamed: 0,tweet_id,airline_sentiment,airline,text,topic_label
6420,5.678700e+17,negative,Southwest,@SouthwestAir According to TV passenger interv...,6
12569,5.701010e+17,negative,American,@AmericanAir your customer service is deplora...,2
97,5.699070e+17,negative,Virgin America,@VirginAmerica - Let 2 scanned in passengers l...,6
4439,5.702550e+17,positive,Southwest,@SouthwestAir strives to be 'Customer Centric'...,7
117,5.697780e+17,positive,Virgin America,@VirginAmerica and again! Another rep kicked b...,2
13988,5.696810e+17,neutral,American,@AmericanAir I'm on #1058 tmrw from CUN DFW. F...,1
207,5.692480e+17,neutral,Virgin America,@VirginAmerica can you please get me to the ne...,4
4122,5.678140e+17,negative,United,@united @gg8929 so why did you Cancelled Fligh...,2
8113,5.687790e+17,neutral,Delta,@JetBlue are there any food places open at the...,9
3367,5.684930e+17,negative,United,@united 777 from SFO to HNL with ZERO entertai...,6


In [237]:
#Data Preprocessing
nltk.download('words')
word_dictionary = list(set(words.words()))

for alphabet in "bcdefghjklmnopqrstuvwxyzBCDEFGHJKLMNOPQRSTUVWXYZ":
    word_dictionary.remove(alphabet)
class DataPreprocessing:
    def iterate(self):
        for preprocessingMethod in [self.replaceProcessedHashtags,
                                   self.removeUrls,
                                   self.removeUsernames,
                                   self.removeElongatedWords,
                                   self.removeNa,
                                   self.replaceSlangWords,
                                   self.removeSpecialChars,
                                   self.removeNumbers]:
            yield preprocessingMethod
    
    @staticmethod
    def removeByRegex(tweets, regExp):
        tweets.loc[:, "text"].replace(regExp, "", inplace=True)
        return tweets
    
    def removeUrls(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"http.?://[^\s]+[\s]?"))
    
    def removeNa(self, tweets):
        return tweets[tweets["text"] != ""]
    
    def removeSpecialChars(self, tweets):
        for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$"
                                                                    "@", "%", "^", "*", "(", ")", "{", "}",
                                                                    "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                    "!", "?", ".", "'",
                                                                    "--", "---","#"]):
            tweets.loc[:, "text"].replace(remove, "", inplace=True)
        return tweets
    
    def removeUsernames(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"@[^\s]+[\s]?"))
    
    def removeElongatedWords(self, tweets):
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"(.)\1+', r'\1\1"))
    
    def removeNumbers(self, tweets):
        #print(tweets)
        return DataPreprocessing.removeByRegex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))
    
    def replaceSlangWords(self, tweets):
        with open('../data/slang.txt') as file:
            slang_map = dict(map(str.strip, line.partition('\t')[::2])
            for line in file if line.strip())
            #print(tweets["text"])
            #print("-----------------------------------------END")
            for index,word in tweets['text'].iteritems():
                #print(index)
                for i in word.split():
                    isUpperCase = i.isupper()
                    i = i.lower()
                    if i in slang_map.keys():
                        word = word.replace(i, slang_map[i])
                        tweets.loc[(index),"text"] = word
                if isUpperCase:
                    i = i.upper()
        #print(tweets.loc[:,"text"])
        return tweets

    # print(split_tweets)
    @staticmethod
    def removeDigitsFromHashtag(tag):
        tag = regex.sub(r"\s?[0-9]+\.?[0-9]*", "", tag)
        return tag

    @staticmethod
    def collect_hashtags_in_tweet(wordList):
        hashtags = []
        for word in wordList:
            index = word.find('#')
            if index != -1:
                if word[index + 1:] != '':
                    hashtags.append(word[index + 1:])
        return hashtags

    @staticmethod
    def split_hashtag_to_words_all_possibilities(hashtag):
        all_possibilities = []

        split_posibility = [hashtag[:i] in word_dictionary for i in reversed(range(len(hashtag) + 1))]
        possible_split_positions = [i for i, x in enumerate(split_posibility) if x == True]

        for split_pos in possible_split_positions:
            split_words = []
            word_1, word_2 = hashtag[:len(hashtag) - split_pos], hashtag[len(hashtag) - split_pos:]

            if word_2 in word_dictionary:
                split_words.append(word_1)
                split_words.append(word_2)
                all_possibilities.append(split_words)

                another_round = DataPreprocessing.split_hashtag_to_words_all_possibilities(word_2)

                if len(another_round) > 0:
                    all_possibilities = all_possibilities + [[a1] + a2 for a1, a2, in
                                                             zip([word_1] * len(another_round), another_round)]
            else:
                another_round = DataPreprocessing.split_hashtag_to_words_all_possibilities(word_2)

                if len(another_round) > 0:
                    all_possibilities = all_possibilities + [[a1] + a2 for a1, a2, in
                                                             zip([word_1] * len(another_round), another_round)]

        return all_possibilities

    @staticmethod
    def process_all_hashtags_in_tweet(hashtags):
        all_words = []
        for tag in hashtags:
            split_hashtag = DataPreprocessing.split_hashtag_to_words_all_possibilities(DataPreprocessing.removeDigitsFromHashtag(tag))
            if split_hashtag:
                all_words = all_words + split_hashtag[0]
            else:
                all_words.append(tag)
        return all_words 
    
    def replaceProcessedHashtags(self, tweets):
            for index,word in tweets['text'].iteritems():
                word=word.split()
                collectHashtags=DataPreprocessing.collect_hashtags_in_tweet(word)
                allHashtags=DataPreprocessing.process_all_hashtags_in_tweet(collectHashtags)
                collectHashtags = ["#" + tag for tag in collectHashtags]
                if allHashtags:
                    word = list(set(word) - set(collectHashtags))
                    word = word + allHashtags
                word = " ".join(word)
                tweets.loc[(index),"text"] = word
                print(tweets.loc[(index),"text"])
           # print(tweets)
            return tweets
        


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\harin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [238]:
#Cleaning the Training Data
class CleanTrainingData(TwitterData_Initialize):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
        #self.processed_Testdata = previous.processed_Testdata
    
    def cleaningData(self, cleaner):
        train = self.processed_Traindata
        #test = self.processed_Testdata
        for cleanerMethod in cleaner.iterate():
            train = cleanerMethod(train)
            #test = cleanerMethod(test)
        self.processed_Traindata = train
        #self.processed_Testdata = test

In [239]:

data = CleanTrainingData(data)

data.cleaningData(DataPreprocessing())

#data.processed_Traindata.head(5)

@SouthwestAir According to TV passenger interviews, the landing was far from "uneventful" with heavy (panic) breaking to
@AmericanAir your customer service is deplorable. I am disgusted in your company and the ignorant people on the phones for lost baggage.
remove bag 2 the from scanned leave @VirginAmerica class plane bin? someone told their in 1st - passengers to than Let uncomfort able
@SouthwestAir they advertising, 'Customer etc. do journey, in customer - to be communications, Centric' strives everything ANAmarketers
@VirginAmerica and again! Another rep kicked butt! Naelah represents your team so beautifully!! Thank you!!!
morning. CUN rain Should on from dest'n I DFW. ORD. Final is in DFW freezing all Seeing tmrw @AmericanAir connection? I'm my change 1058
@VirginAmerica can you please get me to the new york area before monday afternoon
when Cancelled the people @united didn't rate? tickets you Flight thousands of like for @gg8929 so did why exchange doublestandards
@JetBlue are

fast @SouthwestAir didn't thank a we have expect response, to you! honest, Off be go! Chiberia
alright @JetBlue.... done! alternatively, if you'd like to charter a private jet for me to PITT i will gladly accept :)
@united actually we aren't. Still parked here.
@AmericanAir a guy try right...
@AmericanAir Thanks, have emailed them. How long should I expect for a response?
@USAirways fight was delayed 3 hrs in MCO, now I'm stuck in Philly with a standby ticket, flight 4009. Solution needed.
stewardess was the @USAirways what on a gate we there attendant friend going had and telling notour fault youd on tu sey our gat es right
@SouthwestAir thank u for not leaving me @me nice job running thru the airport to catch your connecting flight
@AmericanAir so you fail again flight to rdu sitting waiting on flight attendants. your logistics are not good
@AmericanAir i appreciate your apology. Sincerely. Thank you. That's really all I ever wanted to begin with.
@JetBlue I did not report the update

In [240]:
#Tokenizing and Stemming the data
class TokenizationStemming(CleanTrainingData):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
        #self.processed_TestData = previous.processed_TestData
    
    def stem(self, stemmer = nltk.PorterStemmer()):
        def stemJoin(row):
            row["text"] = list(map(lambda str: stemmer.stem(str.lower()), row["text"]))
            return row
    
        self.processed_Traindata = self.processed_Traindata.apply(stemJoin, axis=1)
    
    def tokenize(self, tokenizer = nltk.word_tokenize):
        def tokenizeRow(row):
            row["text"] = tokenizer(row["text"])
            row["tokenizedText"] = [] + row["text"]
            return row
        
        self.processed_Traindata = self.processed_Traindata.apply(tokenizeRow, axis=1)

In [242]:
nltk.download('punkt')
data = TokenizationStemming(data)
data.tokenize()
data.stem()
data.processed_Traindata.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Unnamed: 0,tweet_id,airline_sentiment,airline,text,topic_label,tokenizedText
6420,5.6787e+17,negative,Southwest,"[accord, to, tv, passeng, interview, the, land...",6,"[According, to, TV, passenger, interviews, the..."
12569,5.70101e+17,negative,American,"[your, custom, servic, is, deplor, i, am, disg...",2,"[your, customer, service, is, deplorable, I, a..."
97,5.69907e+17,negative,Virgin America,"[remov, bag, the, from, scan, leav, class, pla...",6,"[remove, bag, the, from, scanned, leave, class..."
4439,5.70255e+17,positive,Southwest,"[they, advertis, custom, etc, do, journey, in,...",7,"[they, advertising, Customer, etc, do, journey..."
117,5.69778e+17,positive,Virgin America,"[and, again, anoth, rep, kick, butt, naelah, r...",2,"[and, again, Another, rep, kicked, butt, Naela..."


In [243]:
#Building Wordlist
#Un-filtered version without removing stopwords
words = collections.Counter()
for idx in data.processed_Traindata.index:
    words.update(data.processed_Traindata.loc[idx, "text"])

words.most_common(5)

[('to', 80), ('flight', 56), ('i', 52), ('the', 51), ('a', 45)]

In [244]:
#Removing stopwords
stopwords = nltk.corpus.stopwords.words("english")
whitelist = ["n't", "not"]
for idx, stop_word in enumerate(stopwords):
    if stop_word not in whitelist:
        del words[stop_word]

words.most_common(5)

[('flight', 56), ('not', 26), ('thank', 21), ('get', 18), ('wa', 15)]

In [245]:
#Generating the final wordlist
class WordList(TokenizationStemming):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
    
    whitelist = ["n't", "not"]
    wordlist = []
    
    def buildWordlist(self, min_occurrences=3, max_occurences=3000, stopwords=nltk.corpus.stopwords.words("english"),
                     whitelist=None):
        self.wordlist = []
        whitelist = self.whitelist if whitelist is None else whitelist
        import os
        if os.path.isfile('../data/wordlist.csv'):
            word_df = pd.read_csv('../data/wordlist.csv', encoding = "ISO-8859-1")
            word_df = word_df[word_df["occurrences"] > min_occurrences]
            self.wordlist = list(word_df.loc[:, "word"])
            return
        words = collections.Counter()
        for idx in self.processed_Traindata.index:
            words.update(self.processed_Traindata.loc[idx, "text"])
        
        for idx, stop_word in enumerate(stopwords):
            if stop_word not in whitelist:
                del words[stop_word]
        
        word_df = pd.DataFrame(data={"word" : [k for k, v in words.most_common() if min_occurrences < v < max_occurences],
                                    "occurrences": [v for k, v in words.most_common() if min_occurrences < v < max_occurences]},
                              columns = ["word", "occurrences"])
        
        word_df.to_csv("../data/wordlist.csv", index_label="idx", encoding = "utf8")
        self.wordlist = [k for k, v in words.most_common() if min_occurrences < v < max_occurences]
    

In [246]:
data = WordList(data)
data.buildWordlist()

In [247]:
#Transforming into Bag-of-Words
class BagOfWords(WordList):
    def __init__(self, previous):
        self.processed_Traindata = previous.processed_Traindata
        self.wordlist = previous.wordlist
    
    def buildDataModel(self):
        labelColumn = ["label"]
        columns = labelColumn + list(
            map(lambda w: w + "_bow", self.wordlist))
        labels = []
        rows = []
        
        for idx in self.processed_Traindata.index:
            currentRow = []
            currentLabel = self.processed_Traindata.loc[idx, "airline_sentiment"]
            labels.append(currentLabel)
            currentRow.append(currentLabel)
            
            tokens = set(self.processed_Traindata.loc[idx, "text"])
            for _, word in enumerate(self.wordlist):
                currentRow.append(1 if word in tokens else 0)
            
            rows.append(currentRow)
        
        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        
        return self.data_model, self.data_labels

In [248]:
data = BagOfWords(data)
bow, labels = data.buildDataModel()
bow.head(5)

Unnamed: 0,label,hour_bow,cancel_bow,help_bow,servic_bow,delay_bow,time_bow,custom_bow,bag_bow,call_bow,...,gorgeou_bow,woohoo_bow,thousand_bow,understat_bow,furiou_bow,manual_bow,smell_bow,ber_bow,charleston_bow,nrt_bow
0,negative,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,negative,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,negative,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,positive,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,positive,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [249]:
import random
seed = 666
random.seed(seed)

In [250]:
#Utility function to train the classifier and show F1, Precision, recall and Accuracy values

def test_classifier(X_train, y_train, X_test, y_test, classifier):
    log("")
    log("==================================================")
    classifier_name = str(type(classifier).__name__)
    log("Testing " + classifier_name)
    now = time()
    list_of_labels = sorted(list(set(y_train)))
    model = classifier.fit(X_train, y_train)
    log("Learning time {0}s".format(time() - now))
    now = time()
    predictions = model.predict(X_test)
    log("Predicting time {0}s".format(time() - now))
    
    precision = precision_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    recall = recall_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    log("==================Results=======================")
    log("            Negative     Neutral    Positive")
    log("F1         " + str(f1))
    log("Precision  " + str(precision))
    log("Recall     " + str(recall))
    log("Accuracy   " + str(accuracy))
    log("================================================")
    
    return precision, recall, accuracy, f1

def log(x):
    print(x)
    

In [251]:
#Classifier : BagOfWords + NaiveBayes
from sklearn.naive_bayes import BernoulliNB

X_train, X_test, y_train, y_test = train_test_split(bow.iloc[:, 1:], bow.iloc[:, 0],
                                                   train_size = 0.7, stratify=bow.iloc[:, 0],
                                                   random_state = seed)

precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, BernoulliNB())


Testing BernoulliNB
Learning time 0.03931450843811035s
Predicting time 0.0050013065338134766s
            Negative     Neutral    Positive
F1         [0.76056338 0.         0.        ]
Precision  [0.61363636 0.         0.        ]
Recall     [1. 0. 0.]
Accuracy   0.6136363636363636



From version 0.21, test_size will always complement train_size unless both are specified.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



In [252]:
#NaiveBayes with 8 fold cross-validation

def cv(classifier, X_train, y_train):
    log("===============================================")
    classifier_name = str(type(classifier).__name__)
    now = time()
    log("Crossvalidating " + classifier_name + "...")
    accuracy = [cross_val_score(classifier, X_train, y_train, cv=8, n_jobs=-1)]
    log("Crosvalidation completed in {0}s".format(time() - now))
    log("Accuracy: " + str(accuracy[0]))
    log("Average accuracy: " + str(np.array(accuracy[0]).mean()))
    log("===============================================")
    return accuracy

In [253]:
nb_acc = cv(BernoulliNB(), bow.iloc[:,1:], bow.iloc[:,0])

Crossvalidating BernoulliNB...
Crosvalidation completed in 10.581453561782837s
Accuracy: [0.63157895 0.63157895 0.61111111 0.61111111 0.61111111 0.61111111
 0.61111111 0.61111111]
Average accuracy: 0.6162280701754386


In [254]:
# Addtion of extra features:

# Number of Uppercase - tend to express postive/negative emotions by using uppercase words
# Number of !         - exclamation marks are likely to increase strength of opinion
# Number of ?         - might distinguish neutral tweets - seeking information
# Number of positive  - positive emoji will most likely occur in positive tweets
# emoticons
# Number of negative  - Inverse to the one above
# emoticons
# Number of ...       - commonly used in commenting something
# Number of quotations- same as above
# Number of mentions  - Lots of mentions on positive tweets, to share something good/bad
# Number of urls      - similar to number of mentions

In [255]:
#Detecting Emoticons
class EmoticonDetector:
    emoticons = {}
    
    def __init__(self, emoticon_file="../data/emoticons.txt"):
        from pathlib import Path
        content = Path(emoticon_file).read_text()
        positive = True
        for line in content.split("\n"):
            if "positive" in line.lower():
                positive = True
                continue
            elif "negative" in line.lower():
                positive = False
                continue
            
            self.emoticons[line] = positive
    
    def is_positive(self, emoticon):
        if emoticon in self.emoticons:
            return self.emoticons[emoticon]
        return False
    
    def is_emoticon(self, to_check):
        return to_check in self.emoticons

In [259]:
class ExtraFeatures(WordList):
    def __init__(self):
        pass
    
    def build_data_model(self):
        extra_columns = [col for col in self.processed_Traindata.columns if col.startswith("number_of")]
        label_column = ["label"]
        columns = label_column + extra_columns + list(
                map(lambda w: w + "_bow", self.wordlist))
        
        labels = []
        rows = []
        for idx in self.processed_Traindata.index:
            current_row = []
            current_label = self.processed_Traindata.loc[idx, "airline_sentiment"]
            labels.append(current_label)
            current_row.append(current_label)
        
            for _,col in enumerate(extra_columns):
                current_row.append(self.processed_Traindata.loc[idx, col])
        
        #adding bag-of-words
            tokens = set(self.processed_Traindata.loc[idx, "text"])
            for _,word in enumerate(self.wordlist):
                current_row.append(1 if word in tokens else 0)
        
            rows.append(current_row)
        
        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        
        return self.data_model, self.data_labels
    
    def add_column(self, column_name, column_content):
        self.processed_Traindata.loc[:, column_name] = pd.Series(column_content, index=self.processed_Traindata.index)

    def build_features(self):
        def count_by_lambda(expression, word_array):
            return len(list(filter(expression, word_array)))
        
        def count_occurences(character, word_array):
            counter = 0
            for j, word in enumerate(word_array):
                for char in word:
                    if char == character:
                        counter += 1
            return counter
        
        def count_interjections(wordArray):
            interjections = []
            interjectionCount = 0
            with open('../data/interjections.txt') as file:
                interjections = file.read().splitlines()
            for word in wordArray:
                if word in interjections:
                    interjectionCount += 1
            return interjectionCount 

        def count_by_regex(regex, plain_text):
            return len(regex.findall(plain_text))
        
        self.add_column("splitted_text", map(lambda txt: txt.split(" "), self.processed_Traindata["text"]))
        
        #Number of uppercase words
        uppercase = list(map(lambda txt: count_by_lambda(lambda word: word == word.upper(), txt),
                                                        self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_uppercase", uppercase)
        
        #number of !
        exclamations = list(map(lambda txt: count_occurences("!", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_exclamation", exclamations)
        
        #number of ?
        questions = list(map(lambda txt: count_occurences("?", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_question", questions)
        
        #number of ...
        ellipsis = list(map(lambda txt: count_by_regex(regex.compile(r"\.\s?\.\s?\."), txt),
                           self.processed_Traindata["text"]))
        self.add_column("number_of_ellipsis", ellipsis)
        
        #number of hashtags
        hashtags = list(map(lambda txt: count_occurences("#", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_hashtags", hashtags)
        
        #number of mentions
        mentions = list(map(lambda txt: count_occurences("@", txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_mentions", mentions)
        
        #number of quotes
        quotes = list(map(lambda plain_text: int(count_occurences("'", [plain_text.strip("'").strip('"')]) / 2 +
                                                 count_occurences('"', [plain_text.strip("'").strip('"')]) / 2),
                          self.processed_Traindata["text"]))
        self.add_column("number_of_quotes", quotes)
        
        #number of urls
        urls = list(map(lambda txt: count_by_regex(regex.compile(r"http.?://[^\s]+[\s]?"), txt),
                             self.processed_Traindata["text"]))
        self.add_column("number_of_urls", urls)
        
        #number of positive emoticons
        ed = EmoticonDetector()
        positive_emo = list(
            map(lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and ed.is_positive(word), txt), 
                   self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_positive_emo", positive_emo)
        
        #number of negative emoticons
        negative_emo = list(
            map(lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and not ed.is_positive(word), txt), 
                   self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_negative_emo", negative_emo)
        
        #number of interjections
        interjections = list(map(lambda txt: count_interjections(txt),
                               self.processed_Traindata["splitted_text"]))
        self.add_column("number_of_interjections", interjections)
        

In [260]:
data = ExtraFeatures()
data.initialize("../data/Tweets.csv")
data.build_features()
data.cleaningData(DataPreprocessing())
data.tokenize()
data.stem()
data.buildWordlist()
data_model, labels = data.build_data_model()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Vegas for http://t.co/G9b6e0a2sZ @SouthwestAir the to concert So @Imaginedragons kid Thanks sending my awesome! DestinationDragons
@AmericanAir just messaged you. Please have someone contact us immediately.
Thanks @united for writing back. To assist you can return the bag you lost &amp; clean up the feces sprinkled in your bathroom. Too much to ask?
@SouthwestAir neveind, it's been found and on its way. Thanks for making the process so painless
@JetBlue OK, thank you.
@AmericanAir followed. I tried to @USAirways record locator number, gave me an error code
@AmericanAir Am on web site requesting refund for Cancelled Flightled flight. It requires numerical document number. Where get it?
now http://t.co/BuwjTVUWKM 2 Can't @USAirways hrs what?? anymore. for hrs. nothing. wait Now over frustrated USAirways
@united Pls post video of belligerent jerk ranting at SFO (1230) that's he's going to sue you for making him check his 3rd bag. He's a hoot!
@AmericanAir I spent $600 on my flight. Could 

but was Flighted Pilot @USAirways shitty. didn't up hours service UR we another pilot is showed because never to so come supposed waited Cancelled
@AmericanAir @USAirways "but I take meds that make me severely dehydrated" {sigh}
@united Okay, thank you for your help :)
@VirginAmerica I tried that. You offered to charge me an additional $1k for a new ticket or be stranded until Thurs. 1st time, last time.
@USAirways Cant help but be frustrated after an hour call with u ends up with a disconnection and no answers especially as div pref member.
@southwestair How can I refer a friend for the Southwest Credit card for points?
now when hold service? on some @SouthwestAir, we expect for hour been - customer an over can di sap pointed
flyers delays Cancelled @SouthwestAir the your credit a sort some get Flightlations of for all and hope shittydeal notimpressed
@united Thank you for the cheese platter and abundance of entertainment options. Time just flew by.
@united My favorite way to travel! 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [261]:
#Extended Features + Random Forest
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(data_model.iloc[:, 1:], data_model.iloc[:, 0],
                                                    train_size=0.7, stratify=data_model.iloc[:, 0],
                                                    random_state=seed)
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, RandomForestClassifier(random_state=seed,n_estimators=403,n_jobs=-1))


From version 0.21, test_size will always complement train_size unless both are specified.




Testing RandomForestClassifier
Learning time 1.0259556770324707s
Predicting time 0.2662620544433594s
            Negative     Neutral    Positive
F1         [0.6779661  0.125      0.30769231]
Precision  [0.58823529 0.16666667 0.5       ]
Recall     [0.8        0.1        0.22222222]
Accuracy   0.5227272727272727


In [262]:
#Crosvalidation
rf_acc = cv(RandomForestClassifier(n_estimators=403,n_jobs=-1, random_state=seed),data_model.iloc[:, 1:], data_model.iloc[:, 0])

Crossvalidating RandomForestClassifier...
Crosvalidation completed in 12.786679029464722s
Accuracy: [0.55       0.7        0.57894737 0.5        0.72222222 0.64705882
 0.82352941 0.64705882]
Average accuracy: 0.6461020811833506


In [263]:
from sklearn.naive_bayes import BernoulliNB

X_train, X_test, y_train, y_test = train_test_split(bow.iloc[:, 1:], bow.iloc[:, 0],
                                                   train_size = 0.7, stratify=bow.iloc[:, 0],
                                                   random_state = seed)

precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, BernoulliNB())


Testing BernoulliNB
Learning time 0.009244918823242188s
Predicting time 0.005512237548828125s
            Negative     Neutral    Positive
F1         [0.76056338 0.         0.        ]
Precision  [0.61363636 0.         0.        ]
Recall     [1. 0. 0.]
Accuracy   0.6136363636363636



From version 0.21, test_size will always complement train_size unless both are specified.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.

