# Sentiment analysis using nltk nlp library

We used a Bernoulli Naive Bayes classifier on a bag of words representation of the tweets, and including bigrams. As preprocessing, we excluded all stopwords and punctuation, but included token words to signal whether the tweet had characteristics such as questions or interrogations.

In [1]:
#imports
import pandas as pd
import numpy as np
from sklearn import model_selection
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk import corpus
from nltk import word_tokenize
import re

In [2]:
stopwords = set(corpus.stopwords.words('english'))
#stopwords

In [3]:
def bigrams(text):
    bigrams = []
    
    #use airline separately
    #bigrams.append(text.split(' ',1)[0])
    #text = text.split(' ',1)[1]
    
    #if re.search('\!', text)!=None: bigrams.append("hasExclamation")
    #if re.search('\?', text)!=None: bigrams.append("hasQuestion")
    #if re.search('\!{2,}', text)!=None: bigrams.append("hasMultipleExclamation")
    #if re.search('\?{2,}', text)!=None: bigrams.append("hasMultipleQuestion")
    text = re.sub('\!+', " exclamationCode ", text) #make ! a word
    text = re.sub('\?+', " questionCode ", text) #make ? a word
    text = re.sub('\.+', " dotCode ", text) #make . a word
    #text = re.sub('\!', "", text) #remove !
    #text = re.sub('\?', "", text) #remove ?
    #text = re.sub(r'@\w*\b', "", text) #remove mentions
    #text = re.sub(r'@\w*\b', "@someone ", text) #make mentions anonymous
    #text = re.sub(r'#\w*\b', "hasHashtag ", text) #unify hashtags
    text = re.sub(',', " ", text) #remove ,

    i=-1
    
    #words = text.lower().split()
    
    #words = [i for i in word_tokenize(text.lower()) if i not in stopwords] #remove stopwords
    
    STOP_TYPES = ['CC','AT']
    tokens = nltk.pos_tag(nltk.word_tokenize(text.lower()))
    words = [w for w, wtype in tokens if wtype not in STOP_TYPES]

    for i in range(len(words) - 1):
        bigrams.append(words[i])
        bigrams.append(words[i] + " " + words[i+1])
    bigrams.append(words[i+1])
    
    return bigrams

In [4]:
import nltk

#Format Sentence Function
def format_sentence(sentence):
    return ({word: True for word in bigrams(sentence)})

In [5]:
#Read data
data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

labeled_tweets = data[['text', 'airline_sentiment']]
labeled_tweets['text'] = data.apply(lambda row: format_sentence(row['text']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [6]:
data_as_list = [[row['text'], row['airline_sentiment']] for index, row in labeled_tweets.iterrows()]

In [7]:
#Training and evaluating for several seeds

random_seeds=[4,8,15,16,23,42]
accuracies = []
for seed in random_seeds:
    rnd = np.random.RandomState(seed)
    indices = rnd.permutation(len(data_as_list))
    
    training_idx, test_idx = indices[:int(0.8 *len(indices))], indices[int(0.8 *len(indices)):]
    training = [data_as_list[i] for i in training_idx]
    test = [data_as_list[i] for i in test_idx]
    
    classifier = NaiveBayesClassifier.train(training)
    accuracies.append(accuracy(classifier, test))
    
print("Average accuracy:", sum(accuracies)/len(accuracies))

print(classifier.show_most_informative_features(100))

Average accuracy: 0.8000379434642383
Most Informative Features
       exclamationcode : = True           positi : negati =     42.0 : 1.0
     you exclamationcode = True           positi : negati =     38.2 : 1.0
       americanair thank = True           positi : negati =     36.5 : 1.0
      southwestair thank = True           positi : negati =     34.9 : 1.0
                     - ) = True           positi : negati =     33.8 : 1.0
    best exclamationcode = True           positi : negati =     33.8 : 1.0
                 excited = True           positi : negati =     33.8 : 1.0
           jetblue thank = True           positi : negati =     30.7 : 1.0
                 on hold = True           negati : neutra =     30.3 : 1.0
                 worries = True           positi : negati =     28.4 : 1.0
            the response = True           positi : negati =     28.4 : 1.0
              no worries = True           positi : negati =     28.4 : 1.0
               beautiful = True      

In [None]:
#Training for final upload
classifier = NaiveBayesClassifier.train(data_as_list)

In [None]:
#Writing to upload file. Change file name at will (next is 4th upload)
f = open('./Data/upload4.csv', 'w')
f.write('Id,Prediction\n')
for index, row in test_data.iterrows():
    f.write(str(row['Id']) + "," + classifier.classify(format_sentence(row['text'])) + '\n')
f.close()
            