In [1]:
# https://www.kaggle.com/ngyptr/python-nltk-sentiment-analysis/comments
#load dependencies
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

In [2]:
data = pd.read_csv('../Data/Sentiment.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

In [3]:
# Splitting the dataset into train and test set (10% of the original dataset)
train, test = train_test_split(data,test_size = 0.1)
# Removing neutral sentiments; current focus positive and negative tweets
train = train[train.sentiment != "Neutral"]

In [4]:
train_pos = train[train['sentiment'] == 'Positive'] #selects positive rows
train_pos = train_pos['text'] #select the text column from the positive rows
train_neg = train[ train['sentiment'] == 'Negative'] #selects negative rows
train_neg = train_neg['text'] #select the text column from the negative rows

In [5]:
#clean the data
tweets = []
stopwords_set = set(stopwords.words("english"))

for index, row in train.iterrows():
    words_filtered = [e.lower() for e in row.text.split() if len(e) >= 3]
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')
        and word != 'RT']
    words_without_stopwords = [word for word in words_cleaned if not word in stopwords_set]
    tweets.append((words_without_stopwords, row.sentiment))

#example data 
# (['replace', 'names', 'debaters', 'said', 'enough', 'least', 'tracks', 'straight'], 'Negative')
# (['gop', 'debate', 'took', 'awful', 'toll', 'ted', 'cruz.'], 'Negative')

In [6]:
test_pos = test[ test['sentiment'] == 'Positive']
test_pos = test_pos['text']
test_neg = test[ test['sentiment'] == 'Negative']
test_neg = test_neg['text']

In [14]:
# Extracting word features
# get all the words from the tweets

def get_words_in_tweets(tweets):
    all = []
    for (words, sentiment) in tweets:
        all.extend(words)
    return all

# words_tweets = get_words_in_tweets(tweets)
# print(words_tweets)

In [13]:
def get_word_features(wordlist):
    #https://onlinecoursetutorials.com/nlp/nltk-freqdist-function-with-example/
    wordlist = nltk.FreqDist(wordlist) #gets unique (keys, and their frequency)
    features = wordlist.keys()
    return features

#w_features are the unique words in the tweet
w_features = get_word_features(get_words_in_tweets(tweets))

In [16]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in w_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [17]:
# Training the Naive Bayes classifier
# https://www.nltk.org/api/nltk.classify.html
# apply_features: this function is to avoid the memory overhead involved 
# in storing all the featuresets for every token in a corpus.
training_set = nltk.classify.apply_features(extract_features,tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [19]:
neg_cnt = 0
pos_cnt = 0
for obj in test_neg: 
    res =  classifier.classify(extract_features(obj.split()))
    if(res == 'Negative'): 
        neg_cnt = neg_cnt + 1
for obj in test_pos: 
    res =  classifier.classify(extract_features(obj.split()))
    if(res == 'Positive'): 
        pos_cnt = pos_cnt + 1
        
print('[Negative]: %s/%s '  % (len(test_neg),neg_cnt))        
print('[Positive]: %s/%s '  % (len(test_pos),pos_cnt))    

[Negative]: 821/775 
[Positive]: 234/75 
