In [1]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
from nltk import word_tokenize
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

In [2]:
# a class that is created to get the voted classifier.
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
        

In [3]:
import codecs
# import the IMDB reviews that are labeled
pos_lines_imdb = codecs.open("short_reviews/positive.txt","r",encoding="latin2").read()
neg_lines_imdb = codecs.open("short_reviews/negative.txt","r", encoding="latin2").read()

# import the own labeled tweets
pos_lines_own = [line.rstrip('\n') for line in open('Tagged tweets/pos_tagged.txt', 'r', encoding='ISO-8859-1')]
neg_lines_own = [line.rstrip('\n') for line in open('Tagged tweets/neg_tagged.txt', 'r', encoding='ISO-8859-1')]

In [4]:
all_words_imdb = []
documents_imdb = []

all_words_own = []
documents_own = []

In [5]:
# to select only adjects = J, adverbs = R and verb = V we delted all the other words in the text.
# Add the sentiment to the lines and append to documents_imdb and documents_own.
# only select the words that are in selected word types.
# append these to all_words_imdb and all_words_own

allowed_word_types = ["J"]

for p in pos_lines_imdb.split("\n"):
    documents_imdb.append((p,"pos"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words_imdb.append(w[0].lower())

for p in neg_lines_imdb.split("\n"):
    documents_imdb.append((p,"neg"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words_imdb.append(w[0].lower())
            
for p in pos_lines_own:
    documents_own.append((p,"pos"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words_own.append(w[0].lower())

for p in neg_lines_own:
    documents_own.append((p,"neg"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words_own.append(w[0].lower())

In [6]:
# see the most common words in the imdb labeled data
all_words_imdb = nltk.FreqDist(all_words_imdb)
print(all_words_imdb.most_common(100))
print(all_words_imdb["bitcoin"])

[('good', 369), ('more', 331), ('little', 265), ('funny', 245), ('much', 234), ('bad', 234), ('best', 208), ('new', 206), ('own', 185), ('many', 183), ('most', 167), ('other', 167), ('great', 160), ('big', 156), ('few', 139), ('first', 133), ('real', 132), ('i', 122), ('better', 116), ('full', 116), ('such', 114), ('romantic', 112), ('american', 110), ('old', 106), ('same', 103), ('original', 100), ('human', 100), ('hard', 98), ('[', 97), ('interesting', 97), ('young', 93), ('enough', 92), ('emotional', 89), ('least', 87), ('long', 83), ('last', 82), ('cinematic', 82), ('true', 75), ('entertaining', 75), ('high', 71), ('special', 70), ('predictable', 70), (']', 69), ('visual', 69), ('familiar', 63), ('whole', 63), ('comic', 63), ('enjoyable', 62), ('sweet', 60), ('narrative', 60), ('less', 60), ('short', 59), ('worst', 59), ('strong', 58), ('only', 58), ('fascinating', 53), ('obvious', 52), ('solid', 52), ('powerful', 51), ('modern', 51), ('french', 51), ('rare', 50), ('fresh', 50), ('

In [7]:
# see the most common words in the own labeled data
all_words_own = nltk.FreqDist(all_words_own)
print(all_words_own.most_common(100))
print(all_words_own["bitcoin"])

[('bitcoin', 117), ('ly', 95), ('ã¢', 50), ('new', 47), ('crypto', 46), ('top', 32), ('btc', 28), ('com', 22), ('last', 19), ('usd', 17), ('bulish', 17), ('gl', 17), ('next', 17), ('algory', 17), ('financial', 16), ('coin', 16), ('wil', 15), ('u', 15), ('stelar', 14), ('low', 14), ('major', 14), ('daily', 13), ('first', 13), ('digital', 13), ('short', 12), ('riple', 12), ('uk', 12), ('ap', 12), ('dolar', 11), ('great', 11), ('eth', 10), ('etf', 10), ('update', 10), ('right', 10), ('technical', 10), ('best', 9), ('fre', 9), ('strong', 9), ('global', 9), ('much', 9), ('least', 9), ('sel', 8), ('al', 8), ('january', 8), ('ready', 8), ('future', 8), ('stret', 8), ('net', 8), ('god', 8), ('moderate', 8), ('ltc', 7), ('smart', 7), ('real', 7), ('long', 7), ('regulatory', 7), ('n', 7), ('ful', 7), ('sv', 7), ('posible', 7), ('pic', 7), ('average', 7), ('se', 7), ('invest', 6), ('indian', 6), ('togle', 6), ('bad', 6), ('due', 6), ('ico', 6), ('important', 6), ('win', 6), ('dlvr', 6), ('total',

In [16]:
#commonly used words to train against, use the most common words to train on.
word_features_imdb = list(all_words_imdb.keys())[:5000]

word_features_own = list(all_words_own.keys())[:5000]

In [19]:
# define two functions to get the features
def find_features_imdb(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features_imdb:
        features[w] = (w in words)
    
    return features

def find_features_own(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features_own:
        features[w] = (w in words)
    
    return features

In [21]:
# get the featuresets of imdb and own
featuresets_imdb = [(find_features_imdb(rev), category) for (rev, category) in documents_imdb]

featuresets_own = [(find_features_own(rev), category) for (rev, category) in documents_own]

In [54]:
# random shuffle these feature sets so we can randomly select a training set and test set
random.shuffle(featuresets_imdb)
random.shuffle(featuresets_own)

In [55]:
# get the training and test sets
training_set_imdb = featuresets_imdb[:10000]
testing_set_imdb = featuresets_imdb[10000:]

training_set_own = featuresets_own[:600]
testing_set_own = featuresets_own[600:]

## Train the classifiers

In [56]:
# likelyhood = prior occurences x likihood / evidence

classifier = nltk.NaiveBayesClassifier.train(training_set_own)

print("Original Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set_own))*100)
classifier.show_most_informative_features(30)

Original Naive Bayes Algo accuracy: 57.826086956521735
Most Informative Features
                    bear = True              neg : pos    =      6.7 : 1.0
                    join = True              pos : neg    =      6.6 : 1.0
                    scam = True              neg : pos    =      4.8 : 1.0
                     xlm = True              pos : neg    =      4.5 : 1.0
                   alert = True              pos : neg    =      4.5 : 1.0
                   short = True              neg : pos    =      4.1 : 1.0
                     buf = True              neg : pos    =      4.1 : 1.0
                      th = True              pos : neg    =      3.8 : 1.0
                    best = True              pos : neg    =      3.8 : 1.0
                    info = True              pos : neg    =      3.8 : 1.0
                   first = True              pos : neg    =      3.7 : 1.0
                    know = True              neg : pos    =      3.6 : 1.0
                   

In [70]:
save_classifier = open("pickled_algorithms/own_labeled/originalnaivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [57]:
# Mulitnominal classifier
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set_own)
print("MNB_classifier accuracy:", (nltk.classify.accuracy(MNB_classifier, testing_set_own))*100)
#MNB_classifier.show_most_informative_features(15)

MNB_classifier accuracy: 58.26086956521739


In [71]:
save_MNB_classifier = open("pickled_algorithms/own_labeled/MNB.pickle","wb")
pickle.dump(MNB_classifier, save_MNB_classifier)
save_MNB_classifier.close()

In [58]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set_own)
print("BernoulliNB_classifier accuracy:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set_own))*100)

BernoulliNB_classifier accuracy: 56.52173913043478


In [72]:
save_BernoulliNB_classifier = open("pickled_algorithms/own_labeled/BernoulliNB.pickle","wb")
pickle.dump(BernoulliNB_classifier, save_BernoulliNB_classifier)
save_BernoulliNB_classifier.close()

In [59]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set_own)
print("LogisticRegression_classifier accuracy:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set_own))*100)

LogisticRegression_classifier accuracy: 58.69565217391305




In [73]:
save_LogisticRegression_classifier = open("pickled_algorithms/own_labeled/LogisticRegression.pickle","wb")
pickle.dump(LogisticRegression_classifier, save_LogisticRegression_classifier)
save_LogisticRegression_classifier.close()

In [60]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set_own)
print("SGDClassifier_classifier accuracy:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set_own))*100)



SGDClassifier_classifier accuracy: 54.78260869565217


In [74]:
save_SGDClassifier_classifier = open("pickled_algorithms/own_labeled/SGDClassifier.pickle","wb")
pickle.dump(SGDClassifier_classifier, save_SGDClassifier_classifier)
save_SGDClassifier_classifier.close()

In [61]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set_own)
print("SVC_classifier accuracy:", (nltk.classify.accuracy(SVC_classifier, testing_set_own))*100)



SVC_classifier accuracy: 46.95652173913044


In [75]:
save_SVC_classifier = open("pickled_algorithms/own_labeled/SVC.pickle","wb")
pickle.dump(SVC_classifier, save_SVC_classifier)
save_SVC_classifier.close()

In [62]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set_own)
print("LinearSVC_classifier accuracy:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set_own))*100)

LinearSVC_classifier accuracy: 60.43478260869565


In [76]:
save_LinearSVC_classifier = open("pickled_algorithms/own_labeled/LinearSVC.pickle","wb")
pickle.dump(LinearSVC_classifier, save_LinearSVC_classifier)
save_LinearSVC_classifier.close()

In [63]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set_own)
print("NuSVC_classifier accuracy:", (nltk.classify.accuracy(NuSVC_classifier, testing_set_own))*100)

NuSVC_classifier accuracy: 63.47826086956522


In [77]:
save_NuSVC_classifier = open("pickled_algorithms/own_labeled/NuSVC.pickle","wb")
pickle.dump(NuSVC_classifier, save_NuSVC_classifier)
save_NuSVC_classifier.close()

## Vote classifier

In [64]:
voted_classifier = VoteClassifier(classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LinearSVC_classifier,
                                  NuSVC_classifier,
                                  SGDClassifier_classifier,
                                  LogisticRegression_classifier)

In [65]:
print("voted_classifier accuracy:", (nltk.classify.accuracy(voted_classifier,testing_set_own))*100)

voted_classifier accuracy: 60.0
