In [6]:
import pandas
import nltk
import random
#nltk.download('punkt') # this line to download utilities, do it for the first time only

path = '/content/Financial_News_Dataset.csv'
df = pandas.read_csv(path, encoding="ISO-8859-1", header=None)
documents = []
all_tok_words = []
dic_cnt = {}
dic_cnt['neutral']=dic_cnt['negative']=dic_cnt['positive']=0
for index, row in df.iterrows():
  category, description = row[0], row[1]
  dic_cnt[category] +=1
  if dic_cnt[category] < 601:#limiting the dataset to make it balanced
    tok_word = nltk.tokenize.word_tokenize(description)
    documents.append((tok_word, category))
    for word in tok_word:
      all_tok_words.append(word)

In [19]:
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in all_tok_words)
word_features = list(all_words)[:10000] # with more features, higher accuracy
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['{}'.format(word)] = (word in document_words)
    return features

In [20]:
# Train Naive Bayes classifier
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set_f, test_set_f = featuresets[200:], featuresets[:200]
classifier = nltk.NaiveBayesClassifier.train(train_set_f)

In [21]:
# Test the classifier
print('The accuracy is ',nltk.classify.accuracy(classifier, test_set_f)*100)

The accuracy is  78.5


In [22]:
classifier.show_most_informative_features(5)

Most Informative Features
                      mn = True           negati : neutra =     68.7 : 1.0
                    rose = True           positi : neutra =     54.9 : 1.0
                    down = True           negati : neutra =     49.1 : 1.0
               decreased = True           negati : neutra =     34.8 : 1.0
                    fell = True           negati : neutra =     26.1 : 1.0


In [23]:
#testing our model out of train data 
des_neg = 'the company fell down though they tried hard' #negative
des_tok_neg = nltk.tokenize.word_tokenize(des_neg)
des_pos = 'The company grew up though the situation was not good' #positive
des_tok_pos = nltk.tokenize.word_tokenize(des_pos)
print('the sentence given is negative and predicted as ',classifier.classify(document_features(des_tok_neg)))
print('the sentence given is positive and predicted as ',classifier.classify(document_features(des_tok_pos)))


the sentence given is negative and predicted as  negative
the sentence given is positive and predicted as  positive


In [24]:
from collections import defaultdict
labels_f = []
tests_f = []
for i, (feats, label) in enumerate(test_set_f):
    observed = classifier.classify(feats)
    labels_f.append(label)
    tests_f.append(observed)

In [25]:
print(nltk.ConfusionMatrix(labels_f, tests_f))

         |  n     p |
         |  e  n  o |
         |  g  e  s |
         |  a  u  i |
         |  t  t  t |
         |  i  r  i |
         |  v  a  v |
         |  e  l  e |
---------+----------+
negative |<53> 7  8 |
 neutral |  3<54> 3 |
positive |  5 17<50>|
---------+----------+
(row = reference; col = test)



In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

print('precision is ',np.mean(list(precision_score(labels_f, tests_f, average=None, labels=["positive", "negative", "neutral"]))))
print('recall is ',np.mean(list(recall_score(labels_f, tests_f, average=None, labels=["positive", "negative", "neutral"]))))
print('f1_score is ',np.mean(list(f1_score(labels_f, tests_f, average=None, labels=["positive", "negative", "neutral"]))))

precision is  0.8195603163159039
recall is  0.8177676226852642
f1_score is  0.8184520297506358
