In [137]:
import re
import pandas as pd
from nltk.corpus import stopwords

import copy
import numpy as np

import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
#from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [138]:
news = pd.read_excel("C:\\Users\\Kratika\\Downloads\\News333.xlsx")
news.head()

Unnamed: 0,Title,Body,Category,MegaCategory
0,BS-VI transition may lead to dumping of old st...,BS-VI transition may lead to dumping of old st...,Annual Report - Comments made in the Annual Re...,Financial
1,Annual Report 2016-2017 of Bajaj Finserv Limited,It is a broadly described annual report of Baj...,Annual Report - Comments made in the Annual Re...,Financial
2,Annual Report 2017-2018 of Eicher Motors Limit...,It is a broadly described annual report of\nEi...,Annual Report - Comments made in the Annual Re...,Financial
3,Hindalco Industries : Chairman Kumar Mangalam ...,(You can enter multiple email addresses separa...,Annual Report - Comments made in the Annual Re...,Financial
4,"WIPRO continues it's uptrend, although on a we...","WIPRO continues it's uptrend, although on a we...",Annual Report - Comments made in the Annual Re...,Financial


In [139]:
from nltk.tokenize import word_tokenize 

def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    stop_words.union(['january','february','march','april','may','june','july','august','september','october','november','december'])
    stop_words.union(['jan','feb','mar','apr','may','jun','jul','aug','sept','oct','nov','dec'])
    stop_words.union(['monday', 'tuesday', 'wednesday', 'thursday','friday','saturday','sunday'])
    stop_words.union(['am', 'pm'])
    word_tokens = word_tokenize(data) 
    filtered_sentence = []
    
    for w in word_tokens: 
        if w.lower() not in stop_words: 
            filtered_sentence.append(w) 
    
    return " ".join(filtered_sentence)

In [140]:
news['text'] = news.Title + " " + news.Body

In [141]:
# import these modules 
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
print("shareholders :", lemmatizer.lemmatize("shareholder")) 
print("stocks :", lemmatizer.lemmatize("stock")) 
print("companys :", lemmatizer.lemmatize("company")) 
print("trading :", lemmatizer.lemmatize("trade")) 
print("weekly :", lemmatizer.lemmatize("week")) 
print("billion:", lemmatizer.lemmatize("billionaire")) 
print("high :", lemmatizer.lemmatize("higher")) 
print("close :", lemmatizer.lemmatize("closeed")) 
print("finance :", lemmatizer.lemmatize("financial")) 
print("biggest:", lemmatizer.lemmatize("high")) 
print("unit:", lemmatizer.lemmatize("units")) 
print("protect:", lemmatizer.lemmatize("protection")) 
print("organisation:", lemmatizer.lemmatize("organised")) 
print("'company:", lemmatizer.lemmatize("'companies")) 
# a denotes adjective in "pos" 
print("better :", lemmatizer.lemmatize("better", pos ="a"))

shareholders : shareholder
stocks : stock
companys : company
trading : trade
weekly : week
billion: billionaire
high : higher
close : closeed
finance : financial
biggest: high
unit: unit
protect: protection
organisation: organised
'company: 'companies
better : good


In [142]:
 news['text'].apply(lemmatizer.lemmatize)

0       BS-VI transition may lead to dumping of old st...
1       Annual Report 2016-2017 of Bajaj Finserv Limit...
2       Annual Report 2017-2018 of Eicher Motors Limit...
3       Hindalco Industries : Chairman Kumar Mangalam ...
4       WIPRO continues it's uptrend, although on a we...
5       Cipla acquires USFDA approval for its hyperten...
6       Forest Labs gets US conditional approval for L...
7       Tata Steel Sports Department Training Centre C...
8       TCS Wins 2019 Pega Partner Award for Excellenc...
9       HUL bags the ET Corporate Citizen Award The aw...
10      Massive fire breaks out at Asian Paints manufa...
11      BPCL refinery fire doused but tension continue...
12      BPCL blaze extinguished; sleepless night for l...
13      Explosion in BPCL plant in Chembur; 43 injured...
14      45 injured as fire breaks out at BPCL refinery...
15      Mumbai: Fire breaks out at BPCL refinery in Ch...
16      $500K needed to investigate and fix Britannia ...
17      Man ho

In [143]:

news['clean_title'] = news['Title'].apply(remove_stop_words)
news['clean_body'] = news['Body'].apply(remove_stop_words)

news['clean_text'] = news['text'].apply(remove_stop_words) 

#remove_stop_words_Body


In [144]:
# ## Split data
print("\nSplitting data")

title_tr, title_te, MegaCategory_tr, MegaCategory_te = train_test_split(news['clean_text'], news.Category,test_size =.1)
title_tr, title_de, MegaCategory_tr , MegaCategory_de = train_test_split(title_tr,MegaCategory_tr,test_size =.1)


print("Training: ",len(title_tr))

print("Testing: ",len(title_te))


Splitting data
Training:  2007
Testing:  248


In [145]:
from sklearn.preprocessing import LabelEncoder

# # Data Preprocessing
# ## Vectorization of data
# Vectorize the data using Bag of words (BOW)
print("\nVectorizing data")
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
stop_words = nltk.corpus.stopwords.words("english")
vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)

vectorizer.fit(iter(title_tr))
Xtr = vectorizer.transform(iter(title_tr))
Xde = vectorizer.transform(iter(title_de))
Xte = vectorizer.transform(iter(title_te))

encoder = LabelEncoder()
encoder.fit(MegaCategory_tr)
Ytr = encoder.transform(MegaCategory_tr)
Yde = encoder.transform(MegaCategory_de)
Yte = encoder.transform(MegaCategory_te)


Vectorizing data


In [146]:
print(Ytr)

[23 23 23 ... 23 36 26]


In [147]:
Xtr

<2007x15826 sparse matrix of type '<class 'numpy.int64'>'
	with 104628 stored elements in Compressed Sparse Row format>

In [148]:
# # Train Models
# ### Baseline Model
# “stratified”: generates predictions by respecting the training set’s class distribution.
print("\n\nTraining baseline classifier")
dc = DummyClassifier(strategy="stratified")
dc.fit(Xtr, Ytr)
dc_pred = dc.predict(Xde)



Training baseline classifier


In [149]:
print(classification_report(Yde, dc_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(dc_pred == Yde)))

                                                                                                                                    precision    recall  f1-score   support

                                                                                Annual Report - Comments made in the Annual Report       0.25      0.18      0.21        76
                                                                               Approval - Alert me when a company gets an approval       0.25      0.18      0.21        76
                                                            Award Received- Alert me when any of these companies receives an award       0.25      0.18      0.21        76
                                                             Black Swan Events - Alert me when a company faces an exceptional loss       0.25      0.18      0.21        76
                                                     Buyback of shares- Alert me when any of these companies buys back it's shares       0.

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [150]:

# ### Decision Tree
print("Training Decision tree")
dt = DecisionTreeClassifier()
dt.fit(Xtr, Ytr)
dt_pred = dt.predict(Xde)
print(classification_report(Yde, dt_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(dt_pred == Yde)))

Training Decision tree
                                                                                                                                    precision    recall  f1-score   support

                                                                                Annual Report - Comments made in the Annual Report       0.64      0.66      0.65        76
                                                                               Approval - Alert me when a company gets an approval       0.64      0.66      0.65        76
                                                            Award Received- Alert me when any of these companies receives an award       0.64      0.66      0.65        76
                                                             Black Swan Events - Alert me when a company faces an exceptional loss       0.64      0.66      0.65        76
                                                     Buyback of shares- Alert me when any of these companies buys ba

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [151]:
# ### Random Forest
print("Training Random Forest")
rf = RandomForestClassifier(n_estimators=40)
rf.fit(Xtr, Ytr)
pred = rf.predict(Xde)
print(classification_report(Yde, pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(pred == Yde)))

Training Random Forest
                                                                                                                                    precision    recall  f1-score   support

                                                                                Annual Report - Comments made in the Annual Report       0.51      0.93      0.66        76
                                                                               Approval - Alert me when a company gets an approval       0.51      0.93      0.66        76
                                                            Award Received- Alert me when any of these companies receives an award       0.51      0.93      0.66        76
                                                             Black Swan Events - Alert me when a company faces an exceptional loss       0.51      0.93      0.66        76
                                                     Buyback of shares- Alert me when any of these companies buys ba

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [152]:

# ### Multinomial Naive Bayesian
print("Training Multinomial Naive Bayesian")
nb = MultinomialNB()
nb.fit(Xtr, Ytr)
pred_nb = nb.predict(Xde)
print(classification_report(Yde, pred_nb, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(pred_nb == Yde)))

  .format(len(labels), len(target_names))


Training Multinomial Naive Bayesian
                                                                                                                                    precision    recall  f1-score   support

                                                                                Annual Report - Comments made in the Annual Report       0.65      0.86      0.74        76
                                                                               Approval - Alert me when a company gets an approval       0.65      0.86      0.74        76
                                                            Award Received- Alert me when any of these companies receives an award       0.65      0.86      0.74        76
                                                             Black Swan Events - Alert me when a company faces an exceptional loss       0.65      0.86      0.74        76
                                                     Buyback of shares- Alert me when any of these comp


Accuracy achieved is 0.5381165919282511


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [153]:
# ### Support Vector Classification
print("Training Support Vector Classification")
from sklearn.svm import SVC
svc = SVC()
svc.fit(Xtr, Ytr)
svc_pred = svc.predict(Xde)
print(classification_report(Yde, svc_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(svc_pred == Yde)))

Training Support Vector Classification




                                                                                                                                    precision    recall  f1-score   support

                                                                                Annual Report - Comments made in the Annual Report       0.35      1.00      0.52        76
                                                                               Approval - Alert me when a company gets an approval       0.35      1.00      0.52        76
                                                            Award Received- Alert me when any of these companies receives an award       0.35      1.00      0.52        76
                                                             Black Swan Events - Alert me when a company faces an exceptional loss       0.35      1.00      0.52        76
                                                     Buyback of shares- Alert me when any of these companies buys back it's shares       0.

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [154]:
# ### Multilayered Perceptron
print("Training Multilayered Perceptron")
mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1, max_iter=400)
mlp.fit(Xtr, Ytr)
mlp_pred = mlp.predict(Xde)
print(classification_report(Yde, mlp_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(mlp_pred == Yde)))

Training Multilayered Perceptron
                                                                                                                                    precision    recall  f1-score   support

                                                                                Annual Report - Comments made in the Annual Report       0.73      0.88      0.80        76
                                                                               Approval - Alert me when a company gets an approval       0.73      0.88      0.80        76
                                                            Award Received- Alert me when any of these companies receives an award       0.73      0.88      0.80        76
                                                             Black Swan Events - Alert me when a company faces an exceptional loss       0.73      0.88      0.80        76
                                                     Buyback of shares- Alert me when any of these compani

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [155]:
print('Accuracy achieved is ' + str(np.mean(dc_pred   == Yde)))
print('Accuracy achieved is ' + str(np.mean(dt_pred   == Yde)))
print('Accuracy achieved is ' + str(np.mean(pred_nb   == Yde)))
print('Accuracy achieved is ' + str(np.mean(svc_pred  == Yde)))
print('Accuracy achieved is ' + str(np.mean(mlp_pred  == Yde)))

Accuracy achieved is 0.09417040358744394
Accuracy achieved is 0.4349775784753363
Accuracy achieved is 0.5381165919282511
Accuracy achieved is 0.3542600896860987
Accuracy achieved is 0.6188340807174888


In [156]:
# # Final Model: Multilayered Perceptron
# ## Predict test data
print("\n\nPredicting test data using Multilayered Perceptron")
pred_final = mlp.predict(Xte)
print(classification_report(Yte, pred_final, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(accuracy_score(Yte,pred_final)))



Predicting test data using Multilayered Perceptron
                                                                                                                                    precision    recall  f1-score   support

                                                                                Annual Report - Comments made in the Annual Report       0.68      0.91      0.78        86
                                                                               Approval - Alert me when a company gets an approval       0.68      0.91      0.78        86
                                                            Award Received- Alert me when any of these companies receives an award       0.68      0.91      0.78        86
                                                             Black Swan Events - Alert me when a company faces an exceptional loss       0.68      0.91      0.78        86
                                                     Buyback of shares- Alert me when 

Accuracy achieved is 0.6209677419354839


In [157]:
output = {"title":vectorizer.inverse_transform(Xte), "predicted":encoder.inverse_transform(pred_final),"actual": encoder.inverse_transform(Yte)}

In [158]:
#df = pd.DataFrame(output, columns=["title","predicted","actual"])
#df.to_csv("C:\\Users\\Kratika\\Downloads\\News333_title_predication_1.csv")

In [159]:
pred_final = mlp.predict(Xte)
print('Accuracy achieved is ' + str(np.mean(pred_final == Yte)))
output = {"text":vectorizer.inverse_transform(Xte), "predicted":encoder.inverse_transform(pred_final),"actual": encoder.inverse_transform(Yte)}
df = pd.DataFrame(output, columns=["text","predicted","actual"])


Accuracy achieved is 0.6209677419354839


In [58]:
df = pd.DataFrame(output, columns=["text","predicted","actual"])
df.to_csv("C:\\Users\\Kratika\\Downloads\\248_catagoryNews333_title_body_predication.csv")