In [12]:
import re
import pandas as pd
from nltk.corpus import stopwords

import numpy as np

import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
#from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [13]:
news = pd.read_excel("C:\\Users\\Kratika\\Downloads\\new_title.xlsx")
news.head()

Unnamed: 0,Title,Body,Category,MegaCategory,New_Title
0,Cipla acquires USFDA approval for its hyperten...,Indian pharmaceutical and biotechnology conglo...,Approval - Alert me when a company gets an app...,Growth,Others
1,Forest Labs gets US conditional approval for L...,Forest Labs gets US conditional approval for L...,Approval - Alert me when a company gets an app...,Growth,Others
2,Axis Bank plans to focus on rural markets,Private sector lender Axis Bank is focussing o...,Growth/Focus in exisitng terrain - Alert me wh...,Growth,Others
3,"Bajaj sports new brand identity, eyes bigger m...",Having seen significant success in overseas ma...,Growth/Focus in exisitng terrain - Alert me wh...,Growth,Others
4,Dr Reddys sees US prices stabilising soon,Dr Reddys sees US prices stabilising soon,Growth/Focus in exisitng terrain - Alert me wh...,Growth,Others


In [14]:
from nltk.tokenize import word_tokenize 

def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    stop_words.union(['january','february','march','april','may','june','july','august','september','october','november','december'])
    stop_words.union(['jan','feb','mar','apr','may','jun','jul','aug','sept','oct','nov','dec'])
    stop_words.union(['monday', 'tuesday', 'wednesday', 'thursday','friday','saturday','sunday'])
    stop_words.union(['am', 'pm'])
    word_tokens = word_tokenize(data) 
    filtered_sentence = []
    
    for w in word_tokens: 
        if w.lower() not in stop_words: 
            filtered_sentence.append(w) 
    
    return " ".join(filtered_sentence)

In [37]:
news['text'] = news.Title + " " + news.Body

In [38]:
# import these modules 
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
print("shareholders :", lemmatizer.lemmatize("shareholder")) 
print("stocks :", lemmatizer.lemmatize("stock")) 
print("companys :", lemmatizer.lemmatize("company")) 
print("trading :", lemmatizer.lemmatize("trade")) 
print("weekly :", lemmatizer.lemmatize("week")) 
print("billion:", lemmatizer.lemmatize("billionaire")) 
print("high :", lemmatizer.lemmatize("higher")) 
print("close :", lemmatizer.lemmatize("closeed")) 
print("finance :", lemmatizer.lemmatize("financial")) 
print("biggest:", lemmatizer.lemmatize("high")) 
print("unit:", lemmatizer.lemmatize("units")) 
print("protect:", lemmatizer.lemmatize("protection")) 
print("organisation:", lemmatizer.lemmatize("organised")) 
print("'company:", lemmatizer.lemmatize("'companies")) 
# a denotes adjective in "pos" 
print("better :", lemmatizer.lemmatize("better", pos ="a"))

shareholders : shareholder
stocks : stock
companys : company
trading : trade
weekly : week
billion: billionaire
high : higher
close : closeed
finance : financial
biggest: high
unit: unit
protect: protection
organisation: organised
'company: 'companies
better : good


In [39]:
 news['text'].apply(lemmatizer.lemmatize)

0      Cipla acquires USFDA approval for its hyperten...
1      Forest Labs gets US conditional approval for L...
2      Axis Bank plans to focus on rural markets Priv...
3      Bajaj sports new brand identity, eyes bigger m...
4      Dr Reddys sees US prices stabilising soon Dr R...
5      Wipro Launches Edge Artificial Intelligence So...
6      Wipro unveils new AI software for global enter...
7      Wipro +2.8% after AWS team-up launching a "co-...
8      Wipro to outshine previous performance: Azim P...
9      Yes Bank shares continue rising spree, surge n...
10     Adani Group To Invest Rs 55,000 Cr In Projects...
11     Reliance Jio, Bharti Airtel to bid for Relianc...
12     bpcl to invest rs 1 500 1 700 crore in floatin...
13     BPCL to invest Rs 1,500-1,700 cr in floating L...
14     BPCL to invest up to Rs1,700 crore in building...
15     BPCL to set up bioethanol plant in Bargarh BPC...
16     BPCL Awaits Green Signal for Numaligarh Refine...
17     BPCL gets green nod for 

In [40]:

news['clean_title'] = news['Title'].apply(remove_stop_words)
news['clean_body'] = news['Body'].apply(remove_stop_words)

news['clean_text'] = news['text'].apply(remove_stop_words) 

#remove_stop_words_Body


In [41]:
# ## Split data
print("\nSplitting data")

title_tr, title_te, MegaCategory_tr, MegaCategory_te = train_test_split(news['clean_text'], news.New_Title,test_size =.1)
title_tr, title_de, MegaCategory_tr , MegaCategory_de = train_test_split(title_tr,MegaCategory_tr,test_size =.1)


print("Training: ",len(title_tr))

print("Testing: ",len(title_te))


Splitting data
Training:  187
Testing:  24


In [42]:
from sklearn.preprocessing import LabelEncoder

# # Data Preprocessing
# ## Vectorization of data
# Vectorize the data using Bag of words (BOW)
print("\nVectorizing data")
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
stop_words = nltk.corpus.stopwords.words("english")
vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)

vectorizer.fit(iter(title_tr))
Xtr = vectorizer.transform(iter(title_tr))
Xde = vectorizer.transform(iter(title_de))
Xte = vectorizer.transform(iter(title_te))

encoder = LabelEncoder()
encoder.fit(MegaCategory_tr)
Ytr = encoder.transform(MegaCategory_tr)
Yde = encoder.transform(MegaCategory_de)
Yte = encoder.transform(MegaCategory_te)


Vectorizing data


In [43]:
print(Ytr)

[2 0 0 0 2 2 2 0 2 0 3 0 2 3 2 2 2 2 1 2 2 2 0 2 2 2 0 0 0 2 2 2 2 0 2 2 2
 0 3 2 0 0 2 2 1 0 1 3 0 0 2 2 1 2 2 0 2 1 0 1 2 0 2 0 3 1 2 2 2 1 0 0 2 2
 2 0 2 3 0 2 2 0 2 0 2 2 2 2 2 2 3 0 2 0 0 0 2 0 0 0 0 2 2 2 2 2 1 0 2 2 0
 2 0 0 1 2 0 2 0 3 0 2 2 2 2 2 0 2 2 2 3 3 2 0 2 1 2 3 2 1 2 0 2 2 2 2 2 0
 0 2 2 2 2 0 0 2 2 0 2 2 2 2 0 2 2 2 2 2 1 2 0 3 2 2 2 2 0 0 3 3 2 2 2 0 0
 2 2]


In [44]:
Xtr

<187x3688 sparse matrix of type '<class 'numpy.int64'>'
	with 9378 stored elements in Compressed Sparse Row format>

In [45]:
# # Train Models
# ### Baseline Model
# “stratified”: generates predictions by respecting the training set’s class distribution.
print("\n\nTraining baseline classifier")
dc = DummyClassifier(strategy="stratified")
dc.fit(Xtr, Ytr)

dc_pred = dc.predict(Xde)



Training baseline classifier


In [46]:
print(classification_report(Yde, dc_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(dc_pred == Yde)))

                                    precision    recall  f1-score   support

Investment and Growth related News       0.18      0.25      0.21         8
                            Others       0.25      0.40      0.31         5
Products, Initiatives or Strategy        0.25      0.40      0.31         5
   Sales, User or Geography Growth       0.25      0.40      0.31         5

                          accuracy                           0.23      1222
                         macro avg       0.18      0.26      0.21      1222
                      weighted avg       0.18      0.26      0.21      1222

Accuracy achieved is 0.19047619047619047


  .format(len(labels), len(target_names))


In [47]:

# ### Decision Tree
print("Training Decision tree")
dt = DecisionTreeClassifier()
dt.fit(Xtr, Ytr)
dt_pred = dt.predict(Xde)
print(classification_report(Yde, dt_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(dt_pred == Yde)))

Training Decision tree
                                    precision    recall  f1-score   support

Investment and Growth related News       0.73      1.00      0.84         8
                            Others       0.71      1.00      0.83         5
Products, Initiatives or Strategy        0.71      1.00      0.83         5
   Sales, User or Geography Growth       0.71      1.00      0.83         5

                          accuracy                           0.82      1222
                         macro avg       0.76      0.92      0.80      1222
                      weighted avg       0.75      0.94      0.81      1222

Accuracy achieved is 0.7619047619047619


  .format(len(labels), len(target_names))


In [48]:
# ### Random Forest
print("Training Random Forest")
rf = RandomForestClassifier(n_estimators=40)
rf.fit(Xtr, Ytr)
pred = rf.predict(Xde)
print(classification_report(Yde, pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(pred == Yde)))

Training Random Forest
                                    precision    recall  f1-score   support

Investment and Growth related News       0.57      1.00      0.73         8
                            Others       0.71      1.00      0.83         5
Products, Initiatives or Strategy        0.71      1.00      0.83         5
   Sales, User or Geography Growth       0.71      1.00      0.83         5

                          accuracy                           0.72      1222
                         macro avg       0.53      0.86      0.65      1222
                      weighted avg       0.55      0.91      0.69      1222

Accuracy achieved is 0.6190476190476191


  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [49]:

# ### Multinomial Naive Bayesian
print("Training Multinomial Naive Bayesian")
nb = MultinomialNB()
nb.fit(Xtr, Ytr)
pred_nb = nb.predict(Xde)
print(classification_report(Yde, pred_nb, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(pred_nb == Yde)))

Training Multinomial Naive Bayesian
                                    precision    recall  f1-score   support

Investment and Growth related News       0.56      0.62      0.59         8
                            Others       0.45      1.00      0.62         5
Products, Initiatives or Strategy        0.45      1.00      0.62         5
   Sales, User or Geography Growth       0.45      1.00      0.62         5

                          accuracy                           0.58      1222
                         macro avg       0.52      0.66      0.54      1222
                      weighted avg       0.55      0.67      0.56      1222

Accuracy achieved is 0.5238095238095238


  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [50]:
# ### Support Vector Classification
print("Training Support Vector Classification")
from sklearn.svm import SVC
svc = SVC()
svc.fit(Xtr, Ytr)
svc_pred = svc.predict(Xde)
print(classification_report(Yde, svc_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(svc_pred == Yde)))

Training Support Vector Classification
                                    precision    recall  f1-score   support

Investment and Growth related News       0.38      1.00      0.55         8
                            Others       0.00      0.00      0.00         5
Products, Initiatives or Strategy        0.00      0.00      0.00         5
   Sales, User or Geography Growth       0.00      0.00      0.00         5

                          accuracy                           0.49      1222
                         macro avg       0.21      0.56      0.31      1222
                      weighted avg       0.26      0.68      0.38      1222

Accuracy achieved is 0.38095238095238093


  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [51]:
# ### Multilayered Perceptron
print("Training Multilayered Perceptron")
mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1, max_iter=400)
mlp.fit(Xtr, Ytr)
mlp_pred = mlp.predict(Xde)
print(classification_report(Yde, mlp_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(mlp_pred == Yde)))

Training Multilayered Perceptron
                                    precision    recall  f1-score   support

Investment and Growth related News       0.54      0.88      0.67         8
                            Others       0.71      1.00      0.83         5
Products, Initiatives or Strategy        0.71      1.00      0.83         5
   Sales, User or Geography Growth       0.71      1.00      0.83         5

                          accuracy                           0.69      1222
                         macro avg       0.59      0.80      0.64      1222
                      weighted avg       0.60      0.84      0.66      1222

Accuracy achieved is 0.6190476190476191


  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [52]:
print('Accuracy achieved is ' + str(np.mean(dc_pred   == Yde)))
print('Accuracy achieved is ' + str(np.mean(dt_pred   == Yde)))
print('Accuracy achieved is ' + str(np.mean(pred_nb   == Yde)))
print('Accuracy achieved is ' + str(np.mean(svc_pred  == Yde)))
print('Accuracy achieved is ' + str(np.mean(mlp_pred  == Yde)))

Accuracy achieved is 0.19047619047619047
Accuracy achieved is 0.7619047619047619
Accuracy achieved is 0.5238095238095238
Accuracy achieved is 0.38095238095238093
Accuracy achieved is 0.6190476190476191


In [53]:
# # Final Model: Multilayered Perceptron
# ## Predict test data
print("\n\nPredicting test data using Multilayered Perceptron")
pred_final = mlp.predict(Xte)
print(classification_report(Yte, pred_final, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(accuracy_score(Yte,pred_final)))



Predicting test data using Multilayered Perceptron
                                    precision    recall  f1-score   support

Investment and Growth related News       0.75      1.00      0.86        12
                            Others       1.00      0.80      0.89        10
Products, Initiatives or Strategy        1.00      0.80      0.89        10
   Sales, User or Geography Growth       1.00      0.80      0.89        10

                          accuracy                           0.86      1835
                         macro avg       0.72      0.80      0.74      1835
                      weighted avg       0.82      0.92      0.85      1835

Accuracy achieved is 0.8333333333333334


In [54]:
output = {"title":vectorizer.inverse_transform(Xte), "predicted":encoder.inverse_transform(pred_final),"actual": encoder.inverse_transform(Yte)}

In [55]:
#df = pd.DataFrame(output, columns=["title","predicted","actual"])
#df.to_csv("C:\\Users\\Kratika\\Downloads\\News333_title_predication_1.csv")

In [56]:
pred_final = mlp.predict(Xte)
print('Accuracy achieved is ' + str(np.mean(pred_final == Yte)))
output = {"text":vectorizer.inverse_transform(Xte), "predicted":encoder.inverse_transform(pred_final),"actual": encoder.inverse_transform(Yte)}
df = pd.DataFrame(output, columns=["text","predicted","actual"])


Accuracy achieved is 0.8333333333333334


In [57]:
df = pd.DataFrame(output, columns=["text","predicted","actual"])
df.to_csv("C:\\Users\\Kratika\\Downloads\\old.csv")