In [185]:
import re
import pandas as pd
from nltk.corpus import stopwords

import copy
import numpy as np

import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
#from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [192]:
news = pd.read_excel("C:\\Users\\Kratika\\Downloads\\News_Growth.xlsx")
news.head()

Unnamed: 0,Title,Body,Category,MegaCategory
0,Cipla acquires USFDA approval for its hyperten...,Indian pharmaceutical and biotechnology conglo...,Approval - Alert me when a company gets an app...,Growth
1,Forest Labs gets US conditional approval for L...,Forest Labs gets US conditional approval for L...,Approval - Alert me when a company gets an app...,Growth
2,Axis Bank plans to focus on rural markets,Private sector lender Axis Bank is focussing o...,Growth/Focus in exisitng terrain - Alert me wh...,Growth
3,"Bajaj sports new brand identity, eyes bigger m...",Having seen significant success in overseas ma...,Growth/Focus in exisitng terrain - Alert me wh...,Growth
4,Dr Reddys sees US prices stabilising soon,Dr Reddys sees US prices stabilising soon,Growth/Focus in exisitng terrain - Alert me wh...,Growth


In [193]:
from nltk.tokenize import word_tokenize 

def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    stop_words.union(['january','february','march','april','may','june','july','august','september','october','november','december'])
    stop_words.union(['jan','feb','mar','apr','may','jun','jul','aug','sept','oct','nov','dec'])
    stop_words.union(['monday', 'tuesday', 'wednesday', 'thursday','friday','saturday','sunday'])
    stop_words.union(['am', 'pm'])
    word_tokens = word_tokenize(data) 
    filtered_sentence = []
    
    for w in word_tokens: 
        if w.lower() not in stop_words: 
            filtered_sentence.append(w) 
    
    return " ".join(filtered_sentence)

In [194]:
news['text'] = news.Title + " " + news.Body

In [195]:
# import these modules 
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
print("shareholders :", lemmatizer.lemmatize("shareholder")) 
print("stocks :", lemmatizer.lemmatize("stock")) 
print("companys :", lemmatizer.lemmatize("company")) 
print("trading :", lemmatizer.lemmatize("trade")) 
print("weekly :", lemmatizer.lemmatize("week")) 
print("billion:", lemmatizer.lemmatize("billionaire")) 
print("high :", lemmatizer.lemmatize("higher")) 
print("close :", lemmatizer.lemmatize("closeed")) 
print("finance :", lemmatizer.lemmatize("financial")) 
print("biggest:", lemmatizer.lemmatize("high")) 
print("unit:", lemmatizer.lemmatize("units")) 
print("protect:", lemmatizer.lemmatize("protection")) 
print("organisation:", lemmatizer.lemmatize("organised")) 
print("'company:", lemmatizer.lemmatize("'companies")) 
# a denotes adjective in "pos" 
print("better :", lemmatizer.lemmatize("better", pos ="a"))

shareholders : shareholder
stocks : stock
companys : company
trading : trade
weekly : week
billion: billionaire
high : higher
close : closeed
finance : financial
biggest: high
unit: unit
protect: protection
organisation: organised
'company: 'companies
better : good


In [196]:
 news['text'].apply(lemmatizer.lemmatize)

0      Cipla acquires USFDA approval for its hyperten...
1      Forest Labs gets US conditional approval for L...
2      Axis Bank plans to focus on rural markets Priv...
3      Bajaj sports new brand identity, eyes bigger m...
4      Dr Reddys sees US prices stabilising soon Dr R...
5      Wipro Launches Edge Artificial Intelligence So...
6      Wipro unveils new AI software for global enter...
7      Wipro +2.8% after AWS team-up launching a "co-...
8      Wipro to outshine previous performance: Azim P...
9      Yes Bank shares continue rising spree, surge n...
10     Tech Mahindra, IIT Kanpur collaborate to addre...
11     Tech Mahindra, IIT Kanpur collaborate to addre...
12     Tech Mahindra, IIT Kanpur sign MoU to conduct ...
13     India: Giant Tech Mahindra To Cooperate With S...
14     Samsung SDS, Tech Mahindra To Collaborate On N...
15     Ultratech Cement bags Deora-Sitapuri limestone...
16     NDRF, Wipro to work together in design for add...
17     Wipro bags 7-year contra

In [197]:

news['clean_title'] = news['Title'].apply(remove_stop_words)
news['clean_body'] = news['Body'].apply(remove_stop_words)

news['clean_text'] = news['text'].apply(remove_stop_words) 

#remove_stop_words_Body


In [198]:
# ## Split data
print("\nSplitting data")

title_tr, title_te, MegaCategory_tr, MegaCategory_te = train_test_split(news['clean_text'], news.Category,test_size =.1)
title_tr, title_de, MegaCategory_tr , MegaCategory_de = train_test_split(title_tr,MegaCategory_tr,test_size =.1)


print("Training: ",len(title_tr))

print("Testing: ",len(title_te))


Splitting data
Training:  239
Testing:  30


In [199]:
from sklearn.preprocessing import LabelEncoder

# # Data Preprocessing
# ## Vectorization of data
# Vectorize the data using Bag of words (BOW)
print("\nVectorizing data")
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
stop_words = nltk.corpus.stopwords.words("english")
vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)

vectorizer.fit(iter(title_tr))
Xtr = vectorizer.transform(iter(title_tr))
Xde = vectorizer.transform(iter(title_de))
Xte = vectorizer.transform(iter(title_te))

encoder = LabelEncoder()
encoder.fit(MegaCategory_tr)
Ytr = encoder.transform(MegaCategory_tr)
Yde = encoder.transform(MegaCategory_de)
Yte = encoder.transform(MegaCategory_te)


Vectorizing data


In [200]:
print(Ytr)

[ 4  5 10  6  9  4  9  8  1  4  9  4  6  9  9  6  9  6  4  6  6 10  5  4
  8  5  2  6 11  6  4  9  4  4  2  4 13  1  6  5  6  6  4 13 10  9  4  4
  6  6  6  8  4  5  6  4  4  9  2 13  5  4 10  8  6 10  6  8  4  5  5  5
  4  6  4  9  9 12  9  4  6  2  9  9  9  5  0  6  4  3  6  4  2  6  4  5
  5  4  5  4  9  9  8  3  6  9 12  1 10  4  9  6  4  4  4  5  9  9 10 12
 11 10 10  9 13  4  4  4  5 10  8  8  4  6  6 12  8  4  6 10  2  2  4  6
 10  4  2  1  8  6 10  4  6  8 10  8  9  9  8  0  6  4  6  4  5  4  6  6
  4  4 10  6 10  9  9  5  4 10  6  6  4  6  6  6  7  5 13  3  6  6  5  4
 10  1  6  4  4  6  6  2  4 10  6 12  6 13  1  1  4  2 10  4  9 13  9 13
  5  9  4  6  2  6  6  9  4  8  4  6  6  2  4  9  4  8  4 13  9  6  4]


In [201]:
Xtr

<239x4084 sparse matrix of type '<class 'numpy.int64'>'
	with 11203 stored elements in Compressed Sparse Row format>

In [202]:
# # Train Models
# ### Baseline Model
# “stratified”: generates predictions by respecting the training set’s class distribution.
print("\n\nTraining baseline classifier")
dc = DummyClassifier(strategy="stratified")
dc.fit(Xtr, Ytr)
dc_pred = dc.predict(Xde)



Training baseline classifier


In [203]:
print(classification_report(Yde, dc_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(dc_pred == Yde)))

                                                                                                                                    precision    recall  f1-score   support

                                                                               Approval - Alert me when a company gets an approval       0.25      0.10      0.14        10
                Growth/Focus in exisitng terrain - Alert me when a company focusses on existing terrain by means of expansion, etc       0.50      0.20      0.29         5
                                               New Geography- Alert me when a company moves or expands into a new state or country       0.00      0.00      0.00         1
     New Patent or Regulatory Approval- Alert me when a company receives government approval for a patent, registration or license       0.00      0.00      0.00         1
New Products, Initiatives, or Strategy- Alert me when a company unveils a new product or announces an initiative to enter a market       0.

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [204]:

# ### Decision Tree
print("Training Decision tree")
dt = DecisionTreeClassifier()
dt.fit(Xtr, Ytr)
dt_pred = dt.predict(Xde)
print(classification_report(Yde, dt_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(dt_pred == Yde)))

Training Decision tree
                                                                                                                                    precision    recall  f1-score   support

                                                                               Approval - Alert me when a company gets an approval       0.50      0.50      0.50        10
                Growth/Focus in exisitng terrain - Alert me when a company focusses on existing terrain by means of expansion, etc       1.00      0.20      0.33         5
                                               New Geography- Alert me when a company moves or expands into a new state or country       0.33      1.00      0.50         1
     New Patent or Regulatory Approval- Alert me when a company receives government approval for a patent, registration or license       0.00      0.00      0.00         1
New Products, Initiatives, or Strategy- Alert me when a company unveils a new product or announces an initiative to 

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [205]:
# ### Random Forest
print("Training Random Forest")
rf = RandomForestClassifier(n_estimators=40)
rf.fit(Xtr, Ytr)
pred = rf.predict(Xde)
print(classification_report(Yde, pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(pred == Yde)))

Training Random Forest
                                                                                                                                    precision    recall  f1-score   support

                                                                               Approval - Alert me when a company gets an approval       0.50      0.90      0.64        10
                Growth/Focus in exisitng terrain - Alert me when a company focusses on existing terrain by means of expansion, etc       1.00      0.20      0.33         5
                                               New Geography- Alert me when a company moves or expands into a new state or country       0.00      0.00      0.00         1
     New Patent or Regulatory Approval- Alert me when a company receives government approval for a patent, registration or license       0.00      0.00      0.00         1
New Products, Initiatives, or Strategy- Alert me when a company unveils a new product or announces an initiative to 

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [206]:

# ### Multinomial Naive Bayesian
print("Training Multinomial Naive Bayesian")
nb = MultinomialNB()
nb.fit(Xtr, Ytr)
pred_nb = nb.predict(Xde)
print(classification_report(Yde, pred_nb, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(pred_nb == Yde)))

Training Multinomial Naive Bayesian
                                                                                                                                    precision    recall  f1-score   support

                                                                               Approval - Alert me when a company gets an approval       0.70      0.70      0.70        10
                Growth/Focus in exisitng terrain - Alert me when a company focusses on existing terrain by means of expansion, etc       1.00      0.40      0.57         5
                                               New Geography- Alert me when a company moves or expands into a new state or country       0.00      0.00      0.00         1
     New Patent or Regulatory Approval- Alert me when a company receives government approval for a patent, registration or license       0.33      1.00      0.50         1
New Products, Initiatives, or Strategy- Alert me when a company unveils a new product or announces an i

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [207]:
# ### Support Vector Classification
print("Training Support Vector Classification")
from sklearn.svm import SVC
svc = SVC()
svc.fit(Xtr, Ytr)
svc_pred = svc.predict(Xde)
print(classification_report(Yde, svc_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(svc_pred == Yde)))

Training Support Vector Classification




                                                                                                                                    precision    recall  f1-score   support

                                                                               Approval - Alert me when a company gets an approval       0.37      1.00      0.54        10
                Growth/Focus in exisitng terrain - Alert me when a company focusses on existing terrain by means of expansion, etc       0.00      0.00      0.00         5
                                               New Geography- Alert me when a company moves or expands into a new state or country       0.00      0.00      0.00         1
     New Patent or Regulatory Approval- Alert me when a company receives government approval for a patent, registration or license       0.00      0.00      0.00         1
New Products, Initiatives, or Strategy- Alert me when a company unveils a new product or announces an initiative to enter a market       0.

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [208]:
# ### Multilayered Perceptron
print("Training Multilayered Perceptron")
mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1, max_iter=400)
mlp.fit(Xtr, Ytr)
mlp_pred = mlp.predict(Xde)
print(classification_report(Yde, mlp_pred, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(np.mean(mlp_pred == Yde)))

Training Multilayered Perceptron
                                                                                                                                    precision    recall  f1-score   support

                                                                               Approval - Alert me when a company gets an approval       0.58      0.70      0.64        10
                Growth/Focus in exisitng terrain - Alert me when a company focusses on existing terrain by means of expansion, etc       1.00      0.20      0.33         5
                                               New Geography- Alert me when a company moves or expands into a new state or country       0.00      0.00      0.00         1
     New Patent or Regulatory Approval- Alert me when a company receives government approval for a patent, registration or license       0.50      1.00      0.67         1
New Products, Initiatives, or Strategy- Alert me when a company unveils a new product or announces an init

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [214]:
print('Accuracy achieved is ' + str(np.mean(dc_pred   == Yde)))
print('Accuracy achieved is ' + str(np.mean(dt_pred   == Yde)))
print('Accuracy achieved is ' + str(np.mean(pred_nb   == Yde)))
print('Accuracy achieved is ' + str(np.mean(svc_pred  == Yde)))
print('Accuracy achieved is ' + str(np.mean(mlp_pred  == Yde)))

Accuracy achieved is 0.14814814814814814
Accuracy achieved is 0.4444444444444444
Accuracy achieved is 0.5555555555555556
Accuracy achieved is 0.37037037037037035
Accuracy achieved is 0.5555555555555556


In [215]:
# # Final Model: Multilayered Perceptron
# ## Predict test data
print("\n\nPredicting test data using Multilayered Perceptron")
pred_final = mlp.predict(Xte)
print(classification_report(Yte, pred_final, labels=Ytr ,target_names=encoder.classes_))
print('Accuracy achieved is ' + str(accuracy_score(Yte,pred_final)))



Predicting test data using Multilayered Perceptron
                                                                                                                                    precision    recall  f1-score   support

                                                                               Approval - Alert me when a company gets an approval       0.77      0.91      0.83        11
                Growth/Focus in exisitng terrain - Alert me when a company focusses on existing terrain by means of expansion, etc       1.00      0.50      0.67         2
                                               New Geography- Alert me when a company moves or expands into a new state or country       0.00      0.00      0.00         0
     New Patent or Regulatory Approval- Alert me when a company receives government approval for a patent, registration or license       0.17      0.25      0.20         4
New Products, Initiatives, or Strategy- Alert me when a company unveils a new product 

In [216]:
output = {"title":vectorizer.inverse_transform(Xte), "predicted":encoder.inverse_transform(pred_final),"actual": encoder.inverse_transform(Yte)}

In [158]:
#df = pd.DataFrame(output, columns=["title","predicted","actual"])
#df.to_csv("C:\\Users\\Kratika\\Downloads\\News333_title_predication_1.csv")

In [217]:
pred_final = mlp.predict(Xte)
print('Accuracy achieved is ' + str(np.mean(pred_final == Yte)))
output = {"text":vectorizer.inverse_transform(Xte), "predicted":encoder.inverse_transform(pred_final),"actual": encoder.inverse_transform(Yte)}
df = pd.DataFrame(output, columns=["text","predicted","actual"])


Accuracy achieved is 0.6333333333333333


In [58]:
df = pd.DataFrame(output, columns=["text","predicted","actual"])
df.to_csv("C:\\Users\\Kratika\\Downloads\\248_catagoryNews333_title_body_predication.csv")