# Machine Learning Model

In [104]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize as wt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [105]:
# Load the labelled headlines. The file is decoded as ISO-8859-1 (Latin-1)
# instead of pandas' default UTF-8.
dataset = pd.read_csv(r'HeadlineClassification.csv', encoding='ISO-8859-1')

In [106]:
# Helpers shared by the preprocessing loop in the next cell.
stemmer = PorterStemmer()  # applied to every non-stop-word token
spell = SpellChecker()  # applied to the stemmed token
# Collects one cleaned, space-joined headline string per dataset row.
data = []
print("No. of rows ", dataset.shape[0])

No. of rows  99


In [107]:
# Build the stop-word set once. Rebuilding it for every token (as before)
# re-reads the NLTK word list thousands of times for no benefit.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append a
# second copy of every headline to `data`.
data = []
for i in range(dataset.shape[0]):
    headline = dataset.iloc[i, 0]
    # removing all symbols other than alphabets
    headline = re.sub('[^A-Za-z]', ' ', headline)
    headline = headline.lower()
    tokenized_headline = wt(headline)  # word tokenization

    # stopword removal, stemming and spell correction
    headline_processed = []
    for word in tokenized_headline:
        if word in english_stopwords:
            continue
        stemmed = stemmer.stem(word)
        corrected = spell.correction(stemmed)
        # Recent pyspellchecker releases return None when no candidate is
        # found; fall back to the stem so " ".join() below cannot fail.
        headline_processed.append(corrected if corrected is not None else stemmed)

    headline_text = " ".join(headline_processed)
    # data is a list with one cleaned headline string per row
    data.append(headline_text)

In [108]:
# Bag-of-words feature matrix, limited to at most 1000 vocabulary terms.
matrix = CountVectorizer(max_features=1000)
# Learn the vocabulary from the cleaned headlines and convert the counts to a
# dense array.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the
# train/test split performed in the next cell.
X = matrix.fit_transform(data).toarray()
y = dataset.iloc[:, 1]  # all rows of column index 1, used as the output labels

In [109]:
# Split features and labels into train and test parts. No test_size is passed,
# so the library default applies; random_state=0 fixes the shuffle so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,random_state=0)  # from sklearn.model_selection

Bernoulli Naive Bayes Classification

In [113]:
# Bernoulli naive Bayes with default settings, trained on the training split.
classifier = BernoulliNB()
classifier.fit(X_train, y_train)

BernoulliNB()

In [114]:
# Predict a label for every held-out headline and show the raw predictions.
y_pred = classifier.predict(X_test)
print("test data result y_pred \n", y_pred)

test data result y_pred 
 ['Left' 'Left' 'Left' 'Right' 'Left' 'Left' 'Left' 'Right' 'Left' 'Right'
 'Left' 'Right' 'Left' 'Left' 'Left' 'Right' 'Left' 'Left' 'Left' 'Left'
 'Left' 'Right' 'Left' 'Left' 'Left']


In [115]:
# Compare the predictions in y_pred against the true test labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix \n", cm)
# Per-class precision / recall / f1 report.
cr = classification_report(y_test, y_pred)
print("Classificaiton Report is \n", cr)
accuracy = accuracy_score(y_test, y_pred)
print("last accuracy ", accuracy)
# Classify one headline that is not part of the dataset.
# NOTE(review): the raw headline goes straight into the vectorizer; it does not
# get the symbol removal, stop-word filtering, stemming and spell correction
# that the training headlines received.
myvector = matrix.transform(["BJP remark on opposition shows its arrogance:BSP"]).toarray()
y_result = classifier.predict(myvector)
print("y_result ", y_result)

Confusion Matrix 
 [[12  1]
 [ 7  5]]
Classificaiton Report is 
               precision    recall  f1-score   support

        Left       0.63      0.92      0.75        13
       Right       0.83      0.42      0.56        12

    accuracy                           0.68        25
   macro avg       0.73      0.67      0.65        25
weighted avg       0.73      0.68      0.66        25

last accuracy  0.68
y_result  ['Left']


Multinomial Naive Bayes Classification

In [116]:
# Naive Bayes, second variant.
# BernoulliNB models binary (present / absent) features.
# MultinomialNB models discrete count features such as the word counts
# produced by CountVectorizer, so it is used here.
# (A Gaussian-distribution assumption on continuous data belongs to GaussianNB,
# which this notebook does not use.)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)  # model training

MultinomialNB()

In [117]:
# Predict a label for every held-out headline with whichever model
# `classifier` currently refers to (the MultinomialNB fitted in the previous
# cell when the notebook is run top to bottom).
y_pred = classifier.predict(X_test)
print("test data result y_pred \n", y_pred)

test data result y_pred 
 ['Left' 'Left' 'Left' 'Right' 'Left' 'Left' 'Left' 'Right' 'Left' 'Right'
 'Left' 'Right' 'Left' 'Right' 'Left' 'Right' 'Left' 'Left' 'Left' 'Left'
 'Left' 'Right' 'Left' 'Left' 'Left']


In [112]:
# Compare the predictions in y_pred against the true test labels.
# NOTE(review): the saved execution count of this cell (112) is lower than that
# of the MultinomialNB fit cell (116), so the stored output may come from an
# earlier kernel state; re-run the notebook top to bottom to refresh it.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix \n", cm)
# Per-class precision / recall / f1 report.
cr = classification_report(y_test, y_pred)
print("Classificaiton Report is \n", cr)
accuracy = accuracy_score(y_test, y_pred)
print("last accuracy ", accuracy)
# Classify one headline that is not part of the dataset.
# NOTE(review): the raw headline goes straight into the vectorizer; it does not
# get the symbol removal, stop-word filtering, stemming and spell correction
# that the training headlines received.
myvector = matrix.transform(["Despite US Finding Proof of Bribery in Indian Railways, Modi Government Has Taken No Action"]).toarray()
y_result = classifier.predict(myvector)
print("y_result ", y_result)

Confusion Matrix 
 [[11  2]
 [ 7  5]]
Classificaiton Report is 
               precision    recall  f1-score   support

        Left       0.61      0.85      0.71        13
       Right       0.71      0.42      0.53        12

    accuracy                           0.64        25
   macro avg       0.66      0.63      0.62        25
weighted avg       0.66      0.64      0.62        25

last accuracy  0.64
y_result  ['Right']


# Deep Learning Model

In [35]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [36]:
# Reload the same CSV for the neural-network experiment.
# NOTE(review): this rebinds `data`, which in the section above held the list
# of cleaned headline strings.
data = pd.read_csv(r'HeadlineClassification.csv', encoding='ISO-8859-1')

In [43]:
# Keep only the headline text column and its leaning label column.
df = data[["Headline", "Leaning"]]

In [44]:
# Class balance check.
# NOTE(review): the counts are never shown, because the call is neither
# printed nor the last expression of the cell.
df["Leaning"].value_counts()
# factorize() returns a pair: integer codes per row and the unique label values.
leaning_label = df.Leaning.factorize()
print(leaning_label)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64), Index(['Left', 'Right'], dtype='object'))


In [45]:
# Raw headline strings as an array; these are fed to the Keras tokenizer
# without the cleaning used in the naive Bayes section.
message = df.Headline.values
print(message)

['Police In Rajasthan Town Accused Of Thrashing Man In Custody'
 'Centre Withdraws Offer To Sell 53% Stake In Bharat Petroleum Corp'
 'Afghan Man With Medicines Worth ? 64 Lakh Detained At Delhi Airport'
 'After Texas School Shooting, Ex FBI Agent Says "Ballistic Blankets" Should Be Installed On Walls'
 'PM Arrives In Chennai To Warm Welcome, Will Launch Infra Projects'
 'Bhagwant Mann Meets British Envoy, Discusses Tie-Ups In Education, IT'
 "2 Pakistani Fishermen Detained, 4 Fishing Boats Seized In Gujarat's Bhuj"
 'Only 58% School Teachers Took Part In Discussions On NEP, 65% Overloaded With Work: Education Ministry Survey'
 'Gyanvapi Case: District Court To Continue Hearing On Maintainability On Monday'
 "Money Is Making Some Nations Tolerant To Russia's Aggression: Ukraine's Zelensky"
 "Key Accused In Jammu-Kashmir's Sunjwan Attack Arrested: Probe Agency"
 "Delhi's New Civic Body Appoints 3 Deputy Commissioners, 22 Zonal Heads"
 'High Court Permits JNU Student Sharjeel Imam To App

In [46]:
# Build the word -> integer index from the raw headlines; num_words=5000 caps
# how many of the most frequent words are kept when texts are encoded.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(message)
# Number of distinct words seen, plus one.
# NOTE(review): vocab_size is not used by any cell visible in this notebook.
vocab_size = len(tokenizer.word_index) + 1

In [47]:
# Turn each headline into its list of word indices.
encoded_docs = tokenizer.texts_to_sequences(message)
# Bring every sequence to length 200; the printed rows show the zeros being
# added at the front. 200 must match the model's input_dim.
padded_sequence = pad_sequences(encoded_docs, maxlen=200)
print(padded_sequence)

[[  0   0   0 ...  81   2 194]
 [  0   0   0 ...  43 200 201]
 [  0   0   0 ...  11  21 207]
 ...
 [  0   0   0 ...   1 901 902]
 [  0   0   0 ...  65  66 912]
 [  0   0   0 ... 917  95  13]]


In [48]:
# Fully connected binary classifier over the 200-wide padded index vectors:
# a funnel of tanh layers (50 -> 30 -> 20 -> 10 -> 5 -> 2) ending in a single
# sigmoid unit.
model = Sequential([
    Dense(50, input_dim=200, activation='tanh'),
    Dense(30, activation="tanh"),
    Dense(20, activation="tanh"),
    Dense(10, activation="tanh"),
    Dense(5, activation="tanh"),
    Dense(2, activation='tanh'),
    Dense(1, activation='sigmoid'),
])

In [49]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# during training.
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table.
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 50)                10050     
                                                                 
 dense_16 (Dense)            (None, 30)                1530      
                                                                 
 dense_17 (Dense)            (None, 20)                620       
                                                                 
 dense_18 (Dense)            (None, 10)                210       
                                                                 
 dense_19 (Dense)            (None, 5)                 55        
                                                                 
 dense_20 (Dense)            (None, 2)                 12        
                                                                 
 dense_21 (Dense)            (None, 1)                

In [67]:
# Integer class codes produced by df.Leaning.factorize(). The previous name
# `ham_label` is not defined anywhere in this notebook and raised NameError on
# a fresh kernel.
labels = leaning_label[0]

# Keras carves validation_split off the END of the arrays before any
# shuffling. The rows are ordered by class (all 'Left' first, then 'Right'),
# so without this shuffle the validation set would contain a single class.
# A fixed seed keeps the split reproducible.
shuffled_idx = np.random.default_rng(0).permutation(len(labels))

history = model.fit(
    padded_sequence[shuffled_idx], labels[shuffled_idx],
    validation_split=0.2, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [68]:
def predict_sentiment(text):
    """Print the leaning the trained network predicts for one headline.

    The headline is encoded with the fitted ``tokenizer`` and padded to
    length 200, the input width the model was built with. The sigmoid output
    is rounded to 0 or 1 and used to look up the class name in the unique
    labels returned by ``df.Leaning.factorize()``.

    Args:
        text: Raw headline string.

    Returns:
        None. The predicted label is printed.
    """
    tw = tokenizer.texts_to_sequences([text])
    tw = pad_sequences(tw, maxlen=200)
    prediction = int(model.predict(tw).round().item())
    # The factorized labels are stored in `leaning_label`; the previous name
    # `ham_label` is not defined anywhere in this notebook.
    print("Predicted ideology: ", leaning_label[1][prediction])

In [75]:
# Ad-hoc check: classify one unseen headline with the trained network.
test_headline = "Gujarat Govt Launches 'Aatmanirbhar Gujarat' Scheme To Boost Manufacturing Sector In State"
predict_sentiment(test_headline)

Predicted ideology:  Left


In [78]:
# Ad-hoc check: classify one unseen headline with the trained network.
test_headline = "BJP upbeat on J&K polls, banks on success of welfare schemes"
predict_sentiment(test_headline)

Predicted ideology:  Right


In [80]:
# Ad-hoc check: classify one unseen headline with the trained network.
test_headline = "US President Joe Biden Pardons All Convicted of Marijuana Possession"
predict_sentiment(test_headline)

Predicted ideology:  Left


In [85]:
# Same headline that was given to the MultinomialNB model in the machine
# learning section, so the two predictions can be compared.
test_headline = "Despite US Finding Proof of Bribery in Indian Railways, Modi Government Has Taken No Action"
predict_sentiment(test_headline)

Predicted ideology:  Left


In [91]:
# Same headline that was given to the BernoulliNB model in the machine
# learning section, so the two predictions can be compared.
test_headline = "BJP remark on opposition shows its arrogance:BSP"
predict_sentiment(test_headline)

Predicted ideology:  Right


In [103]:
# Ad-hoc check: classify one unseen headline with the trained network.
test_headline = "BJP worker shot dead inside showroom in Gurugram"
predict_sentiment(test_headline)

Predicted ideology:  Left
