Problem discription
--------------------

Apply the most compatible ML algorithm and build text classification model to predict article category



### Import libraries

In [19]:
# we compare two classifiers Multinomial NB and Logistic Regression
#first we import all the necessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

### Import datasets

In [20]:
df = pd.read_csv('./articles.csv', encoding='latin-1')

df.head()

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
0,d6995462-5e87-453b-b64d-e9f1df6e94d2,"A Puzzling Maneuver, Then Freefall: NTSB Repor...",,Essex Caller,<p>The helicopter that crashed in Southeast Al...,<p>The helicopter that crashed in Southeast Al...,Commercial,Negative
1,8b05e939-a89e-4548-b92b-013822e8ee7d,Bells Nexus Air Taxi Concept Rings Changes Fo...,,Aviation Week Network,<p>A year after teasing the fledgling electric...,<p>A year after teasing the fledgling electric...,Commercial,Positive
2,69fcd400-bceb-4255-8277-619f2d68ac0b,Bell Helicopter Show Air Taxi Nexus,http://images.tmtpost.com/uploads/images/2019/...,TMTPost,<p>Bell released the full-size design of the v...,<p>Bell released the full-size design of the v...,Commercial,Positive
3,17943578-c11b-414b-b3f5-063d3a93157b,BELL DÉVOILE LA CONCEPTION INTÉGRALE DE SON TA...,http://www.fredzone.org/wp-content/uploads/201...,Fredzone,<p>Bell est une soci&eacute;t&eacute; am&eacut...,<p>Bell est une soci&eacute;t&eacute; am&eacut...,Commercial,Positive
4,f33c7b11-5f77-4a98-bb2e-d36689042aea,Les premiers retours dOlivier Ezratty,,FrenchWeb,<p>It was still anecdotal to observe the explo...,<p>It was still anecdotal to observe the explo...,Commercial,Positive


In [21]:
# checking the dataset columns data types

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4305 entries, 0 to 4304
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Id                    4305 non-null   object
 1   Heading               4305 non-null   object
 2   Article.Banner.Image  1753 non-null   object
 3   Outlets               4305 non-null   object
 4   Article.Description   4305 non-null   object
 5   Full_Article          4305 non-null   object
 6   Article_Type          4305 non-null   object
 7   Tonality              3873 non-null   object
dtypes: object(8)
memory usage: 269.2+ KB


NOTE: 

based on above column data types, all columns we have object type only.

In [22]:
# checking dataset statistical information

df.describe()

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
count,4305,4305,1753,4305,4305,4305,4305,3873
unique,4305,4020,1686,1762,4291,4305,7,3
top,d6995462-5e87-453b-b64d-e9f1df6e94d2,Boeing CEO: First Operational Self-Flying Cars...,https://mmbiz.qpic.cn/mmbiz_jpg/myicofuNe4kzyY...,WeChat,In the intense contest to replace the Armys c...,<p>The helicopter that crashed in Southeast Al...,Commercial,Positive
freq,1,8,3,208,2,1,2470,3286


In [23]:
# check the dataset have any null values

df.isna().sum()

Id                         0
Heading                    0
Article.Banner.Image    2552
Outlets                    0
Article.Description        0
Full_Article               0
Article_Type               0
Tonality                 432
dtype: int64

NOTE:

Based on the above observation, we have two columns(Article.Banner.Image, Tonality) have some null values

In [24]:
df.head(1)

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
0,d6995462-5e87-453b-b64d-e9f1df6e94d2,"A Puzzling Maneuver, Then Freefall: NTSB Repor...",,Essex Caller,<p>The helicopter that crashed in Southeast Al...,<p>The helicopter that crashed in Southeast Al...,Commercial,Negative


In [25]:
df['Full_Article'][0]

'<p>The helicopter that crashed in Southeast Alaska in late September, killing three people, entered a 500-foot freefall before dropping to a Glacier Bay National Park beach, according to by the National Transportation Safety Board.&nbsp;The preliminary NTSB report released Friday offers no official probable cause. That determination won&lsquo;t be made until next year at the earliest.</p>'

In [26]:
import re

# for convient purpose we changed columns names

df['Full_Article'] = df['Full_Article'].astype(str).str.replace('<p>', '', regex=False).str.replace('</p>', '', regex=False)
df.rename({'Full_Article':'text', 'Article_Type':'category'}, axis=1, inplace=True)

In [27]:
df.columns

Index(['Id', 'Heading', 'Article.Banner.Image', 'Outlets',
       'Article.Description', 'text', 'category', 'Tonality'],
      dtype='object')

In [28]:
#we take text column in X which we will use to predict the category
#we take category column in Y which we will use to train our model as output using X
X = df['text']
X = np.array(X)
Y = df['category']
Y = np.array(Y)
#we vectorize text column using tfidf vectorizer.
#tfidf vectorizer works in two parts-tf and idf.
#tf is equal to number of occurences of the word in the document divided by total number of words present in that document
#idf is equal to log value of number of documents in which that word is present divided by total number of documents
#to read in detail about tfidf, kindly visit sklearn documentation pages
tfidf = TfidfVectorizer(lowercase=True, analyzer='word', stop_words='english', ngram_range=(1,3), use_idf=True)
X1 = tfidf.fit_transform(X)
n1 = LabelEncoder()
#we vectorize category column using labelencoder
#this will give a unique numerical value to every category
Y1 = n1.fit_transform(Y)
#we separate train and test datasets.
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y1, test_size=0.3, random_state=0)
#print(X_train)
#print(Y_train)

In [29]:
#implementing multinomial NB
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)
#print(mnb)
test = mnb.predict(X_test)
acc = 0
l = np.size(test, 0)
for i in range(l):
    acc = acc + abs(test[i]-Y_test[i])
print("Accuracy using Multinomial NB is: %f"%((1-acc/l)*100))

Accuracy using Multinomial NB is: 68.962848


In [30]:
#implementing Logistic Regression
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
lr.fit(X_train, Y_train)
#print(mnb)
test = lr.predict(X_test)
acc = 0
l = np.size(test, 0)
for i in range(l):
    acc = acc + abs(test[i]-Y_test[i])
print("Accuracy using Logistic Regression is: %f"%((1-acc/l)*100))



Accuracy using Logistic Regression is: 72.910217


In [31]:
import pickle

save_label = open("label_encoder.pickle", "wb")
pickle.dump(n1, save_label)
save_label.close()

save_label = open("label_encoder.pickle", "rb")
n1 = pickle.load(save_label)
save_label.close()

In [32]:
import pickle

save_tdidf = open("tdidf.pkl", "wb")
pickle.dump(tfidf, save_tdidf)
save_tdidf.close()

save_tdidf = open("tdidf.pkl", "rb")
tdidf = pickle.load(save_tdidf)
save_tdidf.close()

In [33]:
#now we will implement pickle
#we are using pickle to save our models
import pickle

save_classifier = open("naivebayes.pickle","wb")
pickle.dump(mnb, save_classifier)
save_classifier.close()

save_classifier = open("logistic.pickle","wb")
pickle.dump(lr, save_classifier)
save_classifier.close()
#here we saved both our models in naivebayes.pickle and logistic.pickle

In [34]:
#here we load naivebayes.pickle and logistic.pickle and use them
import pickle

classifier_f = open("naivebayes.pickle", "rb")
clf1 = pickle.load(classifier_f)
classifier_f.close()

classifier_f = open("logistic.pickle", "rb")
clf2 = pickle.load(classifier_f)
classifier_f.close()

In [35]:
import numpy as np

t = "Inmates in a Sri Lanka prison scuffled with guards and some tried to escape during a protest against new strict measures to control the spread of the coronavirus, leaving two dead and six wounded, police said Sunday."
t = np.array(t).reshape(-1,1)
t = tfidf.transform(t[0])
#print(t)
print(n1.inverse_transform(clf1.predict(t))[0])
print(n1.inverse_transform(clf2.predict(t))[0])

Commercial
Commercial
