# Naive Bayes Classification

&nbsp;

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd

In [2]:
# Loading DataFrame

# Required Library
import pickle

file = "pklFiles/DATAFRAME.pkl"
fileobj = open(file, 'rb')
df = pickle.load(fileobj)
fileobj.close()

print(type(df))
df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,TITLE,CATEGORY,ENCODED_CATEGORY
0,fed offici say weak data caus weather slow taper,Business,0
1,fed charl plosser see high bar chang pace taper,Business,0
2,us open stock fall fed offici hint acceler taper,Business,0
3,fed risk fall 'behind curv charl plosser say,Business,0
4,fed plosser nasti weather curb job growth,Business,0


In [3]:
# News Headlines
X = df['TITLE']

# Encoded News Category
y = df['ENCODED_CATEGORY']

In [4]:
# Splitting the dataset into Training set & Testing set

# Required Library
from sklearn.model_selection import train_test_split

# Testing_set = 25%  and  Training_set = 75%
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.25, random_state = 51)

In [5]:
print("Shape of X : " + str(X.shape))
print("Shape of y : " + str(y.shape))

print("\n")
print("Shape of X_train : " + str(X_train.shape))
print("Shape of y_train : " + str(y_train.shape))
print("Shape of X_test  : " + str(X_test.shape))
print("Shape of y_test  : " + str(y_test.shape))

Shape of X : (422419,)
Shape of y : (422419,)


Shape of X_train : (316814,)
Shape of y_train : (316814,)
Shape of X_test  : (105605,)
Shape of y_test  : (105605,)


&nbsp;

### Feature Selection :  TF-IDF Approach

In [6]:
# Feature Extraction

# Required Library
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiating TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fitting & Tranforming Training Data (X_train)
tfidf_X_train = tfidf_vectorizer.fit_transform(X_train.values)

# Tranforming Testing Data (X_test)
tfidf_X_test = tfidf_vectorizer.transform(X_test.values)

# Saving tfidf_vectorizer
pickle.dump(tfidf_vectorizer, open("pklFiles/tfidf_vectorizer.pkl","wb"))

In [7]:
# Multinomial Naive Bayes Classifier

# Required Library
from sklearn.naive_bayes import MultinomialNB

# Instantiating Naive Bayes Classifier with alpha = 1.0
nb_classifier = MultinomialNB()

# Fitting nb_classifier to Training Data
nb_classifier.fit(tfidf_X_train, y_train)

# Saving nb_classifier for tfidf_vectorizer
pickle.dump(nb_classifier, open("pklFiles/nb_classifier_for_tfidf_vectorizer.pkl", "wb"))

In [8]:
# Prediction
pred = nb_classifier.predict(tfidf_X_test)

In [9]:
# Accuracy Score & Confusion Matrix

# Required Library
from sklearn import metrics

print("Multinomial Naive Bayes : (TF-IDF Approach) \n")

# Accuracy
a_score = metrics.accuracy_score(y_test, pred)
print("Accuracy : " + str("{:.2f}".format(a_score*100)),'%')

print("\n")

# Confusion Matrix
# Labels : 0(Business), 1(Entertainment), 2(Health), 3(Science & Technology)
# By defualt, Horizontally, Labels are from 0 to 3
# By defualt, Vertically,   Labels are from 0 to 3
confusion_matrix = metrics.confusion_matrix(y_test, pred)

print("Confusion Matrix :")
print(confusion_matrix)

Multinomial Naive Bayes : (TF-IDF Approach) 

Accuracy : 92.08 %


Confusion Matrix :
[[26498   674   175  1844]
 [  530 36828   104   585]
 [  753   684  9588   305]
 [ 1910   690   113 24324]]


In [10]:
# Laplace Smoothing (Tunning parameter - alpha)

# List of alphas
alphas = np.arange(0,1,0.1)

# Function for training nb_classifier with different alpha values
def train_and_predict(alpha):
    
    # Instantiating Naive Bayes Classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    
    # Fitting nb_classifier to Training Data
    nb_classifier.fit(tfidf_X_train, y_train)
    
    # Prediction
    pred = nb_classifier.predict(tfidf_X_test)
    
    # Accuracy Score
    a_score = metrics.accuracy_score(y_test, pred)
    
    return a_score


# Iterating over alphas & printing the corresponding Accuracy Score
for alpha in alphas:
    print("Alpha : ", alpha)
    print("Accuracy Score : ", train_and_predict(alpha))
    print()

Alpha :  0.0
Accuracy Score :  0.9166043274466171

Alpha :  0.1
Accuracy Score :  0.9250508972113063

Alpha :  0.2


  'setting alpha = %.1e' % _ALPHA_MIN)


Accuracy Score :  0.9248615122390038

Alpha :  0.30000000000000004
Accuracy Score :  0.9243880498082477

Alpha :  0.4
Accuracy Score :  0.9239808721177974

Alpha :  0.5
Accuracy Score :  0.9234505941953506

Alpha :  0.6000000000000001
Accuracy Score :  0.9227404005492165

Alpha :  0.7000000000000001
Accuracy Score :  0.9221911841295394

Alpha :  0.8
Accuracy Score :  0.9218029449363193

Alpha :  0.9
Accuracy Score :  0.9212631977652573



With alpha = 1.0, we are getting accuracy of 92%.

Then, Trying different values of alpha,
Still we are getting approximate accuracy of 92%.

So, We don't need to change the value of alpha = 1.0

&nbsp;

### Feature Selection : Bag of Words (BOW) Approach

In [11]:
# Feature Extraction

# Required Library
from sklearn.feature_extraction.text import CountVectorizer

# Instantiating CountVectorizer
count_vectorizer = CountVectorizer()

# Fitting & Tranforming Training Data (X_train)
count_X_train = count_vectorizer.fit_transform(X_train.values)

# Tranforming Testing Data (X_test)
count_X_test = count_vectorizer.transform(X_test.values)

# Saving count_vectorizer
pickle.dump(count_vectorizer, open("pklFiles/count_vectorizer.pkl","wb"))

In [12]:
# Multinomial Naive Bayes Classifier

# Required Library
from sklearn.naive_bayes import MultinomialNB

# Instantiating Naive Bayes Classifier with alpha = 1.0
nb_classifier = MultinomialNB()

# Fitting nb_classifier to Training Data
nb_classifier.fit(count_X_train, y_train)

# Saving nb_classifier for count_vectorizer
pickle.dump(nb_classifier, open("pklFiles/nb_classifier_for_count_vectorizer.pkl", "wb"))

In [13]:
# Prediction
pred = nb_classifier.predict(count_X_test)

In [14]:
# Accuracy Score & Confusion Matrix

# Required Library
from sklearn import metrics

print("Multinomial Naive Bayes : (BOW Approach) \n")

# Accuracy
a_score = metrics.accuracy_score(y_test, pred)
print("Accuracy : " + str("{:.2f}".format(a_score*100)),'%')

print("\n")

# Confusion Matrix
# Labels : 0(Business), 1(Entertainment), 2(Health), 3(Science & Technology)
# By defualt, Horizontally, Labels are from 0 to 3
# By defualt, Vertically,   Labels are from 0 to 3
confusion_matrix = metrics.confusion_matrix(y_test, pred)

print("Confusion Matrix :")
print(confusion_matrix)

Multinomial Naive Bayes : (BOW Approach) 

Accuracy : 92.23 %


Confusion Matrix :
[[26271   556   421  1943]
 [  604 36433   303   707]
 [  460   364 10300   206]
 [ 1819   534   284 24400]]


In [15]:
# Laplace Smoothing (Tunning parameter - alpha)

# List of alphas
alphas = np.arange(0,1,0.1)

# Function for training nb_classifier with different alpha values
def train_and_predict(alpha):
    
    # Instantiating Naive Bayes Classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    
    # Fitting nb_classifier to Training Data
    nb_classifier.fit(count_X_train, y_train)
    
    # Prediction
    pred = nb_classifier.predict(count_X_test)
    
    # Accuracy Score
    a_score = metrics.accuracy_score(y_test, pred)
    
    return a_score


# Iterating over alphas & printing the corresponding Accuracy Score
for alpha in alphas:
    print("Alpha : ", alpha)
    print("Accuracy Score : ", train_and_predict(alpha))
    print()

Alpha :  0.0
Accuracy Score :  0.9182614459542635

Alpha :  0.1
Accuracy Score :  0.925107712702997

Alpha :  0.2


  'setting alpha = %.1e' % _ALPHA_MIN)


Accuracy Score :  0.9244353960513233

Alpha :  0.30000000000000004
Accuracy Score :  0.9241607878414848

Alpha :  0.4
Accuracy Score :  0.923791487145495

Alpha :  0.5
Accuracy Score :  0.9235074096870414

Alpha :  0.6000000000000001
Accuracy Score :  0.9233369632119691

Alpha :  0.7000000000000001
Accuracy Score :  0.9230528857535154

Alpha :  0.8
Accuracy Score :  0.9229392547701339

Alpha :  0.9
Accuracy Score :  0.922711992803371



With alpha = 1.0, we are getting accuracy of 92%

Then, Trying different values of alpha,
Still we are getting approximate accuracy of 92%.

So, We don't need to change the value of alpha = 1.0

&nbsp;

In [16]:
# Prediction of User News Headline

# Loading Model
count_vectorizer = pickle.load(open("pklFiles/count_vectorizer.pkl","rb"))
nb_classifier = pickle.load(open("pklFiles/nb_classifier_for_count_vectorizer.pkl","rb"))

In [17]:
# Values encoded by LabelEncoder
encoded = {0:'Business', 1:'Entertainment', 2:'Health', 3:'Science & Technology'}

# Input
user_headline = [input("News Headline : ")]

# Transformation & Prediction of User Headline
headline_counts = count_vectorizer.transform(user_headline)
prediction = nb_classifier.predict(headline_counts)

print("News Category : ", encoded[prediction[0]])

News Headline : Apple iPhone 12 likely to feature new LiDAR sensor.
News Category :  Science & Technology
