# Gaussian NB

In [1]:
import numpy as np
from sklearn import datasets, metrics
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the iris dataset that ships with scikit-learn.
# Every row holds four measurements (sepal length, sepal width, petal length,
# petal width) and is labelled with one of 3 different flowers.

X, y = datasets.load_iris(return_X_y=True)
print(X.shape)

(150, 4)


In [3]:
# Deterministic interleaved split: even-numbered rows train the model,
# odd-numbered rows are held out for testing.
# Step slicing picks the same rows as range(0, 150, 2) / range(1, 150, 2) on the
# 150-row iris data, but does not hard-code the sample count, so it keeps
# working (and keeps using every row) if X and y have a different length.
# NOTE: step slices are views of X / y rather than copies; nothing in this
# notebook modifies them in place.
X_train = X[0::2, :]
y_train = y[0::2]

X_test = X[1::2, :]
y_test = y[1::2]

In [5]:
# Gaussian NB is used because the features are continuous numeric values
clr = GaussianNB()
clr.fit(X=X_train, y=y_train)

# predict a label for every held-out row
pred = clr.predict(X=X_test)

# compare the predictions with the true labels of the held-out rows
print("Accuracy proportion: ", metrics.accuracy_score(y_true=y_test, y_pred=pred))
print("Report: ", metrics.classification_report(y_true=y_test, y_pred=pred))
print("Confusion Matrix: \n", metrics.confusion_matrix(y_true=y_test, y_pred=pred))

Accuracy proportion:  0.96
Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00        25
           1       0.92      0.96      0.94        25
           2       0.96      0.92      0.94        25

    accuracy                           0.96        75
   macro avg       0.96      0.96      0.96        75
weighted avg       0.96      0.96      0.96        75

Confusion Matrix: 
 [[25  0  0]
 [ 0 24  1]
 [ 0  2 23]]


# Multinomial NB

In [1]:
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [5]:
# The CSV was prepared by hand from 50 different news headings taken from Google,
# each categorized as technology, politics or sports.
# Column 0 is the heading text and column 1 is its category; the words of each
# heading are later vectorised to classify its type.

# newline="" is what the csv module documentation asks for when opening a file
# for csv.reader, so that line breaks inside quoted fields are parsed correctly.
with open("news_classification.csv", "r", newline="") as f:
    data = list(csv.reader(f, delimiter=","))

# 2-D array of strings, one row per CSV line
data = np.array(data)

# Row 0 is skipped (presumably a header row - verify against the file).
# Rows 1-45 are used for training, everything from row 46 onwards for testing.
X_train = data[1:46, 0]
y_train = data[1:46, 1]
X_test = data[46:, 0]
y_test = data[46:, 1]

In [6]:
# Turn each heading into word-frequency features with CountVectorizer.
# The vocabulary is learned from the training headings only; the test headings
# are mapped onto that same vocabulary.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Fit Multinomial Naive Bayes on the word counts
classifier = MultinomialNB()
classifier.fit(X=X_train_counts, y=y_train)

# Predict the category of every test heading
pred = classifier.predict(X=X_test_counts)

# Compare the predictions with the true categories
print("Accuracy proportion: ", metrics.accuracy_score(y_true=y_test, y_pred=pred))
print("Report: ", metrics.classification_report(y_true=y_test, y_pred=pred))
print("Confusion Matrix: \n", metrics.confusion_matrix(y_true=y_test, y_pred=pred))

Accuracy proportion:  0.8
Report:                precision    recall  f1-score   support

    politics       0.50      1.00      0.67         1
      sports       1.00      0.50      0.67         2
  technology       1.00      1.00      1.00         2

    accuracy                           0.80         5
   macro avg       0.83      0.83      0.78         5
weighted avg       0.90      0.80      0.80         5

Confusion Matrix: 
 [[1 0 0]
 [1 1 0]
 [0 0 2]]


In [8]:
# Read one heading from the user and classify it with the fitted model
heading = input("Enter a news heading to classify its type: ")

# transform() takes a collection of documents, hence the one-element list
test_counts = vectorizer.transform([heading])
# note: this rebinds `pred`, replacing the test-set predictions
pred = classifier.predict(X=test_counts)
print("Label: ", pred[0])

Enter a news heading to classify its type: Basketball Team Wins Overtime Thriller
Label:  sports


# Bernoulli NB

In [1]:
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

In [7]:
# The CSV holds 50 hand-labelled one-line tweets.
# Column 0 is the tweet text and column 1 is its sentiment label.

# newline="" is what the csv module documentation asks for when opening a file
# for csv.reader, so that line breaks inside quoted fields are parsed correctly.
with open("sentiment_analysis.csv", "r", newline="") as f:
    data = list(csv.reader(f, delimiter=","))

# 2-D array of strings, one row per CSV line
data = np.array(data)

# Row 0 is skipped (presumably a header row - verify against the file; if the
# file has no header, the first tweet is silently dropped).
# Rows 1-39 are used for training, everything from row 40 onwards for testing.
X_train = data[1:40, 0]
y_train = data[1:40, 1]
X_test = data[40:, 0]
y_test = data[40:, 1]

In [8]:
# Turn each tweet into binary presence/absence features: binary=True makes
# CountVectorizer record whether a word occurs rather than how often.
# The vocabulary is learned from the training tweets only; the test tweets
# are mapped onto that same vocabulary.
vectorizer = CountVectorizer(binary=True)
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Fit Bernoulli Naive Bayes on the binary features
classifier = BernoulliNB()
classifier.fit(X=X_train_counts, y=y_train)

# Predict the sentiment of every test tweet
pred = classifier.predict(X=X_test_counts)

# Compare the predictions with the true sentiments
print("Accuracy proportion: ", metrics.accuracy_score(y_true=y_test, y_pred=pred))
print("Report: ", metrics.classification_report(y_true=y_test, y_pred=pred))
print("Confusion Matrix: \n", metrics.confusion_matrix(y_true=y_test, y_pred=pred))

Accuracy proportion:  0.8
Report:                precision    recall  f1-score   support

    negative       1.00      0.60      0.75         5
    positive       0.71      1.00      0.83         5

    accuracy                           0.80        10
   macro avg       0.86      0.80      0.79        10
weighted avg       0.86      0.80      0.79        10

Confusion Matrix: 
 [[3 2]
 [0 5]]


In [10]:
# Read one tweet from the user and classify it with the fitted model
tweet = input("Enter a tweet and find its sentiment: ")

# transform() takes a collection of documents, hence the one-element list
test_counts = vectorizer.transform([tweet])
# note: this rebinds `pred`, replacing the test-set predictions
pred = classifier.predict(X=test_counts)
print("Label: ", pred[0])

Enter a tweet and find its sentiment: This constant heat is ruining my mood
Label:  negative
