# Simple Naive Bayes Classifier

In [None]:
# Step 1: Load all the components we need to build this classifier

import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [None]:
# Step 2: Load the labelled data file. It consists of two fields, which are
# accessed further down as the TweetText and Sentiment columns.
#
# NOTE: the text could do with some clean-up before modelling, such as
# removing punctuation, capitalisation and stop words, and applying stemming
# or lemmatisation.

tweets = pd.read_csv(
    'c:/Users/ken/Downloads/car_reviews.csv',
    encoding='latin-1',
    header=0,  # the first row of the file holds the column names
)

# Preview the first three rows.
tweets.head(3)

In [None]:
# Step 3: Extract the fields

# The text column is the model input; the sentiment column is the target.
tweet_text = tweets['TweetText']
tweet_labels = tweets['Sentiment']

In [None]:
# Step 4: Set up the natural language toolkit tokenizer and the counter

# NLTK has a tokeniser designed for tweets. It is referenced through its
# documented location (nltk.tokenize) rather than the incidental nltk.casual
# alias. preserve_case=False lower-cases the tokens and reduce_len=True
# shortens long runs of a repeated character.

tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True)

# We will count the number of each word in the corpus and set frequency limits
# to reduce overall size: as floats, min_df/max_df are the minimum/maximum
# proportion of documents a term may appear in to be kept.
# token_pattern=None tells scikit-learn that the default regex is deliberately
# unused (we supply our own tokenizer), which avoids its "token_pattern will
# not be used" warning.

vectoriser = CountVectorizer(
    tokenizer=tokenizer.tokenize,
    token_pattern=None,
    min_df=0.007,
    max_df=0.991,
)

In [None]:
# Step 5: Build the corpus

# NOTE: the corpus is built from every row, so the vocabulary fitted from it
# also covers rows that later end up in the test split. We should really only
# use the training data here and ignore new words in the test data.
corpus = tweet_text.tolist()

In [None]:
# Step 6: Apply the counter/vectoriser

# Learn the vocabulary from the corpus and build the sparse document-term
# count matrix (one row per document, one column per retained term).
tweet_fitted = vectoriser.fit_transform(corpus)

# Number of retained terms. The fitted vocabulary_ mapping (term -> column
# index) is used instead of get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
features = len(vectoriser.vocabulary_)

print('Number of features', features)
# Show the count vector of the third document; only that one row is
# converted to a dense array rather than the whole matrix.
print(tweet_fitted[2].toarray().ravel())

In [None]:
# Step 7: Split the input data into training and test data. 80%/20%

# A fixed random_state keeps the split reproducible between runs.
train_tweets, test_tweets, train_labels, test_labels = train_test_split(
    tweet_fitted,
    tweet_labels,
    test_size=0.2,
    random_state=48746,
)

# Class proportions in the training split.
# NOTE(review): using sum() as the positive count assumes the labels are
# coded 0/1 -- confirm against the data file.
totaltweets = train_labels.count()
positives = train_labels.sum()
negatives = totaltweets - positives

print('p(1) =', positives / totaltweets)
print('p(0) =', negatives / totaltweets)



In [None]:
# Step 8: Create the model

# Multinomial Naive Bayes; fit_prior=True makes it learn the class priors
# from the training data.
model = MultinomialNB(fit_prior=True)

# Train on the training split's count vectors and their labels.
model.fit(X=train_tweets, y=train_labels)

In [None]:
# Step 9: Apply the model

# Predict a label for every row of the held-out test matrix.
test_pred = model.predict(test_tweets)

In [None]:
# Step 10: Get the confusion matrix of the predictions vs the ground truth of the test data.

# Rows correspond to the true labels, columns to the predicted labels.
conf_mat = confusion_matrix(y_true=test_labels, y_pred=test_pred)

print(conf_mat)

In [None]:
# Step 11: Sum the true predictions and divide by total predictions to calculate percentage correct.

# Flattening the 2x2 confusion matrix row by row yields TN, FP, FN, TP.
true_neg, false_pos, false_neg, true_pos = conf_mat.ravel()

# Every test example falls in exactly one cell, so the matrix total is the
# number of predictions made.
total_tweets = conf_mat.sum()

correctly_predicted = (true_pos + true_neg) / total_tweets * 100

print(correctly_predicted, '% correctly predicted')