# In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from collections import *
import time

# Fetch the NLTK resources used for stop-word filtering and tokenization.
nltk.download('stopwords')
nltk.download('punkt')
start_time = time.time()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
data = pd.read_csv('rt_Reviews.csv', encoding='windows-1252')
# Convert the Freshness column to binary labels (1 for 'fresh', 0 otherwise).
data['Freshness'] = (data['Freshness'] == 'fresh').astype(int)
# Shuffle the rows.
# NOTE(review): no random_state is passed, so every run yields a different split.
data = data.sample(frac=1).reset_index(drop=True)
# Split the data into train, dev, and test sets. The boundaries are shared
# between adjacent slices so the splits cannot overlap (the dev slice used to
# start at row 280000, which put 8000 training rows into the dev set).
TRAIN_END = 288000
DEV_END = 384000
train_data = data[:TRAIN_END]
dev_data = data[TRAIN_END:DEV_END]
test_data = data[DEV_END:]
x_train = train_data['Review']
y_train = train_data['Freshness']
x_dev = dev_data['Review']
y_dev = dev_data['Freshness']
x_test = test_data['Review']
y_test = test_data['Freshness']

# print(x_train)

#bbbb


def get_vocabulary(dataset, min_count=5):
    """Build a word -> index mapping from the reviews in ``dataset``.

    Each review is tokenized with ``nltk.word_tokenize``; tokens found in the
    module-level ``stop_words`` set are dropped (the check is made on the raw
    token, before stemming) and the rest are stemmed with the module-level
    ``stemmer``.

    Args:
        dataset: Object whose ``'Review'`` entry is an iterable of review
            strings (e.g. a DataFrame with a ``Review`` column).
        min_count: Minimum number of occurrences a stemmed token needs across
            all reviews to be kept. Defaults to 5, the previously hard-coded
            threshold.

    Returns:
        dict mapping each kept stemmed token to a consecutive integer index,
        assigned in order of first appearance.
    """
    vocab_counts = Counter()
    for review in dataset['Review']:
        vocab_counts.update(
            stemmer.stem(token)
            for token in nltk.word_tokenize(review)
            if token not in stop_words
        )
    vocab = [word for word, count in vocab_counts.items() if count >= min_count]
    return {word: index for index, word in enumerate(vocab)}




# Word -> index lookup built from the reviews of the training split.
train_vocab = get_vocabulary(dataset=train_data)

# Calculate the probability of a word occurring in a review
def calc_word_occurrence_probability(word, dataset):
    """Return the fraction of reviews in ``dataset`` that contain ``word``.

    A review contains the word when the word equals one of the
    whitespace-separated pieces of the lowercased review text. The query word
    is lowercased as well, so matching is case-insensitive.

    Args:
        word: Word to look for.
        dataset: DataFrame with a ``Review`` column of strings.

    Returns:
        Number of reviews containing the word divided by the number of rows,
        or 0.0 when the dataset has no rows.
    """
    total_num_documents = len(dataset)
    if total_num_documents == 0:
        # Nothing to estimate from; avoid dividing by zero.
        return 0.0

    # Reviews are lowercased before matching, so the query must be too.
    word = word.lower()
    num_documents_with_word = sum(
        1 for review in dataset['Review'] if word in review.lower().split()
    )
    return num_documents_with_word / total_num_documents
# Share of training reviews that contain the word "the".
# NOTE(review): the variable name says "good" but the queried word is "the" - confirm which is intended.
p_good_train = calc_word_occurrence_probability(word="the", dataset=train_data)


def calc_word_sentiment_probability(word, sentiment, dataset):
    """Return the fraction of reviews with ``sentiment`` that contain ``word``.

    A review contains the word when the word equals one of the
    whitespace-separated pieces of the lowercased review text. The query word
    is lowercased as well, so matching is case-insensitive.

    Args:
        word: Word to look for.
        sentiment: Value of the ``Freshness`` column selecting the class.
        dataset: DataFrame with ``Review`` (strings) and ``Freshness`` columns.

    Returns:
        Count of reviews that have the sentiment and contain the word, divided
        by the count of reviews that have the sentiment; 0.0 when no review has
        the sentiment.
    """
    # Select the class once with a boolean mask instead of two row-by-row passes.
    has_sentiment = dataset['Freshness'] == sentiment
    num_documents_with_sentiment = int(has_sentiment.sum())
    if num_documents_with_sentiment == 0:
        # No review of this class; avoid dividing by zero.
        return 0.0

    # Reviews are lowercased before matching, so the query must be too.
    word = word.lower()
    num_documents_with_word_and_sentiment = sum(
        1
        for text in dataset.loc[has_sentiment, 'Review']
        if word in text.lower().split()
    )
    return num_documents_with_word_and_sentiment / num_documents_with_sentiment
# Word whose class-conditional document frequency is reported below. The
# message and the variable name both refer to 'the', so the query uses the same
# word (it used to be computed for "good" while the output claimed 'the').
sentiment_word = "the"
p_the_positive_train = calc_word_sentiment_probability(sentiment_word, 1, train_data)
print("Probability of '{}' given a positive sentiment in train dataset is {}".format(sentiment_word, p_the_positive_train * 100))
# Class priors: share of training reviews labelled 1, and its complement.
num_documents_with_sentiment = train_data['Freshness'].sum()
p_positive = num_documents_with_sentiment / len(train_data)
p_negative = 1 - p_positive



def predict_review_sentiment(review, word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative):
    """Classify one review as positive (1) or negative (0).

    The review is tokenized with ``nltk.word_tokenize``; tokens in the
    module-level ``stop_words`` set are dropped and the rest are stemmed. Each
    class score starts at the log of its prior and gains the log of the
    per-class probability of every stemmed token present in ``word_to_index``.
    Tokens missing from ``word_to_index`` are ignored.

    Args:
        review: Review text.
        word_to_index: Mapping from stemmed token to a position in the
            probability sequences.
        positive_word_probs: Per-token probabilities for the positive class.
        negative_word_probs: Per-token probabilities for the negative class.
        p_positive: Prior probability of the positive class.
        p_negative: Prior probability of the negative class.

    Returns:
        1 when the positive score is strictly greater than the negative score,
        otherwise 0.
    """
    stems = [
        stemmer.stem(raw_token)
        for raw_token in nltk.word_tokenize(review)
        if raw_token not in stop_words
    ]

    # Begin from the log priors, then add one log-likelihood term per known stem.
    score_pos = np.log(p_positive)
    score_neg = np.log(p_negative)
    for stem in stems:
        if stem not in word_to_index:
            continue
        position = word_to_index[stem]
        score_pos += np.log(positive_word_probs[position])
        score_neg += np.log(negative_word_probs[position])

    # Ties go to the negative class.
    return 1 if score_pos > score_neg else 0


def calculate_accuracy(dataset, word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative):
    """Return the share of reviews in ``dataset`` that are classified correctly.

    Every review is classified with ``predict_review_sentiment`` and the
    prediction is compared with the row's ``Freshness`` value.

    Args:
        dataset: DataFrame with ``Review`` and ``Freshness`` columns.
        word_to_index: Mapping from stemmed token to probability index.
        positive_word_probs: Per-token probabilities for the positive class.
        negative_word_probs: Per-token probabilities for the negative class.
        p_positive: Prior probability of the positive class.
        p_negative: Prior probability of the negative class.

    Returns:
        Number of correct predictions divided by the number of rows, or 0.0
        when the dataset has no rows.
    """
    total = len(dataset)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # Walk the two columns in lockstep rather than building a Series per row.
    num_correct = sum(
        1
        for text, label in zip(dataset['Review'], dataset['Freshness'])
        if predict_review_sentiment(
            text, word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative
        ) == label
    )
    return num_correct / total


def train_naive_bayes(x_train, y_train):
    """Fit the Naive Bayes word model on the given reviews and labels.

    The model is trained on the ``x_train`` / ``y_train`` arguments; earlier
    the arguments were ignored and the module-level ``train_data`` was read
    instead.

    Args:
        x_train: Iterable of review strings.
        y_train: Iterable of labels aligned with ``x_train``; 1 marks the
            positive class, any other value the negative class.

    Returns:
        Tuple ``(word_to_index, positive_word_probs, negative_word_probs,
        p_positive, p_negative)`` where the probability arrays are indexed by
        ``word_to_index`` values and use add-one smoothing over the vocabulary.

    Raises:
        ValueError: If ``x_train`` and ``y_train`` differ in length.
    """
    reviews = list(x_train)
    labels = np.asarray(list(y_train))
    if len(reviews) != len(labels):
        raise ValueError("x_train and y_train must have the same length")

    # get_vocabulary reads the 'Review' entry of its argument, so wrap the reviews.
    word_to_index = get_vocabulary(pd.DataFrame({'Review': reviews}))

    # Per-class occurrence counts for every vocabulary entry.
    positive_word_counts = np.zeros(len(word_to_index))
    negative_word_counts = np.zeros(len(word_to_index))

    for review, label in zip(reviews, labels):
        class_counts = positive_word_counts if label == 1 else negative_word_counts
        for token in nltk.word_tokenize(review):
            if token in stop_words:
                continue
            token_index = word_to_index.get(stemmer.stem(token))
            if token_index is not None:
                class_counts[token_index] += 1

    # Class priors: share of positive labels and its complement.
    p_positive = labels.sum() / len(labels)
    p_negative = 1 - p_positive

    # Add-one smoothing: every vocabulary entry gets one extra count per class.
    vocabulary_size = len(word_to_index)
    positive_word_probs = (positive_word_counts + 1) / (positive_word_counts.sum() + vocabulary_size)
    negative_word_probs = (negative_word_counts + 1) / (negative_word_counts.sum() + vocabulary_size)

    return word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative


# Fit the classifier on the training split and keep the returned pieces under
# their individual names.
nb_model = train_naive_bayes(x_train, y_train)
word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative = nb_model

# Score the fitted model on the development split and report it.
accuracy = calculate_accuracy(dev_data, *nb_model)
print('Accuracy on dev set: {:.2%}'.format(accuracy))

def top_words(word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative, top_n=10):
    """Return the words that most strongly indicate each class.

    For every word the class posteriors are computed as
    ``P(word | class) * P(class)`` divided by the sum of that product over both
    classes. Words are ranked by posterior in descending order; words with
    equal scores keep their ``word_to_index`` order.

    Args:
        word_to_index: Mapping from word to its position in the probability
            sequences.
        positive_word_probs: Per-word probabilities for the positive class.
        negative_word_probs: Per-word probabilities for the negative class.
        p_positive: Prior probability of the positive class.
        p_negative: Prior probability of the negative class.
        top_n: How many words to return per class. Defaults to 10, the
            previously hard-coded value.

    Returns:
        Tuple ``(top_positive_words, top_negative_words)`` of lists.
    """
    # Compute both posteriors once per word instead of inside each sort key.
    positive_posterior = {}
    negative_posterior = {}
    for word, index in word_to_index.items():
        positive_joint = positive_word_probs[index] * p_positive
        negative_joint = negative_word_probs[index] * p_negative
        evidence = positive_joint + negative_joint
        if evidence == 0:
            # The word has zero probability under both classes; rank it last.
            positive_posterior[word] = 0.0
            negative_posterior[word] = 0.0
        else:
            positive_posterior[word] = positive_joint / evidence
            negative_posterior[word] = negative_joint / evidence

    top_positive_words = sorted(word_to_index, key=positive_posterior.__getitem__, reverse=True)[:top_n]
    top_negative_words = sorted(word_to_index, key=negative_posterior.__getitem__, reverse=True)[:top_n]

    return top_positive_words, top_negative_words

# Rank the vocabulary for each class with the currently fitted model.
top_positive_words, top_negative_words = top_words(
    word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative
)

# Heading on one line, the word list on the next.
print("Top 10 words predicting positive class:", top_positive_words, sep="\n")
print("Top 10 words predicting negative class:", top_negative_words, sep="\n")


def smoothings(x_train, y_train, smoothing):
    """Fit a Naive Bayes word model with additive smoothing.

    Each training example may be a raw review string or an already tokenized
    iterable of words. Raw strings are tokenized with ``nltk.word_tokenize``,
    filtered against the module-level ``stop_words`` and stemmed with the
    module-level ``stemmer``, which is the same preprocessing
    ``predict_review_sentiment`` applies. Previously a raw string was iterated
    character by character, so the vocabulary consisted of characters.

    Word probabilities are ``(count + smoothing) / (class token total +
    smoothing * vocabulary size)``, so for ``smoothing > 0`` they sum to 1
    within each class. Previously the denominator used the number of documents
    in the class.

    Args:
        x_train: Iterable of review strings or of token iterables.
        y_train: Iterable of labels aligned with ``x_train``; 1 marks the
            positive class, any other value the negative class.
        smoothing: Additive smoothing constant.

    Returns:
        Tuple ``(word_to_index, positive_word_probs, negative_word_probs,
        p_positive, p_negative)``; the probability lists are indexed by
        ``word_to_index`` values, which follow first-appearance order.

    Raises:
        ValueError: If ``y_train`` is empty.
    """
    labels = list(y_train)
    if not labels:
        raise ValueError("y_train must contain at least one label")

    # Prior probabilities P(positive) and P(negative)
    n_positive = sum(1 for label in labels if label == 1)
    p_positive = n_positive / len(labels)
    p_negative = 1 - p_positive

    # Count word occurrences per class and index words as they first appear.
    positive_word_counts = Counter()
    negative_word_counts = Counter()
    word_to_index = {}
    for words, label in zip(x_train, labels):
        if isinstance(words, str):
            # Raw review text: apply the tokenizer / stop-word / stemmer pipeline.
            words = [
                stemmer.stem(token)
                for token in nltk.word_tokenize(words)
                if token not in stop_words
            ]
        class_counts = positive_word_counts if label == 1 else negative_word_counts
        for word in words:
            class_counts[word] += 1
            word_to_index.setdefault(word, len(word_to_index))

    num_words = len(word_to_index)
    positive_total = sum(positive_word_counts.values()) + smoothing * num_words
    negative_total = sum(negative_word_counts.values()) + smoothing * num_words

    def _smoothed(count, total):
        # A zero total only occurs with smoothing == 0 and a class without tokens.
        return (count + smoothing) / total if total else 0.0

    # Counter lookups of unseen words return 0 without inserting the key.
    positive_word_probs = [_smoothed(positive_word_counts[word], positive_total) for word in word_to_index]
    negative_word_probs = [_smoothed(negative_word_counts[word], negative_total) for word in word_to_index]

    return word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative


# Train one classifier per smoothing value and report its accuracy on the
# development set. The last value in the tuple is the one whose model is then
# evaluated on the test set.
# NOTE(review): 0.5 is treated as the chosen setting; confirm it was selected from dev accuracy.
for smoothing_value in (1, 10, 0.5):
    word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative = smoothings(
        x_train, y_train, smoothing=smoothing_value
    )
    accuracy = calculate_accuracy(
        dev_data, word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative
    )
    print('Accuracy on dev set with smoothing={}: {:.2%}'.format(smoothing_value, accuracy))

# Calculate the accuracy of the last trained classifier on the test set. The
# label is taken from the loop variable so it always names the model that is
# evaluated (it used to say smoothing=1 for the smoothing=0.5 model).
accuracy = calculate_accuracy(
    test_data, word_to_index, positive_word_probs, negative_word_probs, p_positive, p_negative
)
print('Accuracy on test set with smoothing={}: {:.2%}'.format(smoothing_value, accuracy))

end_time = time.time()
print('Time taken: {:.2f} seconds'.format(end_time - start_time))