# Naive Bayes with MultinomialNB classifier

In [None]:
# Importing the required libraries
#pip install nltk

import re
import nltk

# Fetch the NLTK resources this notebook relies on, one download call per
# resource, in a fixed order
for resource_name in ('punkt', 'words', 'movie_reviews', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource_name)

from nltk.tokenize import word_tokenize
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import movie_reviews

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Define a function to check if a word is an English word
def is_english_word(word):
    """Return True when the lowercased ``word`` is present in ``english_words``."""
    lowered = word.lower()
    return lowered in english_words

# Define a set of English words.
# is_english_word() lowercases the word it looks up, so the entries are
# lowercased here too; otherwise capitalized entries in the word list could
# never be matched.
english_words = {entry.lower() for entry in words.words()}

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Clean, tokenize, and lemmatize a text, keeping only alphabetic English words
def preprocess_text(text):
    """Return ``text`` as a space-joined string of cleaned, lemmatized words.

    Characters other than ASCII letters, digits, and whitespace are deleted,
    the remainder is lowercased and tokenized, and every token is lemmatized.
    Only lemmas that are purely alphabetic and pass ``is_english_word`` are
    kept, in their original order.
    """
    # Delete (not replace) everything that is not a letter, digit, or whitespace
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Lowercase first so the tokens and lemmas are lowercase as well
    tokens = word_tokenize(stripped.lower())

    # Lemmatize lazily, then keep only alphabetic English lemmas
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    kept = [lemma for lemma in lemmas if lemma.isalpha() and is_english_word(lemma)]

    return ' '.join(kept)

# File ids of the movie_reviews corpus, grouped by sentiment category
positive_reviews = movie_reviews.fileids('pos')
negative_reviews = movie_reviews.fileids('neg')

# Build (preprocessed_text, label) pairs: all positive reviews first, then all
# negative reviews
all_reviews = [
    (preprocess_text(movie_reviews.raw(fileid)), label)
    for label, fileids in (('pos', positive_reviews), ('neg', negative_reviews))
    for fileid in fileids
]


In [None]:
# Hold out 20% of the labelled reviews for testing (fixed seed for repeatability)
train_reviews, test_reviews = train_test_split(all_reviews, test_size=0.2, random_state=42)

# Split each (text, label) pair into parallel lists of texts and labels (pos/neg)
train_texts = [review[0] for review in train_reviews]
test_texts = [review[0] for review in test_reviews]
y_train = [review[1] for review in train_reviews]
y_test = [review[1] for review in test_reviews]

# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to encode the test texts
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Terms learned by the vectorizer
vocabulary = vectorizer.get_feature_names_out()

# Train the multinomial Naive Bayes classifier on the training features
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Predicted sentiment label for every test review
nb_predictions = nb_classifier.predict(X_test)

# Fraction of test reviews whose predicted label equals the true label
accuracy = accuracy_score(y_test, nb_predictions)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 summary
print("Classification Report for Naive Bayes Sentiment Analysis:\n")
print(classification_report(y_test, nb_predictions))

In [None]:
# POS-Tagging with NLTK

def print_top_sentiment_words(words_list, category_name, num_words=10):
    """Print the words the Naive Bayes model scores as most positive / negative.

    Every word of ``words_list`` that passes ``is_english_word`` is scored as
    a one-word document with ``nb_classifier.predict_proba``. Words whose
    positive probability is higher are listed under "Positive", words whose
    negative probability is higher under "Negative"; exact ties are omitted.

    Args:
        words_list: Iterable of candidate words.
        category_name: Label used in the printed headings (e.g. "Adjectives").
        num_words: Maximum number of words printed per polarity.
    """
    # De-duplicate while keeping first-seen order
    candidates = list(dict.fromkeys(word for word in words_list if is_english_word(word)))

    # Score all candidates with one vectorizer/classifier call instead of one
    # call per word; skip the call entirely when there is nothing to score.
    if candidates:
        probabilities = nb_classifier.predict_proba(vectorizer.transform(candidates))
    else:
        probabilities = []

    # Columns follow the classifier's sorted class labels; with the labels
    # 'neg' and 'pos' used for training, column 0 is negative and column 1 is
    # positive.
    positive_words = []
    negative_words = []
    for word, sentiment in zip(candidates, probabilities):
        if sentiment[1] > sentiment[0]:
            positive_words.append((word, sentiment[1]))
        elif sentiment[0] > sentiment[1]:
            negative_words.append((word, sentiment[0]))

    # Highest-confidence words first
    positive_words.sort(key=lambda x: x[1], reverse=True)
    negative_words.sort(key=lambda x: x[1], reverse=True)

    print(f"Top {num_words} Positive {category_name}:")
    for word, sentiment in positive_words[:num_words]:
        print(f"{word}: {sentiment:.4f} sentiment")

    print(f"\nTop {num_words} Negative {category_name}:")
    for word, sentiment in negative_words[:num_words]:
        print(f"{word}: {sentiment:.4f} sentiment")


# Separate words based on their POS tags - NLTK POS
# Each vocabulary word is tagged on its own (a one-word input, so no sentence
# context) exactly once; the tag is then reused for all three category filters
# instead of re-tagging the whole vocabulary per category.
vocabulary_tags = [(word, nltk.pos_tag([word])[0][1]) for word in vocabulary]

adjectives = [word for word, tag in vocabulary_tags if tag.startswith('JJ')]
verbs = [word for word, tag in vocabulary_tags if tag.startswith('VB')]
nouns = [word for word, tag in vocabulary_tags if tag.startswith('NN')]


# Print top sentiment words for each category
print_top_sentiment_words(adjectives, "Adjectives")
print()
print_top_sentiment_words(verbs, "Verbs")
print()
print_top_sentiment_words(nouns, "Nouns")


In [None]:
# Interactive prompt: classify each line the user types until they enter 'exit'
while True:
    user_input = input("Enter your text (or 'exit' to quit): ")

    # Ignore surrounding whitespace so " exit " also ends the session
    if user_input.strip().lower() == 'exit':
        print("Exiting the program.")
        break

    # Apply the same cleaning and lemmatization used on the training reviews
    # before vectorizing; the vectorizer's vocabulary only contains
    # preprocessed lemmas, so raw inflected forms would otherwise be ignored.
    user_input_vector = vectorizer.transform([preprocess_text(user_input)])

    # Predict sentiment probabilities using the trained Naive Bayes model
    sentiment_probabilities = nb_classifier.predict_proba(user_input_vector)[0]

    # Get the predicted sentiment class
    predicted_sentiment_class = nb_classifier.predict(user_input_vector)[0]

    predicted_emotion = "positive" if predicted_sentiment_class == 'pos' else "negative"

    # Print the predicted sentiment and confidence for each class.
    # Probability columns follow the sorted class labels: 0 -> 'neg', 1 -> 'pos'.
    print(f"Predicted Sentiment: {predicted_emotion}")
    print(f"Positive Confidence: {sentiment_probabilities[1]:.4f}")
    print(f"Negative Confidence: {sentiment_probabilities[0]:.4f}")


# BERT - Distilbert-base-uncased

Data

In [None]:
# Importing the required libraries
%pip install nltk
%pip install transformers

import re
import nltk

# Fetch the NLTK resources this notebook relies on, one download call per
# resource, in a fixed order
for resource_name in ('punkt', 'words', 'movie_reviews', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource_name)

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import movie_reviews
from collections import defaultdict

In [None]:
# Define a function to check if a word is an English word
def is_english_word(word):
    """Return True when the lowercased ``word`` is present in ``english_words``."""
    lowered = word.lower()
    return lowered in english_words

# Define a set of English words.
# is_english_word() lowercases the word it looks up, so the entries are
# lowercased here too; otherwise capitalized entries in the word list could
# never be matched.
english_words = {entry.lower() for entry in words.words()}

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Clean, tokenize, and lemmatize a text, keeping only alphabetic English words
def preprocess_text(text):
    """Return ``text`` as a space-joined string of cleaned, lemmatized words.

    Characters other than ASCII letters, digits, and whitespace are deleted,
    the remainder is lowercased and tokenized, and every token is lemmatized.
    Only lemmas that are purely alphabetic and pass ``is_english_word`` are
    kept, in their original order.
    """
    # Delete (not replace) everything that is not a letter, digit, or whitespace
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Lowercase first so the tokens and lemmas are lowercase as well
    tokens = word_tokenize(stripped.lower())

    # Lemmatize lazily, then keep only alphabetic English lemmas
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    kept = [lemma for lemma in lemmas if lemma.isalpha() and is_english_word(lemma)]

    return ' '.join(kept)

# File ids of the movie_reviews corpus, grouped by sentiment category
positive_reviews = movie_reviews.fileids('pos')
negative_reviews = movie_reviews.fileids('neg')

# Build (preprocessed_text, label) pairs: all positive reviews first, then all
# negative reviews.
# NOTE: the model cell that follows reassigns all_reviews from the raw review
# text, so this preprocessed list is not what DistilBERT is trained on.
all_reviews = [
    (preprocess_text(movie_reviews.raw(fileid)), label)
    for label, fileids in (('pos', positive_reviews), ('neg', negative_reviews))
    for fileid in fileids
]


In [None]:
# Load pre-trained BERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)


# Data preprocessing
positive_reviews = movie_reviews.fileids('pos')
negative_reviews = movie_reviews.fileids('neg')

all_reviews = [movie_reviews.raw(fileid) for fileid in positive_reviews] + \
              [movie_reviews.raw(fileid) for fileid in negative_reviews]
labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)  # 1 for positive, 0 for negative

# Tokenize and encode the reviews
encoded_reviews = tokenizer(all_reviews, padding=True, truncation=True, return_tensors='pt')

# Split the data into training and testing sets
train_inputs, test_inputs, train_labels, test_labels = train_test_split(encoded_reviews['input_ids'],
                                                                      torch.tensor(labels),
                                                                      test_size=0.2,
                                                                      random_state=42)

# Define a data loader for training
batch_size = 8
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Fine-tune BERT on the sentiment analysis task
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 1  # Increase this for better performance
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss:.4f}")



In [None]:
# Evaluation on the test set
model.eval()

# Move test inputs and labels to the device
test_inputs, test_labels = test_inputs.to(device), test_labels.to(device)

# Initialize variables to store predictions
all_predicted_labels = []
batch_size = 8  # Adjust the batch size for inference

# Perform inference in batches to reduce GPU memory usage
with torch.no_grad():
    for i in range(0, len(test_inputs), batch_size):
        batch_inputs = test_inputs[i:i+batch_size]

        # Forward pass for the batch
        batch_outputs = model(batch_inputs)
        batch_logits = batch_outputs.logits

        # Convert logits to probabilities and get predicted labels
        batch_probs = torch.softmax(batch_logits, dim=1)
        batch_predicted_labels = torch.argmax(batch_probs, dim=1).cpu().numpy()

        # Append predicted labels for this batch to the list
        all_predicted_labels.extend(batch_predicted_labels)

# Convert the list of predicted labels to a numpy array
predicted_labels = np.array(all_predicted_labels)

# Calculate accuracy
accuracy = accuracy_score(test_labels.cpu().numpy(), predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# Generate Classification Report
class_names = ['negative', 'positive']
report = classification_report(test_labels.cpu().numpy(), predicted_labels, target_names=class_names)
print("Classification Report for BERT Sentiment Analysis:\n")
print(report)


In [None]:
import torch
import numpy as np

# Interactive prompt: classify each line the user types until they enter 'exit'
while True:
    user_input = input("Enter your text (or 'exit' to quit): ")

    if user_input.lower() == 'exit':
        print("Exiting the program.")
        break

    # Encode the text and place every resulting tensor on the model's device
    user_input_tokens = tokenizer(user_input, padding=True, truncation=True, return_tensors='pt')
    user_input_encoded = {
        key: val.to(device)
        for key, val in user_input_tokens.items()
    }

    # Forward pass without gradient tracking; softmax turns logits into
    # per-class probabilities
    with torch.no_grad():
        user_output = model(**user_input_encoded)
        user_logits = user_output.logits
        user_probs = torch.softmax(user_logits, dim=1).cpu().numpy()

    # Display names for the two class indices
    class_names = {0: "Negative", 1: "Positive"}

    # Index of the largest probability (argmax over the flattened 1 x 2 array)
    predicted_sentiment_idx = user_probs.argmax()
    predicted_sentiment = class_names[predicted_sentiment_idx].capitalize()

    # Report the winning class followed by the probability of every class
    print(f"Predicted Sentiment: {predicted_sentiment}")
    for class_idx in class_names:
        class_name = class_names[class_idx]
        print(f"{class_name.capitalize()} Sentiment Probability: {user_probs[0][class_idx]:.4f}")


# Prototype Chatbot with Sentiments - BERT

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BlenderbotSmallForConditionalGeneration, BlenderbotSmallTokenizer

# Load DistilBERT for sentiment analysis.
# Prefer the classifier fine-tuned earlier in this notebook (`model` /
# `tokenizer`): the base checkpoint carries no trained classification head, so
# a freshly loaded copy gives essentially arbitrary sentiment predictions.
sentiment_model_name = "distilbert-base-uncased"
try:
    sentiment_tokenizer, sentiment_model = tokenizer, model
except NameError:
    # Standalone run (fine-tuning cells not executed): fall back to the base
    # checkpoint so the cell still works, but tell the user what to expect.
    sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
    print("Warning: using an un-fine-tuned sentiment model; run the fine-tuning cells first for meaningful predictions.")
sentiment_model.eval()

# Load BlenderBot chatbot
chatbot_model_name = "facebook/blenderbot_small-90M"
chatbot_tokenizer = BlenderbotSmallTokenizer.from_pretrained(chatbot_model_name)
chatbot_model = BlenderbotSmallForConditionalGeneration.from_pretrained(chatbot_model_name)

while True:
    user_input = input("Enter your text (or 'exit' to quit): ")

    if user_input.lower() == 'exit':
        print("Exiting the program.")
        break

    # Sentiment analysis using DistilBERT; tensors are moved to whichever
    # device the sentiment model lives on (CPU, or GPU after fine-tuning)
    user_input_tokens = sentiment_tokenizer(user_input, padding=True, truncation=True, return_tensors='pt')
    user_input_encoded = {key: val.to(sentiment_model.device) for key, val in user_input_tokens.items()}

    with torch.no_grad():
        user_output = sentiment_model(**user_input_encoded)
        user_logits = user_output.logits
        user_probs = torch.softmax(user_logits, dim=1).cpu().numpy()

    class_names = {0: "Negative", 1: "Positive"}
    predicted_sentiment_idx = np.argmax(user_probs)
    predicted_sentiment = class_names[predicted_sentiment_idx].capitalize()

    # Generate chatbot response based on sentiment; the prompt is truncated to
    # the tokenizer's maximum length so long inputs cannot overflow the model
    chatbot_input = f"Predicted Sentiment: {predicted_sentiment}. {user_input}"
    chatbot_input_ids = chatbot_tokenizer.encode(chatbot_input, return_tensors="pt", truncation=True)

    chatbot_response_ids = chatbot_model.generate(chatbot_input_ids, max_length=50, num_return_sequences=1, pad_token_id=chatbot_tokenizer.eos_token_id)
    chatbot_response = chatbot_tokenizer.decode(chatbot_response_ids[0], skip_special_tokens=True)

    # Print the predicted sentiment, sentiment probabilities, and chatbot response
    print(f"Predicted Sentiment: {predicted_sentiment}")
    for class_idx, class_name in class_names.items():
        print(f"{class_name.capitalize()} Sentiment Probability: {user_probs[0][class_idx]:.4f}")
    print(f"Chatbot: {chatbot_response}")


# Requirements.txt

In [None]:
#pip freeze > requirements.txt


In [None]:
# Read requirements.txt in full, then echo its contents
with open('requirements.txt', 'r') as requirements_file:
    requirements_text = requirements_file.read()
print(requirements_text)
