In [1]:
!pip install datasets




In [2]:
# Importing libraries

from datasets import load_dataset # Importing the dataset through cloud 

stop_words = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'])

# load the dataset
dataset = load_dataset("wisesight_sentiment")

# define a function to preprocess the text data
def preprocess(text):
    # convert all text to lowercase
    text = text.lower()
    # remove all non-alphabetic characters
    text = ''.join(c for c in text if c.isalpha() or c.isspace())
    # split the text into tokens
    tokens = text.split()
    # remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

# split the dataset into training, validation, and test sets
def split_dataset(data, split_ratio):
    # Calculate the sizes of each dataset based on the split ratios
    train_size = int(len(data) * split_ratio[0])
    val_size = int(len(data) * split_ratio[1])
    test_size = int(len(data) * split_ratio[2])
    # Split the data into the three datasets based on their sizes
    train_data = data[:train_size] 
    val_data = data[train_size:train_size+val_size]
    test_data = data[train_size+val_size:train_size+val_size+test_size]
    # Return the three datasets as separate variables
    return train_data, val_data, test_data



# Define split_ratio for train, validation and test data
split_ratio = [0.7, 0.15, 0.15]

# Combine the 'texts' and 'category' fields of the training dataset using zip() function
# to create a list of tuples with each tuple containing the text and its corresponding category label
data = list(zip(dataset['train']['texts'], dataset['train']['category']))

# Split the data into train, validation and test sets based on the split_ratio
train_data, val_data, test_data = split_dataset(data, split_ratio)

# Unzip the train_data tuples to separate the 'texts' and 'category' fields into two separate lists
train_texts, labels_train = zip(*train_data)

# Calculate the prior probability of each class in the training set
# by counting the number of occurrences of each class and dividing by the total number of samples
# The prior probability gives the likelihood of a sample belonging to a particular class without any knowledge of its features
classes = set(labels_train)
class_count = {c: 0 for c in classes}
for label in labels_train:
    class_count[label] += 1
prior_prob = {c: count/len(labels_train) for c, count in class_count.items()}




# calculate the conditional probabilities of each word given each class
vocab = set()
word_count = {c: {} for c in classes}
for text, label in train_data:
    tokens = preprocess(text)
    for token in tokens:
        vocab.add(token)
        if token in word_count[label]:
            word_count[label][token] += 1
        else:
            word_count[label][token] = 1



cond_prob = {c: {} for c in classes}
for label in classes:
    total_words = sum(word_count[label].values())
    for word in vocab:
        if word in word_count[label]:
            count = word_count[label][word]
        else:
            count = 0
        cond_prob[label][word] = (count + 1) / (total_words + len(vocab))



# classify the validation and test sets

# This function classifies a given 'text' into one of the classes based on the maximum score calculated using Naive Bayes algorithm
# It takes in the preprocessed 'text', 'vocab' (vocabulary of words present in the training set), 'prior_prob' (prior probabilities of each class in the training set)
# and 'cond_prob' (conditional probabilities of each word given its class in the training set)

# First, the function tokenizes the given 'text' using preprocess() function

# Then, the function initializes a dictionary 'scores' with the prior probabilities of each class
# and iterates over each class and each token in the 'tokens' list
# If the token is present in the 'vocab', it multiplies the current score of the class by the conditional probability of the token given its class
# Finally, it returns the label of the class with the maximum score as the predicted class for the given 'text'

def classify(text, vocab, prior_prob, cond_prob):
    tokens = preprocess(text)
    scores = prior_prob.copy()
    for label in prior_prob:
        for token in tokens:
            if token in vocab:
                scores[label] *= cond_prob[label][token]
    return max(scores, key=scores.get)




# Create a list 'val_labels_true' and 'test_labels_true' to store the true labels of the validation and test set respectively
val_labels_true = [label for _, label in val_data]
test_labels_true = [label for _, label in test_data]




# Create a list 'val_labels_pred' and 'test_labels_pred' to store the predicted labels of the validation and test set respectively
# using the 'classify()' function defined earlier with the 'vocab', 'prior_prob', and 'cond_prob'
val_labels_pred = [classify(text, vocab, prior_prob, cond_prob) for text, _ in val_data]
test_labels_pred = [classify(text, vocab, prior_prob, cond_prob) for text, _ in test_data]




# Define the 'accuracy()' function to calculate the accuracy of the predicted labels
# It takes in the true labels 'labels_true' and predicted labels 'labels_pred' as inputs
# The function counts the number of correctly classified samples and returns the accuracy as the ratio of the number of correct predictions to the total number of predictions
def accuracy(labels_true, labels_pred):
    correct = sum(1 for true, pred in zip(labels_true, labels_pred) if true == pred)
    total = len(labels_true)
    return correct / total


# Define the 'precision()' function to calculate the precision of the predicted labels
# It takes in the true labels 'labels_true' and predicted labels 'labels_pred' as inputs
# The function counts the number of true positives (tp) and false positives (fp) and returns the precision as the ratio of tp to the sum of tp and fp
# Precision is a metric that evaluates how many of the predicted positives are actually true positives
def precision(labels_true, labels_pred):
    tp = 0
    fp = 0
    for i in range(len(labels_true)):
        if labels_pred[i] == 1 and labels_true[i] == 1:
            tp += 1
        elif labels_pred[i] == 1 and labels_true[i] == 0:
            fp += 1
    if tp + fp == 0:
        return 0
    else:
        return tp / (tp + fp)

def calculate_f1_score(predictions, targets):
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    for pred, target in zip(predictions, targets):
        if pred == 1 and target == 1:
            true_positives += 1
        elif pred == 1 and target == 0:
            false_positives += 1
        elif pred == 0 and target == 1:
            false_negatives += 1
    
    precision = true_positives / (true_positives + false_positives + 1e-7)
    recall = true_positives / (true_positives + false_negatives + 1e-7)
    f1_score = 2 * (precision * recall) / (precision + recall + 1e-7)
    
    return f1_score, recall

# Print the accuracy and precision of the classifier on the validation and test sets using the 'accuracy()' and 'precision()' functions
# The predicted labels are stored in the lists 'val_labels_pred' and 'test_labels_pred' respectively
print("Validation Set:")
print("Accuracy:", accuracy(val_labels_true, val_labels_pred))
print("Precision:", precision(val_labels_true, val_labels_pred))
f1,recall=calculate_f1_score(val_labels_pred,val_labels_true)
print("F1 Score:", f1)
print("Recall:", recall)


print("\nTest Set:")
print("Accuracy:", accuracy(test_labels_true, test_labels_pred))
print("Precision:", precision(test_labels_true, test_labels_pred))
f1,recall=calculate_f1_score(test_labels_pred,test_labels_true)

print("F1 Score:", f1)
print("Recall:", recall)

# Print the actual and predicted labels of the test set for visual inspection
print()
print("Actual Values")
print(test_labels_true)
print()
print("Predicted values")
print(test_labels_pred)




Downloading metadata:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Downloading and preparing dataset wisesight_sentiment/wisesight_sentiment to C:/Users/harin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/fc2b1bdfe79571b2e281e4afdb5aac069cf9270bf0f85694239be672a4191969...


Downloading data:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21628 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2404 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2671 [00:00<?, ? examples/s]

Dataset wisesight_sentiment downloaded and prepared to C:/Users/harin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/fc2b1bdfe79571b2e281e4afdb5aac069cf9270bf0f85694239be672a4191969. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Validation Set:
Accuracy: 0.5681257706535142
Precision: 0.7690883850069412
F1 Score: 0.8600258238757578
Recall: 0.9753521126188173

Test Set:
Accuracy: 0.5635018495684341
Precision: 0.7655236329935126
F1 Score: 0.8588510034668845
Recall: 0.9780935464192958

Actual Values
[1, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 0, 1, 1, 0, 2, 0, 1, 1, 2, 0, 1, 1, 0, 1, 1, 1, 3, 1, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 2, 1, 1, 0, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 3, 0, 2, 2, 1, 2, 2, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 0, 1, 0, 0, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 0, 1, 3, 2, 1, 2, 2, 1, 1, 0, 2, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 2, 1, 1, 0, 0, 1, 2, 1, 1, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 0, 1, 0, 0, 3, 2, 3, 2, 2, 1, 2, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 1, 3, 0, 2, 3, 2, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 1