Imported the necessary libraries.


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Loaded the datasets.


In [None]:
# Each file holds one "<sentence>\t<label>" record per line and has no header row.
#
# quoting=3 is csv.QUOTE_NONE: double quotes are treated as ordinary characters.
# With pandas' default quoting, an unbalanced '"' inside a review makes the
# parser absorb the following lines into a single field, silently dropping rows.
# NOTE(review): the recorded run reports 2748 rows in total for the three files;
# the dataset is presumably 1000 sentences per file -- confirm after re-running.
amazon_df = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None, quoting=3)
imdb_df = pd.read_csv('imdb_labelled.txt', sep='\t', header=None, quoting=3)
yelp_df = pd.read_csv('yelp_labelled.txt', sep='\t', header=None, quoting=3)


Merged the datasets into a single dataframe.


In [None]:
# Stack the three sources into one frame.
# ignore_index=True assigns a fresh 0..N-1 index; without it every source keeps
# its own 0-based labels, so labels repeat and label-based lookups such as
# df.loc[0] return one row per source instead of a single row.
df = pd.concat([amazon_df, imdb_df, yelp_df], ignore_index=True)
df.columns = ['sentence', 'tag']


# Print the first 5 rows of data
print(df.head())

                                            sentence  tag
0  So there is no way for me to plug it in here i...    0
1                        Good case, Excellent value.    1
2                             Great for the jawbone.    1
3  Tied to charger for conversations lasting more...    0
4                                  The mic is great.    1


Preprocessed the text data in the following order: tokenization, lowercasing, stopword removal, stemming, and lemmatization.

In [None]:
# Shared resources, built once and reused for every sentence.
# A set gives constant-time membership tests; a list would be scanned per token.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Convert one raw sentence into a list of normalised tokens.

    Steps, in order: tokenise on word characters, lowercase, drop English
    stop words, Porter-stem, then WordNet-lemmatise each remaining token.

    Parameters
    ----------
    text : str or list
        A raw sentence. A list is assumed to be the output of an earlier call
        and is returned unchanged, so re-running this cell does not fail or
        process the tokens a second time.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    if isinstance(text, list):
        return text

    # Tokenization + lowercase
    tokens = (token.lower() for token in re.findall(r'\b\w+\b', text))
    # Stopword removal
    kept = (token for token in tokens if token not in stop_words)
    # Stemming followed by lemmatization, applied token by token
    return [lemmatizer.lemmatize(stemmer.stem(token)) for token in kept]


# One pass over the column instead of one pass per preprocessing step
df['sentence'] = df['sentence'].apply(preprocess)

# Print the first 5 rows of preprocessed data
print(df.head())


                                            sentence  tag
0                [way, plug, u, unless, go, convert]    0
1                          [good, case, excel, valu]    1
2                                    [great, jawbon]    1
3  [tie, charger, convers, last, 45, minut, major...    0
4                                       [mic, great]    1


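Created a bag-of-words representation of the text data, then re-weighted the raw counts with a log-scaled TF-IDF scheme: log(1 + count) * log(N / document frequency).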


In [None]:
# Vocabulary of unique words, sorted so that the feature columns come out in
# the same order on every run (set iteration order for strings is not stable
# across interpreter sessions).
vocab = sorted({word for sentence in df['sentence'] for word in sentence})
column_of = {word: j for j, word in enumerate(vocab)}

# Document-term count matrix: one row per sentence, one column per word
N = len(df['sentence'])
counts = np.zeros((N, len(vocab)), dtype=np.int64)
for i, sentence in enumerate(df['sentence']):
    for word in sentence:
        counts[i, column_of[word]] += 1

# Creating a dataframe of the word counts
word_count_df = pd.DataFrame(counts, columns=vocab)

# Print dataframe of the word count dimensions
print(" dataframe of the word count:", word_count_df.shape)

# Calculate the feature value of each word:
#     log(1 + count) * log(N / number of sentences containing the word)
# Every vocabulary word occurs in at least one sentence, so the document
# frequency is never zero.
# NOTE(review): the document frequencies are computed over all N rows, i.e.
# before the train/test split, so held-out rows influence the feature weights.
# Removing that leakage means splitting first and deriving the weights from
# the training rows only.
doc_freq = (counts > 0).sum(axis=0)
word_count_df = pd.DataFrame(
    np.log(1 + counts) * np.log(N / doc_freq),
    columns=vocab,
)

# Printing number of samples in the data set
print("Data Set Size:", len(df))



 dataframe of the word count: (2748, 3957)
Data Set Size: 2748


Split the data into training and test sets with a ratio of 80:20.


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    word_count_df,
    df['tag'],
    test_size=0.2,
    random_state=42,
)


Defined a Multinomial Naive Bayes classifier and trained it on the training data.


In [None]:


# Multinomial Naive Bayes with the library's default settings
nb_classifier = MultinomialNB()

# Fit on the training split; the call is left as the last expression so the
# fitted estimator is shown as the cell output
nb_classifier.fit(X=X_train, y=y_train)


MultinomialNB()

Used the trained classifier to make predictions on the test data.


In [None]:
# Predicted label for every row of the held-out test features
y_pred = nb_classifier.predict(X=X_test)


Evaluated the performance of the classifier using the accuracy score, the confusion matrix, and the classification report.

In [None]:


# Compute every metric up front from the true and predicted test labels
acc_score = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)  # rows: true label, columns: predicted label
class_report = classification_report(y_test, y_pred)

# Report them in order: accuracy, confusion matrix, per-class report
print("Accuracy Score:", acc_score)

print('Confusion Matrix:')
print(conf_matrix)

print("Classification Report:\n", class_report)


Accuracy Score: 0.7618181818181818
Confusion Matrix:
[[216  75]
 [ 56 203]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.74      0.77       291
           1       0.73      0.78      0.76       259

    accuracy                           0.76       550
   macro avg       0.76      0.76      0.76       550
weighted avg       0.76      0.76      0.76       550

