In [172]:
#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [173]:
#Import necessary libraries
import os
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [241]:
# Dataset path and loading
dataset_path = "/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/training.csv"

# Check if the dataset exists
if os.path.exists(dataset_path):
    print(f"Dataset is located at: {dataset_path}")
else:
    print("Dataset folder not found.")

# Load the dataset
df = pd.read_csv(dataset_path)

Dataset is located at: /content/drive/MyDrive/TextModel_NaiveBayes/Dataset/training.csv


In [242]:
#Check for missing values
print(df.isnull().sum())  # Check for missing values in the dataset

text     0
label    0
dtype: int64


In [243]:
# Display the basic info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


In [244]:
# Dataset summary - number of characters, words, sentences
num_characters = len(df)
num_words = len(df)
num_sentences = len(df)

print(f"Number of characters: {num_characters}")
print(f"Number of words: {num_words}")
print(f"Number of sentences: {num_sentences}")

Number of characters: 16000
Number of words: 16000
Number of sentences: 16000


In [245]:
# Map numeric labels to string names
label_mapping = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear'}
unique_labels = df['label'].unique()
for label in unique_labels:
    print(f"{label}: {label_mapping.get(label, 'Unknown label')}")

0: sadness
3: anger
2: love
5: Unknown label
4: fear
1: joy


In [247]:
# Checking for lowercase conversion in the text columns
columns_checked = 0
columns_converted = 0
columns_not_converted = 0

# Check for lowercase conversion in the text columns
for column in df.select_dtypes(include=['object']):
    columns_checked += 1
    non_lowercase_rows = df[~df[column].apply(lambda x: x.islower() if isinstance(x, str) else True)]

    if not non_lowercase_rows.empty:
        columns_not_converted += 1
        print(f"Column '{column}' has non-lowercase values:")
        print(non_lowercase_rows[[column]])
    else:
        columns_converted += 1
        print(f"Column '{column}' is already in lowercase.")

Column 'text' is already in lowercase.


In [248]:
# Summary of column checks
print("\nSummary:")
print(f"Total columns checked: {columns_checked}")
print(f"Columns already in lowercase: {columns_converted}")
print(f"Columns with non-lowercase values: {columns_not_converted}")


Summary:
Total columns checked: 1
Columns already in lowercase: 1
Columns with non-lowercase values: 0


In [253]:
# Check if the dataset is already tokenized
are_tokenized = df['text'].apply(lambda x: isinstance(x, list)).all()

if are_tokenized:
    print("The dataset is tokenized.")
else:
    print("The dataset is not tokenized.")

The dataset is not tokenized.


In [254]:
# Define unwanted terms for preprocessing
unwanted_terms = ['aa', 'aaaaaaand', 'aaaaand', 'aaaand', 'aac', 'aahhh', 'ab', 'abc']
unwanted_terms_pattern = r'\b(' + '|'.join(map(re.escape, unwanted_terms)) + r')\b'

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text (tokenization and lemmatization without stop word removal)
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove unwanted terms based on the pattern
    cleaned_tokens = [
        word for word in tokens if not re.search(unwanted_terms_pattern, word.lower())
    ]

    # Lemmatize the cleaned tokens
    lemmatized_tokens = [
        lemmatizer.lemmatize(word.lower())  # Lemmatize and convert to lowercase
        for word in cleaned_tokens if word.isalpha()  # Only keep alphabetic tokens
    ]

    # Remove stop words after lemmatization
    final_tokens = [
        word for word in lemmatized_tokens if word not in stop_words
    ]

    return final_tokens

In [226]:
import nltk

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [255]:
# Apply preprocessing function to 'text' column
df['processed_text'] = df['text'].apply(preprocess_text)

In [256]:
# Save processed data to CSV
output_file_path = "/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/proccessed_text_dataOfTraining.csv"
df[['processed_text']].to_csv(output_file_path, index=False)
print(f"Processed data has been saved to: {output_file_path}")

Processed data has been saved to: /content/drive/MyDrive/TextModel_NaiveBayes/Dataset/proccessed_text_dataOfTraining.csv


In [257]:
# Load original and processed datasets
original_df = pd.read_csv("/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/training.csv")
processed_df = pd.read_csv("/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/proccessed_text_dataOfTraining.csv")

In [258]:
# Merge the 'label' column with 'processed_text' column
merged_df = pd.concat([original_df['label'], processed_df['processed_text']], axis=1)

# Save merged dataset
merged_file_path = "/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/merged_text_data.csv"
merged_df.to_csv(merged_file_path, index=False)
print(f"Merged data has been saved to: {merged_file_path}")

Merged data has been saved to: /content/drive/MyDrive/TextModel_NaiveBayes/Dataset/merged_text_data.csv


In [259]:
# Remove rows with label 5
merged_df = merged_df[merged_df['label'] != 5]

In [260]:
#  Check rows after removing unknown label (if any)
print(f"Processed dataset row count: {merged_df.shape[0]}")

Processed dataset row count: 15428


In [262]:
# After removing rows with label '5'
final_df = merged_df[merged_df['label'] != 5]

# Save the filtered dataset (final_df) to a new file
mergedAndRemovedUnknownLabel_path = "/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/finalDataset.csv"
final_df.to_csv(mergedAndRemovedUnknownLabel_path, index=False)

# Print a confirmation message
print(f"Filtered data (without label '5') has been saved to: {mergedAndRemovedUnknownLabel_path}")

Filtered data (without label '5') has been saved to: /content/drive/MyDrive/TextModel_NaiveBayes/Dataset/finalDataset.csv


In [263]:
# Dataset path
preproccessed_path = "/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/finalDataset.csv"  # Adjust to your path

In [267]:
# Load the dataset
df = pd.read_csv(preproccessed_path)

In [268]:
# Prepare data for training (Text and Labels)
X_train = final_df['processed_text'].apply(lambda x: ' '.join(eval(x)) if pd.notna(x) else '')
y_train = final_df['label']

In [269]:
# Feature extraction using Bag of Words (BoW)
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
num_features_bow = len(bow_vectorizer.get_feature_names_out())
print(f"Number of unique features (terms) using Bag of Words: {num_features_bow}")

Number of unique features (terms) using Bag of Words: 13178


In [270]:
# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
num_features_tfidf = len(tfidf_vectorizer.get_feature_names_out())
print(f"Number of unique features (terms) using TF-IDF: {num_features_tfidf}")

Number of unique features (terms) using TF-IDF: 13178


In [271]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

In [272]:
# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [273]:
# Evaluate Naive Bayes model
y_pred = nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6821

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.92      0.76       898
           1       0.67      0.96      0.79      1110
           2       1.00      0.05      0.09       254
           3       0.94      0.25      0.39       449
           4       0.99      0.22      0.37       375

    accuracy                           0.68      3086
   macro avg       0.85      0.48      0.48      3086
weighted avg       0.77      0.68      0.62      3086


Confusion Matrix:
[[ 826   72    0    0    0]
 [  38 1071    0    0    1]
 [  62  180   12    0    0]
 [ 192  145    0  112    0]
 [ 147  137    0    7   84]]


In [276]:
# Hyperparameter tuning using GridSearchCV
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0], 'fit_prior': [True, False]}
grid_search = GridSearchCV(nb_model, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found:  {'alpha': 0.5, 'fit_prior': False}


In [277]:
# Cross-validation
cv_scores = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

Cross-validation scores: [0.82665047 0.81611989 0.83387358 0.81401945 0.82495948]
Mean cross-validation score: 0.8231245729049943


In [279]:
# Save the model and vectorizer
joblib.dump(nb_model, "/content/drive/MyDrive/TextModel_NaiveBayes/naive_bayes_model.pkl")
joblib.dump(tfidf_vectorizer, "/content/drive/MyDrive/TextModel_NaiveBayes/tfidf_vectorizer.pkl")
print(f"Model and vectorizer have been saved to the respective paths.")

Model and vectorizer have been saved to the respective paths.


In [280]:
# Load the model and vectorizer (for later predictions)
loaded_model = joblib.load("/content/drive/MyDrive/TextModel_NaiveBayes/naive_bayes_model.pkl")
loaded_vectorizer = joblib.load("/content/drive/MyDrive/TextModel_NaiveBayes/tfidf_vectorizer.pkl")

In [281]:
# Predict on new data
new_data = ["I am so happy today!", "I feel really sad about this situation."]
new_data_processed = [preprocess_text(text) for text in new_data]
new_data_processed = [' '.join(text) for text in new_data_processed]
X_new_tfidf = loaded_vectorizer.transform(new_data_processed)
new_predictions = loaded_model.predict(X_new_tfidf)

In [282]:
# Output new predictions
predicted_labels = [label_mapping[label] for label in new_predictions]
print("Predicted labels:", predicted_labels)

Predicted labels: ['joy', 'sadness']
