In [4]:
#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd

# Step 1: Load the file into a DataFrame
file_path = '/content/drive/MyDrive/TextClassification/Preproccessing/Merged_Files/merged_output.csv'  # Replace with your actual file path

# If it's a CSV file, use pd.read_csv(). You can also use pd.read_excel(), pd.read_json(), etc.
final_df = pd.read_csv(file_path)

# Step 2: Count the number of rows (data points) in the DataFrame
num_rows = len(final_df)  # You can also use final_df.shape[0]
# Alternatively: num_rows = final_df.shape[0]

# Step 3: Print the result
print(f'Total number of rows: {num_rows}')


Total number of rows: 2076


In [6]:
# Prepare data for training (Text and Labels)
X = final_df['TEXT'].apply(lambda x: ' '.join(eval(x)) if pd.notna(x) else '')  # Joining tokenized sentences
y = final_df['EMOTION']

In [8]:
# Preprocess the text data
def preprocess_text(text):
    # Lowercase text
    text = text.lower()
    # Remove punctuation, numbers, etc.
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    return text

# Apply preprocessing to the training text
X = X.apply(preprocess_text)

In [9]:
# Feature extraction using Bag of Words (BoW)
from sklearn.feature_extraction.text import CountVectorizer # Import CountVectorizer from sklearn

# Feature extraction using Bag of Words (BoW)
bow_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_bow = bow_vectorizer.fit_transform(X)
num_features_bow = len(bow_vectorizer.get_feature_names_out())
print(f"Number of unique features (terms) using Bag of Words: {num_features_bow}")

Number of unique features (terms) using Bag of Words: 7399


In [11]:
# Feature extraction using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer # Import TfidfVectorizer from sklearn

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(X)
num_features_tfidf = len(tfidf_vectorizer.get_feature_names_out())
print(f"Number of unique features (terms) using TF-IDF: {num_features_tfidf}")

Number of unique features (terms) using TF-IDF: 7399


In [12]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split # Import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [13]:
# Train Naive Bayes model
from sklearn.naive_bayes import MultinomialNB # Import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_tfidf, y)

In [14]:
# Evaluate Naive Bayes model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # Import necessary metrics

y_pred = nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6971

Classification Report:
              precision    recall  f1-score   support

         ang       1.00      0.04      0.08        23
         exc       0.88      0.64      0.75        59
         fea       0.00      0.00      0.00         1
         fru       0.86      0.80      0.83        75
         hap       1.00      0.03      0.06        31
         neu       0.86      0.85      0.86        80
         sad       0.96      0.55      0.70        40
         sur       0.00      0.00      0.00         2
         xxx       0.50      0.95      0.66       105

    accuracy                           0.70       416
   macro avg       0.67      0.43      0.44       416
weighted avg       0.79      0.70      0.66       416


Confusion Matrix:
[[  1   0   0   2   0   2   0   0  18]
 [  0  38   0   2   0   2   0   0  17]
 [  0   0   0   0   0   0   0   0   1]
 [  0   0   0  60   0   1   1   0  13]
 [  0   4   0   1   1   2   0   0  23]
 [  0   0   0   2   0  68   0   0  10]
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Hyperparameter tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV # Import GridSearchCV

param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0], 'fit_prior': [True, False]}
grid_search = GridSearchCV(nb_model, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_tfidf, y)
print("Best parameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found:  {'alpha': 0.1, 'fit_prior': True}


In [16]:
# Use the best model from grid search for final evaluation
best_model = grid_search.best_estimator_

# Predict using the best model
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Accuracy with best parameters: {accuracy_best:.4f}")
print("\nClassification Report for Best Model:")
print(classification_report(y_test, y_pred_best))
print("\nConfusion Matrix for Best Model:")
print(confusion_matrix(y_test, y_pred_best))

Accuracy with best parameters: 0.8702

Classification Report for Best Model:
              precision    recall  f1-score   support

         ang       0.94      0.74      0.83        23
         exc       0.92      0.83      0.88        59
         fea       0.00      0.00      0.00         1
         fru       0.94      0.85      0.90        75
         hap       0.96      0.81      0.88        31
         neu       0.88      0.95      0.92        80
         sad       0.89      0.82      0.86        40
         sur       0.00      0.00      0.00         2
         xxx       0.77      0.93      0.84       105

    accuracy                           0.87       416
   macro avg       0.70      0.66      0.68       416
weighted avg       0.87      0.87      0.87       416


Confusion Matrix for Best Model:
[[17  1  0  2  0  0  0  0  3]
 [ 0 49  0  1  0  2  0  0  7]
 [ 0  0  0  0  0  0  1  0  0]
 [ 1  0  0 64  1  3  1  0  5]
 [ 0  1  0  0 25  0  1  0  4]
 [ 0  0  0  1  0 76  0  0  3]
 [ 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Stratified K-Fold Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(grid_search.best_estimator_, X_tfidf, y, cv=cv)
print(f"Stratified Cross-validation scores: {cv_scores}")
print(f"Mean Stratified cross-validation score: {cv_scores.mean()}")

Stratified Cross-validation scores: [0.43028846 0.4        0.41927711 0.41204819 0.39759036]
Mean Stratified cross-validation score: 0.4118408248378128
