In [1]:
import os

import pandas as pd

In [2]:
# Load the dataset from a local CSV file after downloading.
# The location can be overridden with the NLP_DATASET_PATH environment
# variable so the notebook runs on machines without an F: drive; when the
# variable is unset the original local path is used unchanged.
DATA_PATH = os.environ.get("NLP_DATASET_PATH", "F:/Entry/nlp_dataset.csv")
df = pd.read_csv(DATA_PATH)

In [3]:
# Confirm the load succeeded and show the first five rows as plain text
print("Dataset Loaded")
print(df.head(n=5))

Dataset Loaded
                                             Comment Emotion
0  i seriously hate one subject to death but now ...    fear
1                 im so full of life i feel appalled   anger
2  i sit here to write i start to dig out my feel...    fear
3  ive been really angry with r and i feel like a...     joy
4  i feel suspicious if there is no one outside l...    fear


In [5]:
import re

In [10]:
# Function to clean text
def clean_text(text):
    """Normalize a raw comment before tokenization/vectorization.

    The text is lowercased, every run of digits is removed, and then every
    character that is neither a word character nor whitespace is removed.
    Whitespace itself is left untouched, so removed tokens can leave
    doubled or trailing spaces behind.

    Args:
        text: The raw comment. ``None`` and float NaN are treated as missing
            and yield an empty string; any other non-string value is
            converted with ``str`` first.

    Returns:
        The cleaned string (``""`` for missing input).
    """
    # NaN is the only float that is not equal to itself; checking this way
    # avoids needing pandas/numpy inside the helper.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()  # convert to lowercase
    text = re.sub(r'\d+', '', text)  # remove numbers
    # remove punctuation; '_' is a word character for \w, so it is kept
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Clean every entry of the 'Comment' column and store the result in a new
# 'cleaned_text' column (the raw column is left as-is)
df['cleaned_text'] = df['Comment'].map(clean_text)

In [17]:
import re
from sklearn.feature_extraction.text import CountVectorizer

# Define a basic tokenizer function
def basic_tokenizer(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased and every maximal run of word characters
    (letters, digits, underscore) is returned, in order of appearance.
    Anything between those runs (spaces, punctuation) is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty when no word characters are present.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(lowered)

# Apply tokenizer
df['tokens'] = df['cleaned_text'].apply(basic_tokenizer)

# Remove stopwords using the English stop-word list exposed by CountVectorizer.
# The vectorizer is only used as a source of that list; it is never fitted here.
vectorizer = CountVectorizer(stop_words='english')
# Fetch the stop-word collection once instead of once per token inside the
# comprehension below.
english_stop_words = vectorizer.get_stop_words()
df['filtered_tokens'] = df['tokens'].apply(
    lambda tokens: [word for word in tokens if word not in english_stop_words]
)

# Side-by-side preview of the cleaned text, its tokens and the filtered tokens
print(df[['cleaned_text', 'tokens', 'filtered_tokens']].head())

                                        cleaned_text  \
0  i seriously hate one subject to death but now ...   
1                 im so full of life i feel appalled   
2  i sit here to write i start to dig out my feel...   
3  ive been really angry with r and i feel like a...   
4  i feel suspicious if there is no one outside l...   

                                              tokens  \
0  [i, seriously, hate, one, subject, to, death, ...   
1        [im, so, full, of, life, i, feel, appalled]   
2  [i, sit, here, to, write, i, start, to, dig, o...   
3  [ive, been, really, angry, with, r, and, i, fe...   
4  [i, feel, suspicious, if, there, is, no, one, ...   

                                     filtered_tokens  
0  [seriously, hate, subject, death, feel, reluct...  
1                         [im, life, feel, appalled]  
2  [sit, write, start, dig, feelings, think, afra...  
3  [ive, really, angry, r, feel, like, idiot, tru...  
4  [feel, suspicious, outside, like, rapture, hap..

In [18]:
# Import both vectorizers; the cells below use TfidfVectorizer for feature extraction.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [19]:
# Build TF-IDF features from the cleaned comments, capping the vocabulary at
# 1000 terms, and convert the result to a dense array.
# NOTE(review): the vectorizer is fitted on every row of df, before the
# train/test split done later in the notebook, so the vocabulary and IDF
# weights are also learned from the comments that end up in the test set.
vectorizer = TfidfVectorizer(max_features=1000)
features = (
    vectorizer
    .fit_transform(df['cleaned_text'])
    .toarray()
)


In [20]:
# Wrap the dense feature array in a DataFrame whose columns are the
# vocabulary terms reported by the fitted vectorizer
feature_df = pd.DataFrame(
    data=features,
    columns=vectorizer.get_feature_names_out(),
)

In [21]:
# Report the (rows, columns) size of the feature matrix and preview five rows
print("Feature Matrix Shape:", feature_df.shape)
print(feature_df.head(n=5))

Feature Matrix Shape: (5937, 1000)
   able  about  above  absolutely    accept  acceptable  accepted  across  \
0   0.0    0.0    0.0         0.0  0.000000         0.0       0.0     0.0   
1   0.0    0.0    0.0         0.0  0.000000         0.0       0.0     0.0   
2   0.0    0.0    0.0         0.0  0.351753         0.0       0.0     0.0   
3   0.0    0.0    0.0         0.0  0.000000         0.0       0.0     0.0   
4   0.0    0.0    0.0         0.0  0.000000         0.0       0.0     0.0   

   act  actually  ...  wrote  year  years  yes  yesterday  yet  you  young  \
0  0.0       0.0  ...    0.0   0.0    0.0  0.0        0.0  0.0  0.0    0.0   
1  0.0       0.0  ...    0.0   0.0    0.0  0.0        0.0  0.0  0.0    0.0   
2  0.0       0.0  ...    0.0   0.0    0.0  0.0        0.0  0.0  0.0    0.0   
3  0.0       0.0  ...    0.0   0.0    0.0  0.0        0.0  0.0  0.0    0.0   
4  0.0       0.0  ...    0.0   0.0    0.0  0.0        0.0  0.0  0.0    0.0   

   your  youre  
0   0.0    0.0  

In [22]:
from sklearn.model_selection import train_test_split

In [25]:
# Features (X) are the TF-IDF columns; the target (y) is the 'Emotion' label
X = feature_df
y = df['Emotion']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Create a multinomial Naive Bayes classifier with default settings and fit it
# on the training split
nb_model = MultinomialNB()
nb_model.fit(X=X_train, y=y_train)


In [29]:
# Predict emotion labels for the held-out test rows
y_pred_nb = nb_model.predict(X_test)

# Overall accuracy, followed by the per-class precision/recall/F1 table
print(
    "Naive Bayes Model Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_nb),
)
print(
    "Naive Bayes Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_nb),
)

Naive Bayes Model Accuracy: 0.9116161616161617
Naive Bayes Classification Report:
               precision    recall  f1-score   support

       anger       0.90      0.93      0.91       392
        fear       0.91      0.90      0.91       416
         joy       0.93      0.90      0.91       380

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [30]:
from sklearn.svm import LinearSVC

# Initialize and train the SVM model.
# random_state is pinned to the same seed as the train/test split so the
# reported scores are repeatable when the notebook is re-run; per the
# scikit-learn docs it seeds the data shuffling used by the dual solver.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on the held-out test rows
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model: overall accuracy, then per-class precision/recall/F1
print("SVM Model Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))

SVM Model Accuracy: 0.946969696969697
SVM Classification Report:
               precision    recall  f1-score   support

       anger       0.94      0.94      0.94       392
        fear       0.97      0.93      0.95       416
         joy       0.93      0.97      0.95       380

    accuracy                           0.95      1188
   macro avg       0.95      0.95      0.95      1188
weighted avg       0.95      0.95      0.95      1188



In [31]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Score both models on the same held-out split. The F1 score uses
# average='weighted', i.e. per-class F1 weighted by each class's support.
nb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
nb_f1 = f1_score(y_true=y_test, y_pred=y_pred_nb, average='weighted')
svm_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
svm_f1 = f1_score(y_true=y_test, y_pred=y_pred_svm, average='weighted')

# Headline numbers: Naive Bayes first, then SVM
print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes F1 Score:", nb_f1)
print("SVM Accuracy:", svm_accuracy)
print("SVM F1 Score:", svm_f1)

# Per-class precision/recall/F1 tables for a more detailed comparison
print(
    "\nNaive Bayes Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_nb),
)
print(
    "\nSVM Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_svm),
)

Naive Bayes Accuracy: 0.9116161616161617
Naive Bayes F1 Score: 0.9116193548331866
SVM Accuracy: 0.946969696969697
SVM F1 Score: 0.9469867123495583

Naive Bayes Classification Report:
               precision    recall  f1-score   support

       anger       0.90      0.93      0.91       392
        fear       0.91      0.90      0.91       416
         joy       0.93      0.90      0.91       380

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188


SVM Classification Report:
               precision    recall  f1-score   support

       anger       0.94      0.94      0.94       392
        fear       0.97      0.93      0.95       416
         joy       0.93      0.97      0.95       380

    accuracy                           0.95      1188
   macro avg       0.95      0.95      0.95      1188
weighted avg       0.95      0.95      0.95      1188



Brief Explanation and Model Suitability
1. Naive Bayes
How it Works: Naive Bayes uses Bayes’ theorem and assumes that features (words) are independent of each other given the class. Despite the independence assumption being simplistic, Naive Bayes performs well for text classification tasks.
Suitability: Naive Bayes is fast, handles high-dimensional data (like word frequencies) well, and is often highly effective for text data. It works best when features are independent and performs robustly even with smaller datasets.
2. Support Vector Machine (SVM)
How it Works: SVM constructs a hyperplane in a high-dimensional space to optimally separate classes. LinearSVC, in particular, is well-suited for large, sparse datasets such as those generated by TF-IDF or CountVectorizer.
Suitability: SVM does not assume that features are independent, so it often separates classes more accurately than Naive Bayes and can produce high accuracy and F1-scores. This matters when features have interdependencies, which may occur with nuanced language data in emotion classification. Note that LinearSVC learns a linear decision boundary; modelling non-linear boundaries requires a kernel SVM.
Choosing the Best Model
Best Model Selection:
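If Naive Bayes and SVM perform similarly in accuracy but SVM yields a higher F1-score, SVM would generally be the preferred model, as it better balances precision and recall across all classes. On this test split SVM leads on both metrics (accuracy 0.947 and weighted F1 0.947, versus 0.912 and 0.912 for Naive Bayes), so SVM is the better model here.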
However, if Naive Bayes performs competitively in both accuracy and F1-score, it might be chosen for its speed and simplicity, especially if the dataset size is very large.