## 1. Data Collection and Exploration

In [None]:
import os
import re
import string
import sys

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import xgboost as xgb
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.models import Sequential


In [None]:
# Download the NLTK tokenizer data that word_tokenize needs.
# Older NLTK releases load the 'punkt' resource, while recent releases look up
# 'punkt_tab' instead; requesting both avoids a LookupError when tokenizing.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

In [None]:
# Read the tweet CSV (ISO-8859-1 encoded) into a DataFrame and show its first five rows
csv_path = 'judge-1377884607_tweet_product_company.csv'
data = pd.read_csv(csv_path, encoding='ISO-8859-1')
data.head()

In [None]:
# Summarise the DataFrame: column names, non-null counts and dtypes
data.info()

In [None]:
# Number of (rows, columns) in the raw data
data.shape

Analysis of the Distribution of emotion Labels

In [None]:
# Tally how many tweets carry each sentiment label
sentiment_counts = data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

# Show the raw tallies
print(sentiment_counts)

# Draw the same tallies as a bar chart using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(sentiment_counts.index, sentiment_counts.values)
ax.set_xlabel('Sentiment Label')
ax.set_ylabel('Count')
ax.set_title('Distribution of Sentiment Labels')
plt.show()


From the visualization, most people who tweeted don't express an emotion towards a brand or product, and for a very small number of tweets we can't tell whether the sentiment is positive, negative or absent.

## 2. Data Preprocessing

### 2.1 Data Cleaning

- Dropping unwanted columns
- Handling missing values.
- Clean text data by removing special characters, URLs, and hashtags.
- Convert text to lowercase to ensure consistency.

2.1.1 Dropping unwanted columns

In [None]:
# 'emotion_in_tweet_is_directed_at' is not used for modelling, so remove it
columns_to_drop = ['emotion_in_tweet_is_directed_at']
data = data.drop(columns_to_drop, axis=1)

In [None]:
# Preview the first rows to confirm 'emotion_in_tweet_is_directed_at' is no longer present
data.head()
# the emotion in tweet is directed at column has been dropped.

The column has been dropped and we remain with the two columns that we will be using henceforth

2.1.2 Handling missing values

In [None]:
# Count missing values in each column
data.isna().sum()

In [None]:
# Drop rows whose 'tweet_text' is missing; free text cannot be sensibly imputed.
# Only 'tweet_text' is checked, so missing values in other columns are left as they are.
data = data.dropna(subset=['tweet_text'])


In [None]:
# Re-count missing values per column to confirm the rows without tweet text are gone
data.isna().sum()

2.1.3 Renaming columns

In [None]:
# Give the long label column the short name 'emotion' so it is easier to work with
data = data.rename(columns={'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion'})

In [None]:
# Preview the first five rows to confirm the label column is now called 'emotion'
data.head()

In [None]:
# Drop only the "I can't tell" rows; tweets labelled
# 'No emotion toward brand or product' are kept.
# .copy() makes the result an independent DataFrame so that columns added to
# `data` in later cells do not trigger pandas' SettingWithCopyWarning.
data = data[data['emotion'] != "I can't tell"].copy()


In [None]:
# Replacing 'No emotion toward brand or product' as neutral
#data['emotion'] = data['emotion'].replace({'No emotion toward brand or product': 'Neutral'})

In [None]:
# Count tweets per emotion label to confirm the "I can't tell" category has been dropped.
# (The replacement of 'No emotion toward brand or product' is commented out above, so that label is unchanged.)
data.emotion.value_counts()

2.1.4 Cleaning text data

In [None]:
# Function to clean text
def clean_text(text):
    """Normalise a raw tweet for tokenization.

    Steps, in order: cast to str, strip URLs, strip hashtags (the '#' and the
    word attached to it), strip punctuation/special characters, lowercase.

    Parameters
    ----------
    text : object
        The raw tweet; non-string values are converted with str().

    Returns
    -------
    str
        The cleaned, lower-cased text. Whitespace is left as-is, so removed
        spans may leave consecutive spaces behind.
    """
    # Ensure text is a string
    text = str(text)

    # Remove URLs: anything starting with "http" (any letter case) and
    # bare "www." links, up to the next whitespace
    text = re.sub(r'http\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

    # Remove hashtags (including the # symbol)
    text = re.sub(r'#\w+', '', text)

    # Remove special characters and punctuation (except spaces).
    # '_' is listed explicitly because \w treats it as a word character.
    text = re.sub(r'[^\w\s]|_', '', text)

    # Convert text to lowercase
    text = text.lower()

    return text

# Apply the clean_text function to the "tweet_text" column
data['cleaned_tweet'] = data['tweet_text'].apply(clean_text)

# Preview raw vs. cleaned text for the first rows instead of printing the whole DataFrame
data[['tweet_text', 'cleaned_tweet']].head()



2.2 Tokenization


In [None]:
# Split each cleaned tweet into word tokens with NLTK's word_tokenize
data['tokenized_tweet'] = data['cleaned_tweet'].apply(word_tokenize)

# Show the cleaned text next to its tokens for the first few rows
preview_columns = ['cleaned_tweet', 'tokenized_tweet']
print(data[preview_columns].head())

2.3 Text Vectorization

In [None]:
# Join the tokenized words into a single string for each document.
# The isinstance guard keeps this cell idempotent: if it is re-run, the column
# already holds strings and ' '.join(str) would insert a space between every character.
data['tokenized_tweet'] = data['tokenized_tweet'].apply(
    lambda tokens: ' '.join(tokens) if isinstance(tokens, (list, tuple)) else tokens
)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the joined token strings of the whole corpus.
# NOTE: the modelling cells visible in this notebook refit their own vectorizer on the
# training split and do not read tfidf_matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(data['tokenized_tweet'])


2.4 Label Encoding

The emotion column is converted into a numerical format for model training using LabelEncoder(), which assigns integer codes to the class names in sorted order (negative = 0, neutral / no emotion = 1, positive = 2).

In [None]:
# Define the criteria for 'Positive' and 'Negative'
#positive_criteria = data['emotion'] == 'Positive emotion'
#negative_criteria = data['emotion'] == 'Negative emotion'

# Create a new column 'binary_label' with initial values set to 'Neutral'
#data['binary_label'] = 'Neutral'

# Update 'binary_label' based on the criteria
#data.loc[positive_criteria, 'binary_label'] = 'Positive'
#data.loc[negative_criteria, 'binary_label'] = 'Negative'

# Now, the DataFrame should has a new column 'binary_label' with 'Positive' and 'Negative' labels 
# to use this column for binary classification


In [None]:
# Initialize the label encoder
#label_encoder = LabelEncoder()

# Apply label encoding to the 'emotion' column
#data['binary_label'] = label_encoder.fit_transform(data['emotion'])

# Display the DataFrame with the encoded emotion labels
#print(data[['emotion', 'binary_label']].head(10))


In [None]:
# List the distinct labels that remain in the 'emotion' column after filtering
unique_emotions = data['emotion'].unique()
print("Unique Emotions:", unique_emotions)


## 3. Modelling-Binary Classification

- Data Filtering - filter data to only have rows with 'Positive emotion' or 'Negative emotion' labels, effectively creating a binary classification dataset.
- Splitting Data - split your filtered data into training and testing sets using the train_test_split function
- Label Encoding: You encode the 'emotion' labels ('Positive emotion' and 'Negative emotion') into numerical values ('Positive' as 1 and 'Negative' as 0) using LabelEncoder. 
- TF-IDF Vectorization-use TF-IDF vectorization to convert your text data into numerical vectors. The TfidfVectorizer is used to represent each tweet as a vector of TF-IDF features.
- Naive Bayes Training - initialize a Multinomial Naive Bayes classifier and train it on the TF-IDF vectors of the training data.
- Making Predictions - use the trained Naive Bayes model to make predictions on the test set.
- Calculating Accuracy - calculate the accuracy of the model's predictions on the test set using accuracy_score. Accuracy measures the proportion of correctly predicted instances.
- Classification Report: You display a classification report, which includes precision, recall, and F1-score for each class ('Positive' and 'Negative'). This report provides a detailed overview of the model's performance.
- Confusion Matrix-display a confusion matrix, which shows the number of true positive, true negative, false positive, and false negative predictions. This matrix is useful for assessing the model's performance on each class

3.2 Baseline Model using Naive Bayes

In [None]:
# Keep only the tweets labelled positive or negative (binary task)
is_positive = data['emotion'] == 'Positive emotion'
is_negative = data['emotion'] == 'Negative emotion'
filtered_data = data[is_positive | is_negative]

# Features are the space-joined tokens; targets are the emotion strings
X = filtered_data['tokenized_tweet']
y = filtered_data['emotion']

# Turn the emotion strings into integers (Positive: 1, Negative: 0)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the tweets for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary on the training tweets only, then reuse it for the test tweets
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit the Multinomial Naive Bayes baseline and predict the held-out tweets
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)
y_pred = naive_bayes.predict(X_test_tfidf)

# Report accuracy, per-class metrics and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print("Naive Bayes Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


The model performs well getting an accuracy of 84.2%  and correctly identifying 'Positive' tweets with high precision and recall, resulting in a high F1-score for class 1. However, it struggles to identify 'Negative' tweets, as indicated by the very low recall for class 0. The overall accuracy is somewhat inflated due to the class imbalance, where 'Positive' tweets dominate the dataset. Improving recall for class 0 may be a priority if better identification of 'Negative' sentiment is necessary.

In [None]:
# Baseline model using Naive Bayes.
# No validation split (X_valid_tfidf / y_valid) is created anywhere above, so
# the model is evaluated on the held-out test split from the previous cell.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)
y_pred_nb = naive_bayes.predict(X_test_tfidf)

# calculate accuracy
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))

# Calculate additional evaluation metrics
print("Classification Report:")
print(classification_report(y_test, y_pred_nb))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))

The provided evaluation  shows that a Naive Bayes classifier achieved an accuracy of approximately 83.3% on the binary classification task. While it performed well in correctly predicting class 1 with high precision and recall, it struggled to identify class 0 instances, resulting in a low recall for class 0. The F1-score, a balanced metric, indicates a good overall performance.

3.1.3 Hyperparameter Tuning for Improved Model Performance using GridSearch

In [None]:
# Candidate hyperparameter values for the grid search
param_grid = dict(
    alpha=[0.1, 0.5, 1.0, 2.0],   # smoothing values to try
    fit_prior=[True, False],      # whether to estimate class prior probabilities or not
)

# 5-fold cross-validated search over the grid, scored by accuracy
nb_classifier = MultinomialNB()
grid_search = GridSearchCV(
    estimator=nb_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

# Winning parameter combination found by the search
best_params = grid_search.best_params_

# Fit a fresh Naive Bayes model with those parameters on the full training split
best_nb_classifier = MultinomialNB(**best_params)
best_nb_classifier.fit(X_train_tfidf, y_train)

# Predict the held-out test tweets with the tuned model
y_pred_best = best_nb_classifier.predict(X_test_tfidf)

# Report accuracy, per-class metrics and the confusion matrix for the tuned model
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Best Naive Bayes Accuracy:", accuracy_best)
print("Classification Report for Tuned Model:")
print(classification_report(y_test, y_pred_best))
print("Confusion Matrix for Tuned Model:")
print(confusion_matrix(y_test, y_pred_best))


When using hyperparameter tuning, the Naive Bayes classifier achieved an improved accuracy of approximately 86.7% on a binary classification task. It shows better performance in correctly predicting class 1 with high precision and recall, resulting in a high F1-score for class 1. However, it still faces challenges in identifying class 0 instances, leading to lower precision, recall, and F1-score for class 0. The overall weighted average F1-score indicates a good overall performance, but the model's effectiveness in classifying class 0 remains a concern

Using Support Vector Machine(SVC)

In [None]:
# Restrict the data to the two polar classes for the binary task
polar_mask = (data['emotion'] == 'Positive emotion') | (data['emotion'] == 'Negative emotion')
filtered_data = data[polar_mask]

# Text features and string labels
X = filtered_data['tokenized_tweet']
y = filtered_data['emotion']

# Integer-encode the labels (Positive: 1, Negative: 0)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# TF-IDF features: vocabulary learned from the training split, applied unchanged to the test split
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Linear-kernel support vector classifier
svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_train_tfidf, y_train)

# Score the held-out tweets
y_pred = svm_classifier.predict(X_test_tfidf)

# Report accuracy, per-class metrics and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


A Support Vector Machine (SVM) classifier achieved an accuracy of approximately 86.2% on a binary classification task. The model exhibits good performance in correctly predicting class 1 with high precision and recall, resulting in a high F1-score for class 1. However, similar to the previous Naive Bayes model, it faces challenges in identifying class 0 instances, leading to lower precision, recall, and F1-score for class 0. The overall weighted average F1-score indicates a reasonably good overall performance.

The best binary classification model is Naive Bayes Model with Hyperparameter Tuning which achieved the highest accuracy of approximately 86.8% after hyperparameter tuning.
It's a strong candidate for this task because it's relatively simple, interpretable, and performs well on classifying sentiment.

### Multiclass classification

In [None]:
# Use every remaining tweet: joined tokens as features, emotion strings as labels
X = data['tokenized_tweet']
y = data['emotion']

# Integer-encode the emotion classes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# TF-IDF features: fit on the training split only, then transform the test split
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit Multinomial Naive Bayes on all classes and predict the held-out tweets
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)
y_pred = naive_bayes.predict(X_test_tfidf)

# Report accuracy, per-class metrics and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print("Multiclass Naive Bayes Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


K-Nearest Neighbors (KNN)

In [None]:
# All imports live in the first cell of the notebook (including KNeighborsClassifier).

# X and y are the multiclass features and labels assigned in the
# "Multiclass classification" cell above.
# Encode labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the vectorizer on the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize the KNN classifier
k = 5  # number of neighbours consulted per prediction; adjust to tune the model
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Fit the classifier on the training data
knn_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = knn_classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("KNN Accuracy:", accuracy)

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))




In [None]:
# Integer-encode the emotion labels held in y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# TF-IDF features: vocabulary learned from the training split and reused for the test split
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Random Forest with a configurable number of trees and a fixed seed
n_estimators = 100  # number of trees in the forest
random_forest_classifier = RandomForestClassifier(
    n_estimators=n_estimators,
    random_state=42,
)
random_forest_classifier.fit(X_train_tfidf, y_train)

# Predict the held-out tweets
y_pred = random_forest_classifier.predict(X_test_tfidf)

# Report accuracy, per-class metrics and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


The Random Forest classifier achieved an overall accuracy of approximately 0.69 in the multiclass text classification task. It performed well in identifying the neutral class (Label 1) with a high F1-Score of 0.78, indicating a good balance between precision and recall. However, its performance on the other classes (Label 0 and Label 2) was lower, particularly in terms of recall. To enhance model performance, strategies like data collection for minority classes and hyperparameter tuning could be explored.