# Import and Setup

In [1]:
import numpy as np 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore") 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('dataset/tweet_emotions.csv')
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


# EDA

### Removing ID, duplicate and Null Values

In [3]:
#If there is any null value throughout the row, remove it.
data = data.dropna()
data = data.reset_index(drop=True)
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


It was noticed that there are no null values present, therefore no values were dropped

In [4]:
# Remove the id column
data = data.drop(['tweet_id'], axis=1)
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [5]:
# Remove duplciate rows if they have the same "content" value. Also print the number of removed rows.
print("Number of duplicate rows before removing: ", data.duplicated().sum())
data = data.drop_duplicates(subset='content')
data = data.reset_index(drop=True)
print("Number of duplicate rows after removing: ", data.duplicated().sum())

Number of duplicate rows before removing:  91
Number of duplicate rows after removing:  0


### Cleaning Text

In [6]:
import re

# Function to clean text
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\W', ' ', text)
    #remove any @mentions
    text = re.sub(r'@\w+', '', text)
    #remove # from #hashtags
    text = re.sub(r'#', '', text)
    #remove any non-ascii characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    #remove any digits
    text = re.sub(r'\d', '', text)
    
    
    #Fix double or multiple spacing cause from removal
    text = re.sub(r'\s+', ' ', text)
    #Remove any leading or trailing spaces
    text = re.sub(r'^\s+|\s+?$', '', text)
    return text

# Apply the function to the "content" column
data['content'] = data['content'].apply(clean_text)

# Display the cleaned data
print(data.head())


    sentiment                                            content
0       empty  tiffanylue i know i was listenin to bad habit ...
1     sadness  Layin n bed with a headache ughhhh waitin on y...
2     sadness                     Funeral ceremony gloomy friday
3  enthusiasm                wants to hang out with friends SOON
4     neutral  dannycastillo We want to trade with someone wh...


Saving the cleaned data to a new file

In [7]:
data.to_csv('dataset/cleaned_tweet_emotions.csv', index=False)

In [8]:
df = pd.read_csv('dataset/cleaned_tweet_emotions.csv')
df.head()

Unnamed: 0,sentiment,content
0,empty,tiffanylue i know i was listenin to bad habit ...
1,sadness,Layin n bed with a headache ughhhh waitin on y...
2,sadness,Funeral ceremony gloomy friday
3,enthusiasm,wants to hang out with friends SOON
4,neutral,dannycastillo We want to trade with someone wh...


In [9]:
print(df.shape)
df = df.dropna()
df.isnull().sum()

(39827, 2)


sentiment    0
content      0
dtype: int64

In [10]:
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.sequence import pad_sequences

# Load Pre-trained GloVe Embeddings (200d)
def load_glove_embeddings(filepath):
    glove_dict = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_dict[word] = vector
    return glove_dict

# Convert tweets to TF-IDF weighted GloVe embeddings
def get_tweet_embedding(tweet, glove_dict, tfidf, feature_names):
    words = tweet.split()
    tweet_vector = np.zeros(200)  # GloVe 200d
    word_count = 0

    for word in words:
        if word in glove_dict and word in feature_names:
            weight = tfidf.get(word, 1)  # Default to 1 if word not in TF-IDF dict
            tweet_vector += weight * glove_dict[word]
            word_count += weight

    return tweet_vector / word_count if word_count != 0 else tweet_vector

# Load GloVe Embeddings
glove_path = "glove.twitter.27B.200d.txt"
glove_embeddings = load_glove_embeddings(glove_path)

# TF-IDF Vectorizer (Fitted on Your Dataset)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer.fit(df['content'])
feature_names = set(tfidf_vectorizer.get_feature_names_out())

# Get TF-IDF Scores
idf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

# Convert Tweets to GloVe Embeddings
df['embedding'] = df['content'].apply(lambda x: get_tweet_embedding(x, glove_embeddings, idf_scores, feature_names))

# Check the output
print(df.head())

    sentiment                                            content  \
0       empty  tiffanylue i know i was listenin to bad habit ...   
1     sadness  Layin n bed with a headache ughhhh waitin on y...   
2     sadness                     Funeral ceremony gloomy friday   
3  enthusiasm                wants to hang out with friends SOON   
4     neutral  dannycastillo We want to trade with someone wh...   

                                           embedding  
0  [-0.009687531020362074, -0.11519767683321691, ...  
1  [0.02141141140250999, -0.05418356123417788, -0...  
2  [-0.042901569450643766, -0.05677819936404877, ...  
3  [0.03292089124552941, 0.22727344006666284, 0.1...  
4  [0.08450802176830757, 0.374923210176829, 0.003...  


In [28]:
print(df.shape)
print(df.dtypes)

(39826, 3)
sentiment    object
content      object
embedding    object
dtype: object


In [13]:
#Save the new data frame with the embeddings to a new csv file
df.to_csv('dataset/embedded_tweet_emotions.csv', index=False)

In [14]:
df = pd.read_csv('dataset/embedded_tweet_emotions.csv')
df.head()

Unnamed: 0,sentiment,content,embedding
0,empty,tiffanylue i know i was listenin to bad habit ...,[-9.68753102e-03 -1.15197677e-01 1.60114471e-...
1,sadness,Layin n bed with a headache ughhhh waitin on y...,[ 2.14114114e-02 -5.41835612e-02 -4.90891329e-...
2,sadness,Funeral ceremony gloomy friday,[-4.29015695e-02 -5.67781994e-02 6.10557315e-...
3,enthusiasm,wants to hang out with friends SOON,[ 3.29208912e-02 2.27273440e-01 1.04936072e-...
4,neutral,dannycastillo We want to trade with someone wh...,[ 8.45080218e-02 3.74923210e-01 3.75330375e-...


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the dataset into train (70%) and temp (30%)
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)

# Split the temp dataset into validation (20% of original) and test (10% of original)
val_df, test_df = train_test_split(temp_df, test_size=1/3, random_state=42)

# Print the sizes of the splits to verify
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Train set size: 27878
Validation set size: 7965
Test set size: 3983


In [16]:
train_df.shape

(27878, 3)