Load Necessary Libraries and Data

In [4]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import json


Load the cleaned data saved during the exploration step

In [2]:
# Location of the cleaned Sentiment140 CSV saved during the exploration step
data_path = "../data/processed/cleaned_sentiment140.csv"

# Read the whole file into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows as a quick sanity check
data.head(5)


Unnamed: 0,target,id,date,flag,user,text,sentiment
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Negative
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,Negative
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,Negative
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,Negative
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",Negative


Define Cleaning Function

In [3]:
# Lazily-built NLTK resources shared by every clean_text call. Building the
# stopword set (and the stemmer) once instead of once per tweet matters when
# the function is applied to an entire DataFrame column.
_CLEAN_TEXT_CACHE = {}


def _get_stop_words():
    """Return the English stopword set, loading it from NLTK on first use."""
    if 'stop_words' not in _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    return _CLEAN_TEXT_CACHE['stop_words']


def _get_stemmer():
    """Return a shared English Snowball stemmer, creating it on first use."""
    if 'stemmer' not in _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stemmer'] = SnowballStemmer("english")
    return _CLEAN_TEXT_CACHE['stemmer']


def clean_text(text, stem=False):
    """
    Cleans the input text for downstream tokenization.

    Steps, in order: remove URLs, remove @mentions and #hashtags, drop every
    character that is not an ASCII letter or whitespace, lowercase, drop
    English stopwords, and optionally stem the remaining words.

    Args:
        text: Raw tweet text. Non-string values (e.g. None or NaN from a
            DataFrame column) are treated as missing and yield ''.
        stem: If True, apply the English Snowball stemmer to each kept word.

    Returns:
        The remaining words joined by single spaces ('' if nothing is left).
    """
    # Missing values would otherwise raise a TypeError inside re.sub
    if not isinstance(text, str):
        return ''

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    # Remove user mentions and hashtags
    text = re.sub(r"@\w+|#\w+", '', text)

    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", '', text)

    # Convert to lowercase and split on whitespace
    words = text.lower().split()
    if not words:
        # Nothing survived the regex passes; no need to consult the stopword list
        return ''

    # Remove stopwords
    stop_words = _get_stop_words()
    filtered_words = [word for word in words if word not in stop_words]

    # Optionally reduce each remaining word to its stem
    if stem:
        stemmer = _get_stemmer()
        filtered_words = [stemmer.stem(word) for word in filtered_words]

    return ' '.join(filtered_words)


Apply Cleaning

Example Before and After

Original: "RT @user: Check out our new product! http://example.com"

Cleaned: "rt check new product" (the retweet marker "RT" is not a stopword, so it survives in lowercase)

In [4]:
from tqdm import tqdm

# Register the progress_* methods on pandas objects so the long-running
# apply below reports its progress
tqdm.pandas()

# Run clean_text (default stem=False) over every raw tweet and keep the
# result in a new column next to the original text
raw_tweets = data['text']
data['cleaned_text'] = raw_tweets.progress_apply(clean_text)

# Persist the intermediate dataset, without the DataFrame index
cleaned_csv_path = "../data/processed/cleaned_texts.csv"
data.to_csv(cleaned_csv_path, index=False)


100%|██████████| 1581466/1581466 [06:25<00:00, 4099.63it/s]


Tokenize cleaned text to prepare for embedding generation

In [6]:
# Fit the vocabulary on the cleaned tweets, then map each tweet to its
# sequence of integer word ids
tokenizer = Tokenizer()
cleaned_tweets = data['cleaned_text']
tokenizer.fit_on_texts(cleaned_tweets)
data['tokens'] = tokenizer.texts_to_sequences(cleaned_tweets)

# Many deep learning models need fixed-length inputs, so every sequence is
# padded at the end ('post') up to max_length.
# NOTE(review): 300 looks much larger than the token count of a cleaned
# tweet; confirm it is intended and not a mix-up with the embedding size.
max_length = 300
data['padded_tokens'] = pad_sequences(
    data['tokens'],
    maxlen=max_length,
    padding='post',
).tolist()

# Write the tokenized dataset (including both token columns) to disk
tokenized_csv_path = "../data/processed/tokenized_texts.csv"
data.to_csv(tokenized_csv_path, index=False)


**Word2Vec Embeddings**

Train Word2Vec Model

In [None]:
# Word2Vec is already imported in the notebook's first cell.

# Prepare list of tokenized sentences from the *unpadded* id sequences.
# The padded column appends zeros to every tweet up to max_length; training
# on it would turn the padding id into a vocabulary "word" and fill the
# context windows with it.
tokenized_texts = data['tokens'].tolist()

# Train Word2Vec model (vocabulary keys are the Tokenizer's integer word ids;
# ids seen fewer than min_count times are dropped from the vocabulary)
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=300, window=5, min_count=5, workers=4)

# Save the model
word2vec_model.save("../models/word2vec/word2vec.model")


Generate Embedding for Tweets: Average the embeddings of words in a tweet to get a tweet-level embedding:

In [11]:
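# Optional: resume from the saved artifacts instead of re-running the cells above.
# NOTE: list-valued columns ('tokens', 'padded_tokens') are read back from CSV as
# strings and must be parsed (e.g. with ast.literal_eval) before use.
# word2vec_model = Word2Vec.load('../models/word2vec/word2vec.model')
# data = pd.read_csv('../data/processed/tokenized_texts.csv')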
def tweet_embedding(tokens, model):
    """
    Generates tweet-level embedding by averaging word embeddings.

    Args:
        tokens: Iterable of tokens for one tweet, using the same key type the
            model was trained on (integer word ids in this notebook).
        model: Trained Word2Vec model; `model.wv` holds the vectors and
            `model.vector_size` their dimensionality.

    Returns:
        1-D numpy array: the mean of the vectors of all tokens present in the
        model's vocabulary, or a zero vector of length `model.vector_size`
        when none of the tokens is in the vocabulary.
    """
    # Check membership against the stored keys themselves. With integer
    # tokens, `token in model.wv` is not a reliable vocabulary test: gensim's
    # KeyedVectors falls back to treating an int that is not a stored key as
    # a row position, so padding id 0 or an id pruned by min_count would
    # silently resolve to some unrelated word's vector.
    vocabulary = model.wv.key_to_index
    valid_embeddings = [model.wv[token] for token in tokens if token in vocabulary]

    if not valid_embeddings:
        return np.zeros(model.vector_size)
    return np.mean(valid_embeddings, axis=0)

# Compute embeddings for all tweets from the *unpadded* id sequences.
# 'padded_tokens' carries trailing zeros up to max_length; averaging over it
# would mix the padding id into every tweet's mean and drown out the real words.
data['embedding'] = data['tokens'].apply(lambda x: tweet_embedding(x, word2vec_model))

# Save tweet embeddings: serialize each vector as a JSON list so it survives
# the CSV round trip (the isinstance guard leaves already-serialized values
# untouched if this cell is re-run)
data['embedding'] = data['embedding'].apply(lambda x: json.dumps(x.tolist()) if isinstance(x, np.ndarray) else x)
data[['embedding', 'target']].to_csv("../data/processed/tweet_embeddings.csv", index=False)
