# Preprocess

In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vinceflores/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load DAtasets

In [3]:
pd.set_option('max_colwidth', 800)

TWEETS_PATH= "../data/raw_tweets_text.csv"
SENTIMENT_PATH="../data/t4sa_text_sentiment.tsv"

#load data
tweets_df = pd.read_csv(TWEETS_PATH, encoding='latin-1', header=0)
sentiment_df= pd.read_csv(SENTIMENT_PATH, sep='\t', header=0)

In [4]:
# Useful functions to help extract data from the columns

def extract_username_from_text(text):     # Extracts the username from retweets (e.g., "RT @user:").
    match = re.search(r'^RT @([^\s:]+):', text)
    return match.group(1) if match else None
     
def extract_links_from_text(text): # Extracts URLs from tweet text.
    urls = re.findall(r'https?://\S+', text)
    if not urls:
        return None
    return urls[0] if len(urls) == 1 else urls
           
def extract_hashtags_from_text(text): # Extract hashtags from the tweet text
    hashtags = re.findall(r'#\w+', text)
    if not hashtags:
        return None
    return hashtags[0] if len(hashtags) == 1 else hashtags
    
def extract_mentions_from_text(text): # Extract mentions from the tweet text
    cleaned_text = re.sub(r'^RT @[^\s:]+: ', '', text) # Remove the initial retweet username (e.g., "RT @user:")
    mentions = re.findall(r'@\w+', cleaned_text)
    if not mentions:
        return None
    return mentions[0] if len(mentions) == 1 else mentions



def clean_tweet_text(text: str):
    """
    Cleans the tweet text for EDA by removing noise such as:
    - Retweet prefixes (RT @user:)
    - URLs
    - HTML entities (e.g., &amp;)
    - Extra whitespace
    - Remove mentions
    - remove # symbol
    """
    # Remove retweet header
    text = re.sub(r'^RT @[^\s:]+: ', '', text)
    
    # Remove URLs
    # Remove URLs including malformed/truncated ones (e.g., "httpsâ")
    text = re.sub(r'https?\S+|www\.\S+', '', text)
    
    # Remove HTML entities like &amp;
    text = re.sub(r'&\w+;', '', text)
    
    # Remove extra spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()

    #remove metions
    text = re.sub(r'@\w+', '', text)
    # remove hashtags
    # text = re.sub(r'#', '\w', text)
    text = text.replace("#","", -1)
    
    return text

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
# nltk.download('stowords')
def rm_stop_words(tweet):
    tokenized_tw = word_tokenize(tweet)
    stop_words = set(stopwords.words('english'))
    result = [t for t in tokenized_tw if t.lower() not in stop_words ]
    return " ".join(result)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/vinceflores/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
merged_df = pd.merge(tweets_df, sentiment_df, left_on='id', right_on='TWID')
merged_df = merged_df.drop(columns=['TWID']) # since its alr in id

main_df = merged_df.copy()
# Start adding extra columns that might help us with visualizations
# main_df['is_retweet'] = main_df['text'].str.startswith('RT ')
# main_df['username'] = main_df['text'].apply(extract_username_from_text)
# # main_df['urls'] = main_df['text'].apply(extract_links_from_text)
# main_df['hashtags'] = main_df['text'].apply(extract_hashtags_from_text)
# main_df['mentions'] = main_df['text'].apply(extract_mentions_from_text)

main_df['text'] = main_df['text'].apply(clean_tweet_text)
main_df['text'] = main_df['text'].apply(rm_stop_words)
classes=['NEG', 'POS', 'NEU']
main_df['class'] = main_df[classes].idxmax(axis=1)
cols_to_drop = ['id', 'NEG', 'POS', 'NEU'  ]
main_df = main_df.drop(columns=cols_to_drop, axis=1)
main_df


Unnamed: 0,text,class
0,Josh Jenkins looking forward TAB Breeders Crown Super Sunday,POS
1,[ Pic ] Nichkhun krjeong86 's IG,NEU
2,Congratulations Pakistan becoming No1TestTeam world odds ! JI_PakZindabadRallies,POS
3,"September , taking Maine Mendozaâs surprise thanksgiving party threw fans !",POS
4,Incredible India Atulya Bharat - Land Seekers BeProud ð ð®ð³ : | : Plz RT,NEU
...,...,...
1179952,morning girls wonderful Friday,POS
1179953,RT Follow Colin Kaepernick debated merits Castro'sâ¦ - Mercury News,NEU
1179954,live webcam find download app,NEU
1179955,Pearl Roadshow 4-piece Complete Drum Set Cymb,NEU


In [7]:
saved_path = "../data/processed_tweets_sentiment.csv"
main_df.to_csv(saved_path, index=False)

In [8]:
!pip install sentence-transformers



In [9]:
from sentence_transformers import SentenceTransformer
# Initialize the SBERT model
sbert = SentenceTransformer('bert-base-nli-mean-tokens')
sample_size = 5000
random_state = 33
neu = main_df[main_df['class'] == 'NEU'].sample(n=sample_size, random_state=random_state)
neg = main_df[main_df['class'] == 'NEG'].sample(n=sample_size, random_state=random_state)
pos = main_df[main_df['class'] == 'POS'].sample(n=sample_size, random_state=random_state)
# subset = main_df.head(1000)
subset = pd.concat([neu,neg, pos], ignore_index=True)

# tweet_embeddings = sbert.encode(subset['text'])


In [10]:
# subset['embeddings'] = tweet_embeddings.tolist()


In [11]:
# save_path="../project_data/t4sa_data.csv"
# subset.to_csv(save_path, index=False)

# Embedding Using Word2vec

In [12]:
! pip install gensim



In [13]:
dataset = main_df.loc[main_df['class'].isin(['NEG', 'NEU', 'POS'])]
dataset.columns

Index(['text', 'class'], dtype='object')

In [14]:

# tokenized_tw = word_tokenize(tweet)
dataset["tokenized_text"] = dataset["text"].apply(lambda x: word_tokenize(x))

In [15]:
# dataset["tokenized_text"]
dataset

Unnamed: 0,text,class,tokenized_text
0,Josh Jenkins looking forward TAB Breeders Crown Super Sunday,POS,"[Josh, Jenkins, looking, forward, TAB, Breeders, Crown, Super, Sunday]"
1,[ Pic ] Nichkhun krjeong86 's IG,NEU,"[[, Pic, ], Nichkhun, krjeong86, 's, IG]"
2,Congratulations Pakistan becoming No1TestTeam world odds ! JI_PakZindabadRallies,POS,"[Congratulations, Pakistan, becoming, No1TestTeam, world, odds, !, JI_PakZindabadRallies]"
3,"September , taking Maine Mendozaâs surprise thanksgiving party threw fans !",POS,"[September, ,, taking, Maine, Mendozaâs, surprise, thanksgiving, party, threw, fans, !]"
4,Incredible India Atulya Bharat - Land Seekers BeProud ð ð®ð³ : | : Plz RT,NEU,"[Incredible, India, Atulya, Bharat, -, Land, Seekers, BeProud, ð, ð®ð³, :, |, :, Plz, RT]"
...,...,...,...
1179952,morning girls wonderful Friday,POS,"[morning, girls, wonderful, Friday]"
1179953,RT Follow Colin Kaepernick debated merits Castro'sâ¦ - Mercury News,NEU,"[RT, Follow, Colin, Kaepernick, debated, merits, Castro'sâ¦, -, Mercury, News]"
1179954,live webcam find download app,NEU,"[live, webcam, find, download, app]"
1179955,Pearl Roadshow 4-piece Complete Drum Set Cymb,NEU,"[Pearl, Roadshow, 4-piece, Complete, Drum, Set, Cymb]"


In [16]:
# reference https://radimrehurek.com/gensim/models/word2vec.html
from gensim.test.utils import lee_corpus_list
from gensim.models import Word2Vec
sentences = dataset['tokenized_text']

vector_size=10
window=5
min_count=1
workers=4

w2v = Word2Vec(sentences,vector_size=vector_size, window=window, min_count=min_count, workers=workers, sg=1)

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

In [17]:
def create_w2v_embeddings(row, model):
    e = []
    for token in row:
        # e.append(np.mean( model.wv[token]))
        # e.append(np.mean())
        e.append( model.wv[token])
    return e
dataset['embeddings'] = dataset["tokenized_text"].apply(lambda row: create_w2v_embeddings(row, w2v))


In [18]:

dataset['embeddings_flatten'] = dataset["embeddings"].apply(lambda row: np.array(row).reshape(-1).tolist())

In [19]:
dataset.head()

Unnamed: 0,text,class,tokenized_text,embeddings,embeddings_flatten
0,Josh Jenkins looking forward TAB Breeders Crown Super Sunday,POS,"[Josh, Jenkins, looking, forward, TAB, Breeders, Crown, Super, Sunday]","[[-0.113853954, 0.61426413, 1.0451182, 0.893222, 1.2579755, 0.35145238, 1.4110141, 1.1207917, -1.3057975, -0.72572553], [-0.35403937, 1.0621666, 0.74526286, 0.7480261, 1.2924874, 0.7369891, 1.1482413, 0.80118775, -1.3500674, -0.44346103], [-0.2957207, -0.53411484, 0.6827601, -0.28330567, 1.731146, -0.55176055, 1.518203, 0.25636676, -1.2183276, -1.5224853], [0.051040474, -0.7260078, -0.1618401, 0.46670628, 1.2540244, -0.9407606, 2.0366619, 0.47104907, -1.8512024, -2.0404606], [-1.2794154, 0.39127317, 0.14349873, -0.98931277, 0.0033254607, 1.4170622, 0.5298591, 1.460339, -0.6791693, -1.0214216], [-0.25402245, -0.0016779394, 0.08299904, -0.16346505, 0.30869868, 0.54760236, 0.59448117, 0.46310663, -0.84121794, -0.5000733], [-0.16067526, 0.60375583, 0.3016045, -0.6323253, 0.99004763, 0.8514...","[-0.11385395377874374, 0.6142641305923462, 1.0451182126998901, 0.8932219743728638, 1.257975459098816, 0.35145238041877747, 1.4110140800476074, 1.1207916736602783, -1.3057974576950073, -0.725725531578064, -0.35403937101364136, 1.0621665716171265, 0.745262861251831, 0.7480260729789734, 1.292487382888794, 0.7369890809059143, 1.1482412815093994, 0.8011877536773682, -1.350067377090454, -0.44346103072166443, -0.2957206964492798, -0.5341148376464844, 0.6827601194381714, -0.28330567479133606, 1.731145977973938, -0.5517605543136597, 1.5182030200958252, 0.2563667595386505, -1.2183276414871216, -1.5224852561950684, 0.05104047432541847, -0.7260078191757202, -0.16184009611606598, 0.4667062759399414, 1.2540243864059448, -0.940760612487793, 2.0366618633270264, 0.47104907035827637, -1.851202368736267,..."
1,[ Pic ] Nichkhun krjeong86 's IG,NEU,"[[, Pic, ], Nichkhun, krjeong86, 's, IG]","[[-1.5693468, 2.049658, 1.137492, 0.08750274, 0.54676276, 0.63230544, 0.6729003, 0.655981, -1.3121158, -1.4043839], [-0.4361653, 1.1729405, 0.8765881, 0.028632604, 0.64229333, 0.8665044, 1.1770998, 0.4567495, -0.93769294, -0.7557075], [-1.6565623, 2.1175146, 1.2094496, 0.18033363, 0.53936005, 0.6481038, 0.6693481, 0.6759012, -1.2341661, -1.5299729], [-1.5265535, 1.191525, 0.9147903, 0.7891117, 0.8964215, 1.0092945, 0.59716886, 0.4279841, -0.52998984, -1.2941233], [-0.16759256, 0.009424157, -0.011900795, 0.041153755, 0.16186744, 0.25351366, -0.008999251, -0.025426328, -0.048780646, -0.15624174], [-0.7074431, 0.23060036, 1.0581964, -0.19098715, 1.2275993, -0.31913444, 1.7760433, 0.6697373, -1.1851416, -0.5484665], [-1.1426878, 1.2944506, 1.9087994, 0.3696698, 1.1195431, 0.5566722, 0.6645...","[-1.5693467855453491, 2.0496580600738525, 1.1374919414520264, 0.08750274032354355, 0.5467627644538879, 0.6323054432868958, 0.6729003190994263, 0.6559810042381287, -1.3121157884597778, -1.404383897781372, -0.43616530299186707, 1.1729404926300049, 0.8765881061553955, 0.028632603585720062, 0.6422933340072632, 0.8665043711662292, 1.1770998239517212, 0.45674949884414673, -0.9376929402351379, -0.7557075023651123, -1.656562328338623, 2.1175146102905273, 1.2094496488571167, 0.18033362925052643, 0.5393600463867188, 0.6481037735939026, 0.6693481206893921, 0.6759011745452881, -1.234166145324707, -1.5299729108810425, -1.5265535116195679, 1.1915249824523926, 0.9147902727127075, 0.7891116738319397, 0.896421492099762, 1.0092945098876953, 0.5971688628196716, 0.4279840886592865, -0.5299898386001587, -1..."
2,Congratulations Pakistan becoming No1TestTeam world odds ! JI_PakZindabadRallies,POS,"[Congratulations, Pakistan, becoming, No1TestTeam, world, odds, !, JI_PakZindabadRallies]","[[0.1317918, -0.14983346, 0.8323126, 0.60160947, 0.096248366, 0.20120004, 1.9938416, 0.5724737, -1.4372393, -1.0748318], [-0.61821544, -0.9514865, 1.4258574, 1.1346506, 0.83760864, -0.30233258, 0.6428912, 0.78054726, -2.247871, -0.23723312], [-0.5618646, -0.24005282, 0.8093832, 0.570356, 1.4414698, -0.87038577, 1.4082448, 0.7151696, -1.7549598, -0.96081567], [-0.20308416, -0.21988553, 0.23089968, 0.19098568, 0.120294034, 0.18269442, 0.19150297, 0.15784732, -0.32372716, -0.33736727], [-0.5430941, -0.6969517, 1.4222604, 0.0020438214, 1.4931902, -0.50268584, 1.5362289, 0.1775604, -1.6475046, -0.72135514], [-0.13242017, -0.44478917, 1.1298097, 0.1410925, -0.051907368, -0.4613592, 0.114695534, 1.8434645, -2.2045116, -1.8791926], [-0.9816768, -0.011558109, 1.5081965, -0.84687155, 0.39234146,...","[0.13179180026054382, -0.14983345568180084, 0.8323125839233398, 0.601609468460083, 0.09624836593866348, 0.20120003819465637, 1.9938416481018066, 0.5724737048149109, -1.4372392892837524, -1.0748318433761597, -0.6182154417037964, -0.9514865279197693, 1.425857424736023, 1.1346505880355835, 0.8376086354255676, -0.3023325800895691, 0.6428912281990051, 0.7805472612380981, -2.247870922088623, -0.23723311722278595, -0.5618646144866943, -0.24005281925201416, 0.80938321352005, 0.570356011390686, 1.4414697885513306, -0.8703857660293579, 1.4082448482513428, 0.7151696085929871, -1.7549598217010498, -0.9608156681060791, -0.20308415591716766, -0.21988552808761597, 0.23089967668056488, 0.19098567962646484, 0.12029403448104858, 0.1826944202184677, 0.19150297343730927, 0.1578473150730133, -0.32372716069..."
3,"September , taking Maine Mendozaâs surprise thanksgiving party threw fans !",POS,"[September, ,, taking, Maine, Mendozaâs, surprise, thanksgiving, party, threw, fans, !]","[[-0.0049647824, 0.6794055, 1.3684819, -0.16022235, 0.045573995, 0.20250203, 1.3106245, 1.185169, -2.2042248, -0.9977927], [-0.6456102, 0.08905081, 1.4265618, -0.68143284, 0.85505027, -0.114314064, 1.467009, 0.34711474, -1.4367783, -0.19236237], [-0.3329712, -0.25819528, 1.2613918, -0.07140509, 1.650897, -0.48520967, 1.1769776, 0.39086235, -1.4905738, -1.0660741], [-0.046635196, -0.08582323, 0.8724152, 0.33013925, 0.829082, 0.37980345, 1.7311515, -0.07346158, -1.7308449, -0.17906837], [-0.19368845, -0.033638056, 0.094482854, -0.023936411, 0.063258104, 0.14134964, 0.08507232, 0.019524649, -0.29701287, -0.22392671], [-0.10218446, -0.2608329, 1.2447171, -0.26831186, 1.3140522, -0.20157662, 1.6495348, 0.20646363, -1.3288006, -1.3332254], [-0.38250965, -1.2547939, 0.9450447, 0.05225152, 0.9...","[-0.004964782390743494, 0.6794055104255676, 1.3684818744659424, -0.1602223515510559, 0.045573994517326355, 0.2025020271539688, 1.3106244802474976, 1.185168981552124, -2.2042248249053955, -0.9977927207946777, -0.6456102132797241, 0.08905080705881119, 1.4265618324279785, -0.681432843208313, 0.855050265789032, -0.11431406438350677, 1.4670089483261108, 0.3471147418022156, -1.4367783069610596, -0.19236236810684204, -0.3329711854457855, -0.25819528102874756, 1.2613917589187622, -0.0714050903916359, 1.6508970260620117, -0.4852096736431122, 1.1769776344299316, 0.3908623456954956, -1.490573763847351, -1.0660741329193115, -0.04663519561290741, -0.08582323044538498, 0.8724151849746704, 0.33013924956321716, 0.8290820121765137, 0.37980344891548157, 1.7311514616012573, -0.07346157729625702, -1.73084..."
4,Incredible India Atulya Bharat - Land Seekers BeProud ð ð®ð³ : | : Plz RT,NEU,"[Incredible, India, Atulya, Bharat, -, Land, Seekers, BeProud, ð, ð®ð³, :, |, :, Plz, RT]","[[-0.560966, 0.81614155, 0.48802778, -0.451831, 1.1719334, -0.27217785, 1.3391999, -0.061624773, -1.7772133, -0.6611051], [-0.5550094, -0.27038404, 0.9148904, 0.09105802, 0.8292549, -0.4214089, 0.42687744, 0.87987673, -2.370926, -0.16857876], [-0.2705117, -0.008119681, 0.042114966, -0.040911704, 0.043512832, 0.11074387, -0.028262112, 0.09549509, -0.20002747, -0.09689449], [-0.569798, -0.2648929, 0.63300157, 0.61078155, 0.25789845, 0.09771101, 0.7963846, 0.2933108, -1.3764836, -0.33539733], [-0.7432041, 0.935155, 1.038433, -0.8375927, 0.5833445, 0.0007826227, 0.84535325, 0.7457703, -1.9910955, -0.333183], [-0.4035647, 0.7596336, 0.35174456, -0.39126003, 1.0375931, 0.35717884, 1.0176176, 0.5628632, -2.2903967, 0.065817565], [-0.56045014, 0.2545408, 0.0787897, -0.115728095, 0.06630278, 0....","[-0.5609660148620605, 0.8161415457725525, 0.48802778124809265, -0.45183101296424866, 1.1719334125518799, -0.2721778452396393, 1.3391999006271362, -0.06162477284669876, -1.7772133350372314, -0.6611050963401794, -0.5550094246864319, -0.2703840434551239, 0.9148904085159302, 0.09105802327394485, 0.8292549252510071, -0.42140889167785645, 0.4268774390220642, 0.8798767328262329, -2.3709259033203125, -0.16857875883579254, -0.2705116868019104, -0.008119680918753147, 0.04211496561765671, -0.040911704301834106, 0.04351283237338066, 0.11074387282133102, -0.02826211228966713, 0.09549508988857269, -0.2000274658203125, -0.09689448773860931, -0.5697979927062988, -0.26489290595054626, 0.6330015659332275, 0.6107815504074097, 0.2578984498977661, 0.09771101176738739, 0.7963845729827881, 0.2933107912540436..."


In [21]:

save_path="../project_data/t4sa_data_w2v_sg.csv"
subset.to_csv(save_path, index=False)