# Preprocess

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk


## Load DAtasets

In [14]:
pd.set_option('max_colwidth', 800)

TWEETS_PATH= "../data/raw_tweets_text.csv"
SENTIMENT_PATH="../data/t4sa_text_sentiment.tsv"

#load data
tweets_df = pd.read_csv(TWEETS_PATH, encoding='latin-1', header=0)
sentiment_df= pd.read_csv(SENTIMENT_PATH, sep='\t', header=0)

In [None]:
# Useful functions to help extract data from the columns

def extract_username_from_text(text):     # Extracts the username from retweets (e.g., "RT @user:").
    match = re.search(r'^RT @([^\s:]+):', text)
    return match.group(1) if match else None
     
def extract_links_from_text(text): # Extracts URLs from tweet text.
    urls = re.findall(r'https?://\S+', text)
    if not urls:
        return None
    return urls[0] if len(urls) == 1 else urls
           
def extract_hashtags_from_text(text): # Extract hashtags from the tweet text
    hashtags = re.findall(r'#\w+', text)
    if not hashtags:
        return None
    return hashtags[0] if len(hashtags) == 1 else hashtags
    
def extract_mentions_from_text(text): # Extract mentions from the tweet text
    cleaned_text = re.sub(r'^RT @[^\s:]+: ', '', text) # Remove the initial retweet username (e.g., "RT @user:")
    mentions = re.findall(r'@\w+', cleaned_text)
    if not mentions:
        return None
    return mentions[0] if len(mentions) == 1 else mentions



def clean_tweet_text(text: str):
    """
    Cleans the tweet text for EDA by removing noise such as:
    - Retweet prefixes (RT @user:)
    - URLs
    - HTML entities (e.g., &amp;)
    - Extra whitespace
    - Remove mentions
    - remove # symbol
    """
    # Remove retweet header
    text = re.sub(r'^RT @[^\s:]+: ', '', text)
    
    # Remove URLs
    # Remove URLs including malformed/truncated ones (e.g., "httpsâ")
    text = re.sub(r'https?\S+|www\.\S+', '', text)
    
    # Remove HTML entities like &amp;
    text = re.sub(r'&\w+;', '', text)
    
    # Remove extra spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()

    #remove metions
    text = re.sub(r'@\w+', '', text)
    # remove hashtags
    # text = re.sub(r'#', '\w', text)
    text = text.replace("#","", -1)
    
    return text

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
# nltk.download('stowords')
def rm_stop_words(tweet):
    tokenized_tw = word_tokenize(tweet)
    stop_words = set(stopwords.words('english'))
    result = [t for t in tokenized_tw if t.lower() not in stop_words ]
    return " ".join(result)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/vinceflores/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
merged_df = pd.merge(tweets_df, sentiment_df, left_on='id', right_on='TWID')
merged_df = merged_df.drop(columns=['TWID']) # since its alr in id

main_df = merged_df.copy()
# Start adding extra columns that might help us with visualizations
# main_df['is_retweet'] = main_df['text'].str.startswith('RT ')
# main_df['username'] = main_df['text'].apply(extract_username_from_text)
# # main_df['urls'] = main_df['text'].apply(extract_links_from_text)
# main_df['hashtags'] = main_df['text'].apply(extract_hashtags_from_text)
# main_df['mentions'] = main_df['text'].apply(extract_mentions_from_text)

main_df['text'] = main_df['text'].apply(clean_tweet_text)
main_df['text'] = main_df['text'].apply(rm_stop_words)
classes=['NEG', 'POS', 'NEU']
main_df['class'] = main_df[classes].idxmax(axis=1)
cols_to_drop = ['id', 'NEG', 'POS', 'NEU'  ]
main_df = main_df.drop(columns=cols_to_drop, axis=1)
main_df


Unnamed: 0,text,class
0,Josh Jenkins looking forward TAB Breeders Crown Super Sunday,POS
1,[ Pic ] Nichkhun krjeong86 's IG,NEU
2,Congratulations Pakistan becoming No1TestTeam world odds ! JI_PakZindabadRallies,POS
3,"September , taking Maine Mendozaâs surprise thanksgiving party threw fans !",POS
4,Incredible India Atulya Bharat - Land Seekers BeProud ð ð®ð³ : | : Plz RT,NEU
...,...,...
1179952,morning girls wonderful Friday,POS
1179953,RT Follow Colin Kaepernick debated merits Castro'sâ¦ - Mercury News,NEU
1179954,live webcam find download app,NEU
1179955,Pearl Roadshow 4-piece Complete Drum Set Cymb,NEU


In [68]:
!pip install sentence-transformers

10176.01s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.1.5-py3-none-any.whl.metadata (13 kB)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl.metadata (4.1 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl.metadata (4.9 kB)
Downloading sentence_transformers-5.1.

In [125]:
from sentence_transformers import SentenceTransformer
# Initialize the SBERT model
sbert = SentenceTransformer('bert-base-nli-mean-tokens')
sample_size = 1000
random_state = 33
neu = main_df[main_df['class'] == 'NEU'].sample(n=sample_size, random_state=random_state)
neg = main_df[main_df['class'] == 'NEG'].sample(n=sample_size, random_state=random_state)
pos = main_df[main_df['class'] == 'POS'].sample(n=sample_size, random_state=random_state)
# subset = main_df.head(1000)
subset = pd.concat([neu,neg, pos], ignore_index=True)

tweet_embeddings = sbert.encode(subset['text'])


In [123]:
subset['embeddings'] = tweet_embeddings.tolist()

In [124]:
subset

Unnamed: 0,text,class,embeddings
0,* * Re-tweet * * Art Photograph Reflections Past * * Re-tweet * * Click Link * *,NEU,"[0.729671061038971, 0.15178868174552917, 1.3483428955078125, 0.0994597002863884, 0.037863604724407196, -0.22710582613945007, -0.06202438846230507, 0.006082975305616856, -0.34925809502601624, -0.14914612472057343, 0.04790185019373894, -0.1282377988100052, 0.30615028738975525, 0.601817786693573, -0.03940784931182861, -0.11016742885112762, -0.9297025799751282, -0.09370468556880951, 0.30638688802719116, -0.6500803828239441, 0.459208220243454, -0.23195886611938477, -0.3554755747318268, -0.18342597782611847, 0.35728517174720764, -0.1648208349943161, 0.14454443752765656, -0.3978077471256256, -0.23921789228916168, -0.156537726521492, -0.6162621378898621, -0.5433502197265625, 0.5127397179603577, -0.11706546694040298, 0.7164474129676819, 0.049456071108579636, -0.19131846725940704, 0.059139054268..."
1,"1 4 shoppers read reviews right get acompanyâs site , start shop .",NEU,"[0.31316033005714417, 0.9355886578559875, 1.811059832572937, -0.0008475372451357543, -0.0024718360509723425, -0.22529788315296173, 0.16343964636325836, 0.5979213714599609, -0.1781509518623352, 0.4455304443836212, -0.04625540226697922, 1.314293384552002, 0.7153114676475525, 0.14328411221504211, -1.1299201250076294, 0.5169329643249512, -1.508275032043457, -0.5420059561729431, 0.2108854204416275, -0.6761074066162109, 0.004411423113197088, -1.0987106561660767, 0.3488311469554901, 0.5437254309654236, 0.5527242422103882, 0.7146580815315247, -0.5948919057846069, 0.020608695223927498, -0.24657176434993744, 0.23023821413516998, -1.2043803930282593, -0.33793899416923523, 1.3699432611465454, -1.0291965007781982, -0.2902086675167084, -0.24756750464439392, 0.5212261080741882, 0.6448280811309814, -0..."
2,PLAY Bingo - SPEND GBP10 GET PLAY GBP50 BGO -,NEU,"[-0.27733805775642395, 0.725353479385376, 0.46870073676109314, 0.438396692276001, 0.475509911775589, -0.8219553828239441, -0.7946497201919556, 0.4524596035480499, -0.20052747428417206, -0.1115824282169342, -0.4415002763271332, 0.9775832891464233, -0.018683746457099915, 0.5961475372314453, 0.2695848047733307, 0.26404693722724915, -0.1406232863664627, -0.0992400050163269, -0.11872755736112595, -0.7508854866027832, -0.060453563928604126, -0.16635456681251526, -0.5521876811981201, 0.08015000820159912, 1.2604644298553467, 0.3376392424106598, -0.059057798236608505, 0.08906716108322144, -0.8333666324615479, 0.47221043705940247, -0.30140602588653564, 1.4166942834854126, 0.581011176109314, -0.1550302803516388, 0.10775046795606613, 0.8077969551086426, -0.1162092387676239, -0.1923362761735916, 0...."
3,MARVELS Graphic Novel Autographed Signed Alex Ross + Kurt Busiek Marvel Comics,NEU,"[-0.534769594669342, 0.4708235263824463, 0.013562447391450405, 0.5580952167510986, 0.1135101318359375, -0.5522306561470032, 0.13528048992156982, -0.012637809850275517, -0.005910450126975775, -0.23380184173583984, -0.2689996659755707, 0.47119542956352234, 0.7456592321395874, -0.21907278895378113, -0.31222066283226013, 0.730396568775177, -1.494097113609314, 0.11814365535974503, -0.34429553151130676, -0.675474226474762, -0.04402821138501167, -1.131463646888733, 0.15761123597621918, 0.17813444137573242, 0.34508374333381653, 0.4596349895000458, 0.5057926177978516, -1.0451862812042236, -0.540762722492218, 0.7383738160133362, -0.45023486018180847, 0.025981580838561058, 0.04160591959953308, -1.2908453941345215, -0.3092661201953888, -0.14976847171783447, -0.5414276719093323, -0.2121934145689010..."
4,Ariana 's comment fans Instagram post :,NEU,"[0.7048339247703552, 0.252581924200058, 0.8728340268135071, -0.16289129853248596, -0.1855814903974533, -0.48806169629096985, 0.7275425791740417, 0.40978121757507324, 0.2746039032936096, -0.27360284328460693, -0.05584840103983879, -0.29790255427360535, 0.5827365517616272, 0.7679235339164734, -0.2464531660079956, -0.2675718069076538, -0.8261420130729675, 0.5112200379371643, 0.07927092164754868, -0.2949988543987274, 0.09279215335845947, -1.0026754140853882, 0.2955062985420227, -0.738278865814209, 0.5498116612434387, 0.630147397518158, 0.21141718327999115, -0.9092662930488586, -0.20964877307415009, 0.6261181831359863, 0.16681551933288574, 0.5172095894813538, -0.051637861877679825, 0.22263085842132568, 0.06139824911952019, -0.7183423042297363, 0.039866458624601364, 0.5479926466941833, -0.35..."
...,...,...,...
1495,'re cheering teammates good luck guys R.O.T . Trackandfield yusra mardini,POS,"[0.0239962600171566, 0.5832062363624573, 0.6376453042030334, 0.24164819717407227, -0.343766450881958, -0.7616662383079529, -0.16760662198066711, 0.16219425201416016, -0.04072095453739166, -0.3320407271385193, -0.18994314968585968, 1.1728118658065796, 0.3830030560493469, -0.40790027379989624, 0.2764773964881897, 0.36870190501213074, -0.007355520036071539, -0.7537166476249695, 0.5060945749282837, -0.4278671443462372, -0.9029951691627502, -0.4958585798740387, -0.4137243330478668, -0.6118894815444946, 0.33965468406677246, 1.619940996170044, -0.13505364954471588, -0.48273640871047974, -1.3423954248428345, 0.512275755405426, 0.21190014481544495, 1.3279297351837158, -0.13167992234230042, -0.3976321816444397, -0.17150461673736572, 0.3615413308143616, 1.4076515436172485, 0.2347143441438675, -0...."
1496,160825 EXO 1 MCountdown week ! Congrats EXO ! â¡ Lotto1stWin,POS,"[-0.18632251024246216, 0.567457377910614, 1.7480762004852295, 0.4879283607006073, 0.5367406606674194, -0.7838255167007446, -0.641719400882721, 1.2182743549346924, -0.5440629720687866, -0.11342354863882065, -0.5214548110961914, 0.6217606067657471, 0.3454342484474182, 0.4761921763420105, -0.11905264109373093, 0.3039974570274353, -0.7550141215324402, -0.23931950330734253, -0.3962486684322357, -1.117920994758606, -0.3088493347167969, -0.651213526725769, 0.4360089600086212, 0.0031076830346137285, 1.0071393251419067, -0.028861135244369507, -0.0762903243303299, -1.4845566749572754, -0.7485458850860596, 0.416029155254364, -0.47380560636520386, 0.5549128651618958, 0.34818553924560547, 0.11233588308095932, 0.3049173653125763, 0.9789987802505493, -0.8748305439949036, -0.12302206456661224, 0.32705..."
1497,'ve never excited open package knew contents,POS,"[0.6026562452316284, 0.7727192640304565, 0.8528310656547546, 0.24851925671100616, -0.13892638683319092, -0.3211049437522888, 1.515639305114746, 1.1169919967651367, -0.3538439869880676, 0.535138726234436, -0.8599181175231934, 0.026630690321326256, 0.585663914680481, 0.5848628878593445, 1.0441557168960571, 0.4172546863555908, -0.109222911298275, -0.5336502194404602, 0.30033665895462036, -0.7638596296310425, -0.7845579385757446, -0.07900182157754898, 0.44594088196754456, -1.2003982067108154, -0.1422732174396515, -0.3431224822998047, -0.2301710844039917, -1.3682094812393188, -0.9743009805679321, 0.5459522008895874, -0.21160557866096497, 0.680159866809845, 0.8326443433761597, -0.6040916442871094, 0.40289002656936646, 0.20216694474220276, -0.18229810893535614, -0.570885956287384, -0.43641680..."
1498,love anime really YuriOnIce ð­ð­ð­ððððð .. drawing know perfect ð « ð ' (,POS,"[-0.17124629020690918, 0.10130726546049118, 1.5728237628936768, 0.1499115526676178, -0.481702595949173, -1.3877036571502686, 0.27929067611694336, 0.072350412607193, 0.8858025074005127, -0.2937697768211365, -0.1539372354745865, 0.9570947885513306, 0.5834069848060608, 0.7796652913093567, 0.048101820051670074, 0.3517966866493225, -0.4278831481933594, -0.28051865100860596, 0.42605143785476685, -0.9157952666282654, -0.5255448818206787, -0.9647895097732544, -0.09635470807552338, -0.02274962142109871, 0.541068434715271, 0.7147146463394165, 0.4541626274585724, -0.3198074996471405, -0.4179367423057556, 0.6017965078353882, 0.11741333454847336, 0.6221216917037964, -0.09352188557386398, -0.6599223613739014, 0.4210352599620819, 0.8117976784706116, -0.16324520111083984, 1.3369160890579224, 0.1491108..."
