# Preprocess

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rinoj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load DAtasets

In [2]:
pd.set_option('max_colwidth', 800)

TWEETS_PATH= "../data/raw_tweets_text.csv"
SENTIMENT_PATH="../data/t4sa_text_sentiment.tsv"

#load data
tweets_df = pd.read_csv(TWEETS_PATH, encoding='latin-1', header=0)
sentiment_df= pd.read_csv(SENTIMENT_PATH, sep='\t', header=0)

In [3]:
# Useful functions to help extract data from the columns

def extract_username_from_text(text):     # Extracts the username from retweets (e.g., "RT @user:").
    match = re.search(r'^RT @([^\s:]+):', text)
    return match.group(1) if match else None
     
def extract_links_from_text(text): # Extracts URLs from tweet text.
    urls = re.findall(r'https?://\S+', text)
    if not urls:
        return None
    return urls[0] if len(urls) == 1 else urls
           
def extract_hashtags_from_text(text): # Extract hashtags from the tweet text
    hashtags = re.findall(r'#\w+', text)
    if not hashtags:
        return None
    return hashtags[0] if len(hashtags) == 1 else hashtags
    
def extract_mentions_from_text(text): # Extract mentions from the tweet text
    cleaned_text = re.sub(r'^RT @[^\s:]+: ', '', text) # Remove the initial retweet username (e.g., "RT @user:")
    mentions = re.findall(r'@\w+', cleaned_text)
    if not mentions:
        return None
    return mentions[0] if len(mentions) == 1 else mentions



def clean_tweet_text(text: str):
    """
    Cleans the tweet text for EDA by removing noise such as:
    - Retweet prefixes (RT @user:)
    - URLs
    - HTML entities (e.g., &amp;)
    - Extra whitespace
    - Remove mentions
    - remove # symbol
    """
    # Remove retweet header
    text = re.sub(r'^RT @[^\s:]+: ', '', text)
    
    # Remove URLs
    # Remove URLs including malformed/truncated ones (e.g., "httpsâ")
    text = re.sub(r'https?\S+|www\.\S+', '', text)
    
    # Remove HTML entities like &amp;
    text = re.sub(r'&\w+;', '', text)
    
    # Remove extra spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()

    #remove metions
    text = re.sub(r'@\w+', '', text)
    # remove hashtags
    # text = re.sub(r'#', '\w', text)
    text = text.replace("#","", -1)
    
    return text

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
# nltk.download('stowords')
def rm_stop_words(tweet):
    tokenized_tw = word_tokenize(tweet)
    stop_words = set(stopwords.words('english'))
    result = [t for t in tokenized_tw if t.lower() not in stop_words ]
    return " ".join(result)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rinoj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
merged_df = pd.merge(tweets_df, sentiment_df, left_on='id', right_on='TWID')
merged_df = merged_df.drop(columns=['TWID']) # since its alr in id

main_df = merged_df.copy()
# Start adding extra columns that might help us with visualizations
# main_df['is_retweet'] = main_df['text'].str.startswith('RT ')
# main_df['username'] = main_df['text'].apply(extract_username_from_text)
# # main_df['urls'] = main_df['text'].apply(extract_links_from_text)
# main_df['hashtags'] = main_df['text'].apply(extract_hashtags_from_text)
# main_df['mentions'] = main_df['text'].apply(extract_mentions_from_text)

main_df['text'] = main_df['text'].apply(clean_tweet_text)
main_df['text'] = main_df['text'].apply(rm_stop_words)
classes=['NEG', 'POS', 'NEU']
main_df['class'] = main_df[classes].idxmax(axis=1)
cols_to_drop = ['id', 'NEG', 'POS', 'NEU'  ]
main_df = main_df.drop(columns=cols_to_drop, axis=1)
main_df


Unnamed: 0,text,class
0,Josh Jenkins looking forward TAB Breeders Crown Super Sunday,POS
1,[ Pic ] Nichkhun krjeong86 's IG,NEU
2,Congratulations Pakistan becoming No1TestTeam world odds ! JI_PakZindabadRallies,POS
3,"September , taking Maine Mendozaâs surprise thanksgiving party threw fans !",POS
4,Incredible India Atulya Bharat - Land Seekers BeProud ð ð®ð³ : | : Plz RT,NEU
...,...,...
1179952,morning girls wonderful Friday,POS
1179953,RT Follow Colin Kaepernick debated merits Castro'sâ¦ - Mercury News,NEU
1179954,live webcam find download app,NEU
1179955,Pearl Roadshow 4-piece Complete Drum Set Cymb,NEU


In [6]:
saved_path = "../data/processed_tweets_sentiment.csv"
main_df.to_csv(saved_path, index=False)

In [7]:
!pip install sentence-transformers



In [8]:
from sentence_transformers import SentenceTransformer
# Initialize the SBERT model
sbert = SentenceTransformer('bert-base-nli-mean-tokens')
sample_size = 5000
random_state = 33
neu = main_df[main_df['class'] == 'NEU'].sample(n=sample_size, random_state=random_state)
neg = main_df[main_df['class'] == 'NEG'].sample(n=sample_size, random_state=random_state)
pos = main_df[main_df['class'] == 'POS'].sample(n=sample_size, random_state=random_state)
# subset = main_df.head(1000)
subset = pd.concat([neu,neg, pos], ignore_index=True)

# tweet_embeddings = sbert.encode(subset['text'])


In [9]:
# subset['embeddings'] = tweet_embeddings.tolist()


In [10]:
# save_path="../project_data/t4sa_data.csv"
# subset.to_csv(save_path, index=False)

# Embedding Using Word2vec

In [11]:
! pip install gensim



In [12]:
dataset = main_df.loc[main_df['class'].isin(['NEG', 'NEU', 'POS'])]
dataset.columns

Index(['text', 'class'], dtype='object')

In [13]:

# tokenized_tw = word_tokenize(tweet)
dataset["tokenized_text"] = dataset["text"].apply(lambda x: word_tokenize(x))

In [14]:
# dataset["tokenized_text"]
dataset

Unnamed: 0,text,class,tokenized_text
0,Josh Jenkins looking forward TAB Breeders Crown Super Sunday,POS,"[Josh, Jenkins, looking, forward, TAB, Breeders, Crown, Super, Sunday]"
1,[ Pic ] Nichkhun krjeong86 's IG,NEU,"[[, Pic, ], Nichkhun, krjeong86, 's, IG]"
2,Congratulations Pakistan becoming No1TestTeam world odds ! JI_PakZindabadRallies,POS,"[Congratulations, Pakistan, becoming, No1TestTeam, world, odds, !, JI_PakZindabadRallies]"
3,"September , taking Maine Mendozaâs surprise thanksgiving party threw fans !",POS,"[September, ,, taking, Maine, Mendozaâs, surprise, thanksgiving, party, threw, fans, !]"
4,Incredible India Atulya Bharat - Land Seekers BeProud ð ð®ð³ : | : Plz RT,NEU,"[Incredible, India, Atulya, Bharat, -, Land, Seekers, BeProud, ð, ð®ð³, :, |, :, Plz, RT]"
...,...,...,...
1179952,morning girls wonderful Friday,POS,"[morning, girls, wonderful, Friday]"
1179953,RT Follow Colin Kaepernick debated merits Castro'sâ¦ - Mercury News,NEU,"[RT, Follow, Colin, Kaepernick, debated, merits, Castro'sâ¦, -, Mercury, News]"
1179954,live webcam find download app,NEU,"[live, webcam, find, download, app]"
1179955,Pearl Roadshow 4-piece Complete Drum Set Cymb,NEU,"[Pearl, Roadshow, 4-piece, Complete, Drum, Set, Cymb]"


In [15]:
# reference https://radimrehurek.com/gensim/models/word2vec.html
from gensim.test.utils import lee_corpus_list
from gensim.models import Word2Vec
sentences = dataset['tokenized_text']

vector_size=10
window=5
min_count=1
workers=4

w2v = Word2Vec(sentences,vector_size=vector_size, window=window, min_count=min_count, workers=workers, sg=1)

In [16]:
def create_w2v_embeddings(row, model):
    e = []
    for token in row:
        # e.append(np.mean( model.wv[token]))
        # e.append(np.mean())
        e.append( model.wv[token])
    return e
dataset['embeddings'] = dataset["tokenized_text"].apply(lambda row: create_w2v_embeddings(row, w2v))


In [17]:

dataset['embeddings_flatten'] = dataset["embeddings"].apply(lambda row: np.array(row).reshape(-1).tolist())

In [18]:
dataset.head()

Unnamed: 0,text,class,tokenized_text,embeddings,embeddings_flatten
0,Josh Jenkins looking forward TAB Breeders Crown Super Sunday,POS,"[Josh, Jenkins, looking, forward, TAB, Breeders, Crown, Super, Sunday]","[[0.3582419, 0.18251947, 1.9217814, -0.20445539, 1.1283869, 0.7661418, 1.3497233, 0.23502421, -1.166703, -0.70630544], [-0.08354035, 0.6222627, 1.4820064, -0.1301001, 1.2373321, 1.0689685, 1.3503928, 0.049593333, -1.384748, -0.27793354], [0.12135473, -1.1814454, 0.6621838, -0.12810785, 1.554187, -0.021069277, 1.6151189, 0.045077346, -0.8355202, -1.620781], [0.41514114, -1.466933, 0.9285544, 0.0059803627, 0.5874255, 0.061921105, 2.0992165, -0.18720406, -1.3102454, -2.0553162], [-0.8593822, 0.58975685, 0.066101335, -1.0958154, 0.80332154, 1.0839456, -0.103754066, 1.450351, -1.0473042, -0.8947164], [-0.18361805, -0.15170948, 0.26730743, -0.2519849, 0.36682746, 0.5033006, 0.49415028, 0.5473637, -0.6746402, -0.5838819], [0.12132321, 0.05763937, 0.31952178, -0.8365723, 0.8706728, 0.9711598, ...","[0.35824188590049744, 0.18251946568489075, 1.9217814207077026, -0.2044553905725479, 1.1283868551254272, 0.7661417722702026, 1.3497233390808105, 0.23502421379089355, -1.1667029857635498, -0.7063054442405701, -0.08354035019874573, 0.622262716293335, 1.4820064306259155, -0.1301001012325287, 1.2373321056365967, 1.0689685344696045, 1.3503928184509277, 0.04959333315491676, -1.3847479820251465, -0.2779335379600525, 0.12135472893714905, -1.1814453601837158, 0.6621838212013245, -0.12810784578323364, 1.5541870594024658, -0.021069277077913284, 1.6151188611984253, 0.04507734626531601, -0.8355202078819275, -1.6207809448242188, 0.41514113545417786, -1.466933012008667, 0.9285544157028198, 0.0059803626500070095, 0.5874254703521729, 0.06192110478878021, 2.0992164611816406, -0.18720406293869019, -1.3102..."
1,[ Pic ] Nichkhun krjeong86 's IG,NEU,"[[, Pic, ], Nichkhun, krjeong86, 's, IG]","[[-0.75351024, 1.2079241, 1.2369695, -1.2387527, 1.5484711, 0.6237923, 0.2972262, -0.16343136, -2.142214, -0.88716614], [0.046789777, 0.11305293, 1.0064615, -0.8896418, 1.1400238, 0.81399894, 0.65282476, 0.16643396, -1.425934, -0.10510034], [-0.84279376, 1.2765069, 1.3343263, -1.2455893, 1.6275268, 0.65181994, 0.16332741, -0.21489285, -2.1501064, -0.91703606], [-1.0716783, 0.5522413, 1.3653101, -0.49872786, 1.566279, 1.0774457, 0.09379782, -0.26345158, -1.1228722, -0.74370074], [-0.21502753, -0.070828564, 0.05452549, -0.022869254, 0.1897666, 0.23406692, -0.06490679, -0.06895785, -0.10261813, -0.17049238], [-0.10359563, 0.016930845, 1.2278746, -0.6906755, 1.3840696, -0.027641848, 1.8159302, 0.2755489, -0.7837052, -1.1959889], [-0.45797083, 0.004340074, 1.4885658, -0.6926841, 2.0987225, ...","[-0.7535102367401123, 1.2079241275787354, 1.2369694709777832, -1.2387527227401733, 1.5484710931777954, 0.623792290687561, 0.2972261905670166, -0.16343136131763458, -2.142214059829712, -0.8871661424636841, 0.04678977653384209, 0.11305292695760727, 1.006461501121521, -0.8896418213844299, 1.1400238275527954, 0.8139989376068115, 0.6528247594833374, 0.16643396019935608, -1.4259339570999146, -0.10510034114122391, -0.8427937626838684, 1.2765069007873535, 1.3343262672424316, -1.245589256286621, 1.6275267601013184, 0.6518199443817139, 0.1633274108171463, -0.21489284932613373, -2.150106430053711, -0.9170360565185547, -1.0716782808303833, 0.552241325378418, 1.3653100728988647, -0.49872785806655884, 1.5662790536880493, 1.0774457454681396, 0.09379781782627106, -0.26345157623291016, -1.1228722333908..."
2,Congratulations Pakistan becoming No1TestTeam world odds ! JI_PakZindabadRallies,POS,"[Congratulations, Pakistan, becoming, No1TestTeam, world, odds, !, JI_PakZindabadRallies]","[[0.29574108, -0.6669113, 1.8421237, -0.8171102, -0.031163124, 0.5472434, 1.3864442, 0.24019828, -1.1582527, -0.87545305], [-0.29139698, -0.6549019, 1.9955342, 1.038911, 0.5099396, -0.3953713, 1.210235, 0.90510356, -1.6559097, -0.78929925], [-0.07963996, -0.44993085, 1.3783813, 0.19470294, 0.95776653, -0.36575076, 1.8659443, 0.13527583, -1.2024026, -1.4889848], [-0.1921801, -0.18349788, 0.27471122, 0.09712645, 0.07022549, 0.21517803, 0.16847639, 0.23396118, -0.24593984, -0.3038698], [-0.22814901, -1.1873789, 1.3085599, 0.0013433842, 1.3005759, -0.4244075, 1.7852299, 0.32342303, -0.9904377, -1.1181335], [0.17874052, 0.30164796, 1.825731, -0.15549201, -0.14732185, -0.2105276, 0.6745168, 1.4087052, -1.4826243, -2.4882586], [-0.5291331, -1.0183396, 1.0233821, -1.7716651, 0.8381349, -0.2602...","[0.29574108123779297, -0.6669113039970398, 1.8421237468719482, -0.8171101808547974, -0.03116312436759472, 0.5472434163093567, 1.3864442110061646, 0.2401982843875885, -1.1582527160644531, -0.8754530549049377, -0.29139697551727295, -0.654901921749115, 1.9955341815948486, 1.038910984992981, 0.5099396109580994, -0.39537128806114197, 1.2102349996566772, 0.9051035642623901, -1.6559096574783325, -0.7892992496490479, -0.07963995635509491, -0.4499308466911316, 1.3783812522888184, 0.19470293819904327, 0.9577665328979492, -0.3657507598400116, 1.865944266319275, 0.13527582585811615, -1.2024025917053223, -1.4889848232269287, -0.1921800971031189, -0.18349787592887878, 0.2747112214565277, 0.09712644666433334, 0.07022549211978912, 0.21517802774906158, 0.1684763878583908, 0.23396117985248566, -0.245939..."
3,"September , taking Maine Mendozaâs surprise thanksgiving party threw fans !",POS,"[September, ,, taking, Maine, Mendozaâs, surprise, thanksgiving, party, threw, fans, !]","[[0.8757176, -0.22710256, 1.2977134, -0.92255926, 0.75159425, 0.2858534, 0.91067344, 0.9320895, -2.2788043, -0.8497662], [-0.18466692, -0.42737764, 0.80137444, -0.8757653, 1.1723027, -0.30942354, 1.8119118, 0.6217869, -0.9328711, -0.5869429], [0.21515174, -0.8582328, 1.0365127, 0.124528915, 1.6589627, -0.16205753, 1.475525, 0.16153476, -1.1815171, -1.3071073], [0.19786112, -1.0800291, 1.0254871, -0.15952763, 0.7868692, 0.49996844, 1.6747888, 0.177848, -1.2682748, -0.030566761], [-0.20693903, -0.1425525, 0.11037476, -0.050154068, 0.103537165, 0.0870878, 0.12766601, 0.006654159, -0.1803273, -0.18978906], [0.32825062, -1.0051061, 1.117627, -0.640304, 1.2921016, 0.090497196, 1.5131004, 0.15490903, -0.93895894, -1.4698979], [-0.10185009, -2.3384058, 1.4714514, -0.8288627, 1.2568208, 0.20921...","[0.8757175803184509, -0.22710256278514862, 1.2977133989334106, -0.9225592613220215, 0.7515942454338074, 0.28585338592529297, 0.9106734395027161, 0.9320895075798035, -2.278804302215576, -0.849766194820404, -0.18466691672801971, -0.4273776412010193, 0.8013744354248047, -0.875765323638916, 1.1723027229309082, -0.3094235360622406, 1.8119118213653564, 0.621786892414093, -0.9328711032867432, -0.5869429111480713, 0.21515174210071564, -0.8582327961921692, 1.0365127325057983, 0.1245289146900177, 1.6589627265930176, -0.1620575338602066, 1.4755250215530396, 0.16153475642204285, -1.1815171241760254, -1.3071073293685913, 0.19786112010478973, -1.0800291299819946, 1.0254870653152466, -0.15952762961387634, 0.7868692278862, 0.49996843934059143, 1.6747888326644897, 0.1778479963541031, -1.268274784088134..."
4,Incredible India Atulya Bharat - Land Seekers BeProud ð ð®ð³ : | : Plz RT,NEU,"[Incredible, India, Atulya, Bharat, -, Land, Seekers, BeProud, ð, ð®ð³, :, |, :, Plz, RT]","[[-0.007017922, -0.31040654, 0.21915065, -0.5432963, 1.1614581, 0.026545187, 1.6878189, -0.21792032, -1.8061135, -0.5831171], [-0.05280003, 0.01622403, 0.9405678, 0.5864342, 0.595899, -0.42388555, 1.307837, 0.9915634, -1.7346205, -0.9626937], [-0.23733006, -0.05695138, 0.0804559, -0.024779819, -0.0017034421, 0.11570151, 0.0906876, 0.18192142, -0.23461735, -0.14229101], [-0.43987876, -0.4290799, 1.1709914, 0.06119595, 0.3533217, 0.017075399, 0.9968376, 0.28389737, -1.1724489, -0.3156542], [0.05540823, 0.35789853, 0.40076473, -0.8607364, 1.1306843, -0.13502842, 1.118921, 0.730672, -1.9517145, -0.6257973], [0.093694694, 0.2231485, 0.21807526, -0.19467872, 0.9760077, 0.4575416, 1.7902254, 0.6537567, -1.9481393, -0.20493644], [-0.3506566, -0.1843744, 0.14115794, -0.2570525, 0.3150384, 0.237...","[-0.00701792212203145, -0.31040653586387634, 0.21915064752101898, -0.5432962775230408, 1.161458134651184, 0.026545187458395958, 1.6878188848495483, -0.21792031824588776, -1.8061134815216064, -0.5831171274185181, -0.05280002951622009, 0.016224030405282974, 0.9405677914619446, 0.5864341855049133, 0.5958989858627319, -0.4238855540752411, 1.3078370094299316, 0.9915633797645569, -1.734620451927185, -0.9626936912536621, -0.23733006417751312, -0.056951381266117096, 0.08045589923858643, -0.024779818952083588, -0.0017034420743584633, 0.11570151150226593, 0.09068760275840759, 0.18192142248153687, -0.23461735248565674, -0.14229100942611694, -0.43987876176834106, -0.42907989025115967, 1.1709914207458496, 0.06119595095515251, 0.35332170128822327, 0.017075398936867714, 0.9968376159667969, 0.28389737..."


In [19]:

save_path="../project_data/t4sa_data_w2v_sg_test.csv"
# print(dataset.shape)
small_dataset = dataset.head(50000)
np.savez_compressed(save_path.replace('.csv', '.npz'), data=small_dataset.to_numpy())

In [20]:
slim_dataset= dataset[[ 'class', 'embeddings_flatten']]


np.savez_compressed("../project_data/t4sa_data_w2v_sg_slim.npz", data=slim_dataset.to_numpy())

dpncc_datatset = dataset[[ 'class', 'embeddings']]
np.savez_compressed("../project_data/t4sa_data_w2v_sg_dpcnn.npz", data=dpncc_datatset.to_numpy())
 