In [110]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [111]:
pd.set_option('max_colwidth', 800)

TWEETS_PATH= "../data/raw_tweets_text.csv"
SENTIMENT_PATH="../data/t4sa_text_sentiment.tsv"

#load data
tweets_df = pd.read_csv(TWEETS_PATH, encoding='latin-1', header=0)
sentiment_df= pd.read_csv(SENTIMENT_PATH, sep='\t', header=0)

# Basic Information

In [112]:
print(tweets_df.shape)
print(sentiment_df.shape)

(3452663, 2)
(1179957, 4)


In [113]:
print(tweets_df.head())
print(sentiment_df.head())

                   id                                                                                                                                            text
0  758014713804587008  RT @polarcomic: And surprise! the #RegularShow #38 has a short story written and illustrated (and even lettered) by me. https://t.co/kCctJpâ¦
1  758014717990428672                                             RT @SweetBabyBellB: My unproblematic fav who knows Bellarke is fucking real https://t.co/A9RK5b0Hfm
2  758014646716665857                                                                    RT @WhyLarryIsReal: I mean we know harry isn't human https://t.co/fW2TEwSHEq
3  758014655071526912          RT @Eastbay: She's ready, resilient, and on our latest cover. Snag a copy to find out more about @crysdunn_19. https://t.co/j4JwiEgCmd
4  758014642526429184                             RT @SheeeRatchet: find someone who loves you as much as Pikachu loves his bottle of ketchup https://t.co/pbIoDo9bfy
    

In [114]:
print(tweets_df.describe())
print(sentiment_df.describe())

                 id
count  3.452663e+06
mean   7.865426e+17
std    1.381574e+16
min    7.580146e+17
25%    7.695707e+17
50%    7.862700e+17
75%    7.993751e+17
max    8.046194e+17
               TWID           NEG           NEU           POS
count  1.179957e+06  1.179957e+06  1.179957e+06  1.179957e+06
mean   7.860716e+17  1.214643e-01  5.272504e-01  3.512854e-01
std    1.386547e+16  2.489799e-01  3.953345e-01  3.879292e-01
min    7.680969e+17  2.930239e-14  2.250815e-03  2.441870e-14
25%    7.692905e+17  1.118029e-02  8.237851e-02  7.404817e-02
50%    7.839379e+17  1.924086e-02  8.548171e-01  1.057651e-01
75%    7.996407e+17  3.546559e-02  8.904971e-01  8.860867e-01
max    8.046194e+17  9.939882e-01  1.000000e+00  9.965788e-01


# Cleaning the Data

In [115]:


print("Duplicate amounts in tweets_df:")
print(tweets_df['id'].duplicated().sum()) # There are no duplicates in either dataset

print("Duplicate amounts in sentiment_df:")
print(sentiment_df.duplicated().sum())

# Check for missing values in tweets_df
print("Missing values in tweets_df:")
print(tweets_df.isna().sum())

# Check for missing values in sentiment_df
print("\nMissing values in sentiment_df:")
print(sentiment_df.isna().sum())



Duplicate amounts in tweets_df:
0
Duplicate amounts in sentiment_df:
0
Missing values in tweets_df:
id      0
text    0
dtype: int64

Missing values in sentiment_df:
TWID    0
NEG     0
NEU     0
POS     0
dtype: int64


In [116]:
merged_df = pd.merge(tweets_df, sentiment_df, left_on='id', right_on='TWID')
merged_df = merged_df.drop(columns=['TWID']) # since its alr in id

main_df = merged_df.copy()

In [None]:
# Useful functions to help extract data from the columns

def extract_username_from_text(text):     # Extracts the username from retweets (e.g., "RT @user:").
    match = re.search(r'^RT @([^\s:]+):', text)
    return match.group(1) if match else None
     
def extract_links_from_text(text): # Extracts URLs from tweet text.
    urls = re.findall(r'https?://\S+', text)
    if not urls:
        return None
    return urls[0] if len(urls) == 1 else urls
           
def extract_hashtags_from_text(text): # Extract hashtags from the tweet text
    hashtags = re.findall(r'#\w+', text)
    if not hashtags:
        return None
    return hashtags[0] if len(hashtags) == 1 else hashtags
    
def extract_mentions_from_text(text): # Extract mentions from the tweet text
    cleaned_text = re.sub(r'^RT @[^\s:]+: ', '', text) # Remove the initial retweet username (e.g., "RT @user:")
    mentions = re.findall(r'@\w+', cleaned_text)
    if not mentions:
        return None
    return mentions[0] if len(mentions) == 1 else mentions



def clean_tweet_text(text: str):
    """
    Cleans the tweet text for EDA by removing noise such as:
    - Retweet prefixes (RT @user:)
    - URLs
    - HTML entities (e.g., &amp;)
    - Extra whitespace
    """
    # Remove retweet header
    text = re.sub(r'^RT @[^\s:]+: ', '', text)
    
    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)
    
    # Remove HTML entities like &amp;
    text = re.sub(r'&\w+;', '', text)
    
    # Remove extra spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

['@djsunjunkie', '@cece_peniston', '@daiyonmusic']


In [None]:
# Start adding extra columns that might help us with visualizations
main_df['is_retweet'] = main_df['text'].str.startswith('RT ')
main_df['username'] = main_df['text'].apply(extract_username_from_text)
main_df['urls'] = main_df['text'].apply(extract_links_from_text)
main_df['cleaned_text'] = main_df['text'].apply(clean_tweet_text)
main_df['hashtags'] = main_df['text'].apply(extract_hashtags_from_text)
main_df['mentions'] = main_df['text'].apply(extract_mentions_from_text)

main_df


Unnamed: 0,id,text,NEG,NEU,POS,is_retweet,username,urls,hashtags,mentions
0,768097627686604801,Josh Jenkins is looking forward to TAB Breeders Crown Super Sunday https://t.co/antImqAo4Y https://t.co/ejnA78Sks0,0.008090,0.042331,0.949579,False,,"[https://t.co/antImqAo4Y, https://t.co/ejnA78Sks0]",,
1,768097631864102912,RT @2pmthailfans: [Pic] Nichkhun from krjeong86's IG https://t.co/5gcAcu9by7,0.014644,0.926557,0.058800,True,2pmthailfans,https://t.co/5gcAcu9by7,,
2,768097640278089729,RT @MianUsmanJaved: Congratulations Pakistan on becoming #No1TestTeam in the world against all odds! #JI_PakZindabadRallies https://t.co/1oâ¦,0.004939,0.029469,0.965591,True,MianUsmanJaved,https://t.co/1oâ¦,"[#No1TestTeam, #JI_PakZindabadRallies]",
3,768097627695042560,"RT @PEPalerts: This September, @YESmag is taking you to Maine Mendozaâs surprise thanksgiving party she threw for her fans! https://t.co/oXâ¦",0.006389,0.018663,0.974948,True,PEPalerts,https://t.co/oXâ¦,,@YESmag
4,768096868504969216,#Incredible #India #Atulya #Bharat - Land of Seekers #BeProud ð ð®ð³ :|: Plz RT https://t.co/vpghReZWsa,0.049398,0.861395,0.089207,False,,https://t.co/vpghReZWsa,"[#Incredible, #India, #Atulya, #Bharat, #BeProud]",
...,...,...,...,...,...,...,...,...,...,...
1179952,804618351179874304,@LizHudston @KymWyllie @Evasmiless @meanBok @linddyloo66 @Minna1971 morning girls have a wonderful #Friday https://t.co/unkV2p7JYF,0.006605,0.024448,0.968946,False,,https://t.co/unkV2p7JYF,#Friday,"[@LizHudston, @KymWyllie, @Evasmiless, @meanBok, @linddyloo66, @Minna1971]"
1179953,804618548312031232,#RT #Follow Colin Kaepernick debated merits of Castro'sâ¦ - The Mercury News https://t.co/XtrtILIfBl https://t.co/lvUwwdsD4b,0.028834,0.857713,0.113453,False,,"[https://t.co/XtrtILIfBl, https://t.co/lvUwwdsD4b]","[#RT, #Follow]",
1179954,804618716084391936,I am now live on webcam find me here &gt;&gt; https://t.co/yg0pJss4MK download our app &gt;&gt; here https://t.co/QMXtTx4Gcr https://t.co/qxJFN7ZO5U,0.018679,0.927865,0.053456,False,,"[https://t.co/yg0pJss4MK, https://t.co/QMXtTx4Gcr, https://t.co/qxJFN7ZO5U]",,
1179955,804618934158757889,Pearl Roadshow 4-piece Complete Drum Set with Cymb https://t.co/gQ2TdFKnma https://t.co/PL2FXvWRo9,0.019658,0.907034,0.073308,False,,"[https://t.co/gQ2TdFKnma, https://t.co/PL2FXvWRo9]",,
