In [None]:
import pandas as pd
import re
import emoji
import contractions
from tqdm.auto import tqdm
tqdm.pandas()
pd.set_option('display.max_colwidth', None)

In [None]:
# dataset downloaded from kaggle: https://www.kaggle.com/datasets/saurabhshahane/cyberbullying-dataset
df = pd.read_csv("path/Dataset/twitter_parsed_dataset.csv")

## Preprocessing

In [None]:
# Chat words List From https://www.kaggle.com/code/sndpkirwai/nlp-basic-text-preprocessing-steps/notebook + I added some more
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My ASS Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The ASS
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My Ass Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The Fuck
WTG=Way To Go
WUF=Where Are You From
W8=Wait
7K=Sick:-D Laugher
"""

chat_words_str += """
AIGHT=Alright
AYT=Alright
BFF=Best Friends Forever
BRO=Brother
COZ=Because
CUZ=Because
DM=Direct Message
DEETS=Details
EM=Them
EMO=Emotional
FML=Fuck My Life
FTW=For The Win
FTL=For The Loss
GRATS=Congratulations
GR8=Great
GTG=Got To Go
G2G=Got To Go
HBU=How About You
HMU=Hit Me Up
IDC=I Do Not Care
IDK=I Do Not Know
ILY=I Love You
IMU=I Miss You
JS=Just Saying
JK=Just Kidding
K=Okay
KK=Okay
L8=Late
LUV=Love
MSG=Message
MYOB=Mind Your Own Business
NBD=No Big Deal
NGL=Not Gonna Lie
NP=No Problem
NSFW=Not Safe For Work
NVM=Never Mind
OBV=Obviously
OMG=Oh My God
OMFG=Oh My Fucking God
PLS=Please
PLZ=Please
PPL=People
RLY=Really
RN=Right Now
RU=Are You
SMH=Shaking My Head
SRSLY=Seriously
SUP=What's up
TBH=To Be Honest
TGIF=Thank God It's Friday
TMI=Too Much Information
TTYS=Talk To You Soon
TY=Thank You
TYT=Take Your Time
VC=Voice Chat
WBU=What About You
WDYM=What Do You Mean
WTH=What The Hell
XOXO=Hugs and Kisses
YA=You
YO=Hey
YOLO=You Only Live Once
YUP=Yes
ZOMG=Oh My God
IDGAF=I Do Not Give A Fuck
"""

In [None]:
# Function to convert chat words from the same notebook
chat_words_map_dict = {}
chat_words_list = []

for line in chat_words_str.split("\n"):
    if line.strip() != "":
        cw = line.split("=")[0].lower()
        cw_expanded = line.split("=")[1].lower()
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded

chat_words_list = set(chat_words_list)
def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        match = re.match(r"([a-zA-Z]+)(\W*)", w)  # ex: "brb," -> ("brb", ",")
        if match:
            word, punctuation = match.groups()
            lw = word.lower()
            if lw in chat_words_list:
                new_text.append(chat_words_map_dict[lw] + punctuation)
            else:
                new_text.append(w)
        else:
            new_text.append(w)
    return " ".join(new_text)
chat_words_conversion("one minute brb,")

'one minute be right back,'

In [None]:
# function to remove mentions at the beginning and end
def remove_boundary_users(text):
    words = text.split()
    # Remove <user> at the beginning
    if words and words[0] == "<user>":
        words = words[1:]
    # Remove <user> at the end
    if words and words[-1] == "<user>":
        words = words[:-1]
    return " ".join(words)


In [None]:
def prepro(text: str) -> str:
    """
    Clean and normalize raw social media text.

    Steps:
     1. HTML artifacts replacement
     2. Remove URLs, emojis, trailing hashtags
     3. Mask mentions and inline hashtags
     4. Handle punctuation and RT tokens
     5. Normalize user tokens and repeated words
     6. Anonymize numbers and specific names
     7. Remove unwanted tokens and punctuation repeats
     8. Linguistic corrections: expand contractions, convert chat words, fix spelling
     9. Final cleanup: lowercase, boundary users, whitespace normalization
    """
    # 1. HTML artifacts replacement
    text = text.replace("&amp", "and")

    # 2. Fundamental removals: URLs, emojis, trailing hashtags
    text = re.sub(r"http\S+", "", text)
    text = emoji.replace_emoji(text, "")
    text = re.sub(r"(\s+#\S+)+\s*$", "", text)

    # 3. Mask mentions and remove '#' from inline hashtags
    text = re.sub(r"@\w+", "<user>", text)
    text = re.sub(r"#", "", text)

    # 4. Keep strong punctuation only and remove RT tokens
    text = re.sub(r"[^\w\s<>.!?]", "", text)
    text = re.sub(r"^RT\s+", "", text)
    text = re.sub(r"\s+(RT|RTvid)\s*$", "", text)
    text = re.sub(r"\bRT\b", "", text, flags=re.IGNORECASE)

    # 5. Normalize repeated <user> tokens and repeated words
    text = re.sub(r'(<user>\s*)+', '<user> ', text)
    text = re.sub(r"\b(\S+)( \1\b)+", r"\1 ", text)

    # 6. Anonymize numbers and specific names
    text = re.sub(r'(?<=[a-zA-Z])\d+(?=\b)', '', text)
    text = re.sub(r'\d+(?:\.\d+)?(?:st|nd|rd|th|s|k|m|to|x|xs|cm|in|id)?', '<num>', text)
    text = re.sub(r"\b(max|maxs|valentis|drasko|drasco|druitts|katandandre|katjia|khybar|kobane|oktars|tomz|nickis|nikkis|kat|katie|nikki|colin|emma|lyn|lynn|andre)\b", '<name>', text, flags=re.IGNORECASE)

    # 7. Remove unnecessary tokens and punctuation repetitions
    text = re.sub(r"\bmkr\b", "", text)
    text = re.sub(r'([!?.])(?:\s*([!?.]\s*))+', r' \1 ', text)
    text = re.sub(r"_+", " ", text)

    # 8. Linguistic corrections
    text = contractions.fix(text)          # Expand contractions (e.g., "don't" -> "do not")
    text = chat_words_conversion(text)     # Convert chat abbreviations (e.g., "u" -> "you")

    # 9. Final cleanup: lowercase, strip boundary <user>, normalize spaces
    text = text.lower()
    text = remove_boundary_users(text)
    text = re.sub(r"\s+", " ", text)

    return text.strip()

# Example usage
sample_text = "you are a #loser @user ! see you tomorrow with 10 euros !!! and .. jean =you owe me 10k https://example.com 😊 #fun, #lol"
print(prepro(sample_text))

you are a loser <user> ! see you tomorrow with <num> euros ! and . jean you owe me <num>


In [729]:
# Fill NaN values in the 'Text' column with an empty string
df["Text"] = df["Text"].fillna("")

# Apply the preprocessing function
df["Text"] = df["Text"].progress_apply(prepro)

100%|██████████| 16851/16851 [00:01<00:00, 11762.88it/s]


In [730]:
# Glued tokens
def unpack_glued_tokens(text):
    glued_token_mapping = {
    "adviceforyoungfeminists": "advice for young feminists",
    "allfemale": "all female",
    "cuntandandre": "cunt and andre",
    "dudebros": "dude bros",
    "femfreefriday": "feminist free friday",
    "feministsareugly": "feminists are ugly",
    "femshep": "feminist shep",
    "hatefilled": "hate filled",
    "heforshe": "he for she",
    "ideaology": "ideology",
    "islamofascist": "islamo fascist",
    "islamofascists": "islamo fascists",
    "islamolunatic": "islamo lunatic",
    "islamolunatics": "islamo lunatics",
    "killerblondes": "killer blondes",
    "likeagirl": "like a girl",
    "murderbigotry": "murder bigotry",
    "nonmuslims": "non muslims",
    "notsexist": "not sexist",
    "questionsformen": "questions for men",
    "redscarebot": "red scare bot",
    "selfawareness": "self awareness",
    "sorryitsaboy": "sorry it is a boy",
    "womenagainstfeminism": "women against feminism",
    "yesallwomen": "yes all women",
    "idontneedfeminism": "i do not need feminism",
    "amirite": "am i right",
    "everydaysexism": "every day sexism",
    "yearolds": "year old",
    "yearold": "year old",
    "yrold": "year old",
    "allmale": "all male",
    "feminazi": "feminist nazi",
    "gangraped": "gang raped",
    "nonmuslim": "non muslim",
    "nosexist": "not sexist",
    "promogirls": "promote girls",
    "tweetlikeafeminist": "tweet like a feminist",
    "twitterfeminism": "twitter feminism",
    'letstalkmen': 'lets talk men'
    }
    pattern = re.compile(r'\b(' + '|'.join(glued_token_mapping.keys()) + r')\b')
    return pattern.sub(lambda m: glued_token_mapping[m.group(0)], text)

df["Text"] = df["Text"].progress_apply(unpack_glued_tokens)


100%|██████████| 16851/16851 [00:00<00:00, 90926.09it/s]


In [None]:
def correct_orthographic_errors(text):
    orthographic_mapping = {
        'allahs': 'allah',
        'anamists': 'animist',
        'apostacy': 'apostasy',
        'biatch': 'bitch',
        'biggots': 'bigot',
        'blumenthals': 'blumenthal',
        'christiandom': 'christian',
        'colonialized': 'colonized',
        'dck': 'dick',
        'deash': 'daesh',
        'douch': 'douche',
        'femal': 'female',
        'gende': 'gender',
        'hamaz': 'hamas',
        'islams': 'islam',
        'kuffir': 'kafir',
        'kunt': 'cunt',
        'mohammeds': 'mohammed',
        'pedophelia': 'pedophilia',
        'probs': 'probably',
        'punnished': 'punished',
        'sammich': 'sandwich',
        'sexists': 'sexist',
        'shutup': 'shut up',
        'taquiyya': 'taqiyya',
        'tradie': 'worker',
        'wheras': 'whereas',
        'spatchcock': 'chicken',
        'wayyy': 'way',
        'faaark': 'fuck',
        'ablazing': 'blazing',
        'aggres': 'aggressive',
        'balistic': 'ballistic',
        'blaspemy': 'blasphemy',
        'budhists': 'buddhists',
        'burrying': 'burying',
        'carnt': "can not",
        'cmon': 'come on', 
        'digusting': 'disgusting',
        'excitin': 'exciting',
        'fkn': 'fucking',
        'genuinly': 'genuinely',
        'judism': 'judaism',
        'litteraly': 'literally',
        'mohommed': 'mohammed',
        'palistine': 'palestine',
        'peacful': 'peaceful',
        'percieved': 'perceived',
        'wemon': 'women',
        'judism': 'judaism',
        'annoyi': 'annoying',
        'aparthide': 'apartheid',
        'apharthide': 'apartheid',
        'areseholes': 'asshole',
        'arseholes': 'asshole',
        'argmnt': 'argument',
        'womem': 'women',
        'isil': 'isis',
        'islamophobe': 'islam hate',
        'retweet': 'tweet'
    }
    
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in orthographic_mapping.keys()) + r')\b')
    
    # replace function to handle case insensitivity
    def replace_match(match):
        matched_word = match.group(0)
        lower_word = matched_word.lower()
        replacement = orthographic_mapping[lower_word]
        return replacement
    
    return pattern.sub(replace_match, text)

df["Text"] = df["Text"].progress_apply(correct_orthographic_errors)

100%|██████████| 16851/16851 [00:01<00:00, 15432.83it/s]


In [None]:
def correct_final_mapping(text):
    final_token_mapping = {
        # Expressive/emotional tokens
        'arghhhhh': 'angry',
        'gaaaaah': 'fuck', 
        'grrrrrr': 'grr',
        'hahaha': 'haha',
        'hahahaha': 'haha',
        'lololol': 'lol',
        'roflmao': 'lol',
        'ehhhh': 'disgusting',
        
        # Gender-related terms
        'bimbolines': 'bimbo',
        'fems': 'feminists',
        'misandrist': 'man hater',
        'pussies': 'cowards',
        'radfems': 'feminists',
        'sjws': 'social justice activists',
        
        # Insults/profanity
        'dipwad': 'idiot',
        'douchebag': 'jerk', 
        'dumbass': 'fool',
        'horseshit': 'nonsense',
        'microbrain': 'idiot',
        'skanks': 'bitch',
        
        # Political/sensitive terms
        'daesh': 'isis',
        'daeshbag': 'terrorist',
        'daeshbags': 'terrorists',
        'gamergate': 'gamer controversy',
        'naziphobia': 'fear of nazi',
        'tcot': 'top conservatives',
        'mras': 'mens rights activists',
        
        # Ethnic terms
        'ezidi': 'yazidi',
        'ezidis': 'yazidis',
        
        # Miscellaneous
        'deadset': 'determined',
        'krazyeyes': 'crazy',
        'mannnn': 'man',
        'nomorepage': 'feminist controversy',
        'selfie': 'self picture',
        'spatchcock': 'chicken',
        'unfollow': 'stop following'
    }
    
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in final_token_mapping.keys()) + r')\b')
    
    # Fonction de remplacement qui préserve la casse
    def replace_match(match):
        matched_word = match.group(0)
        lower_word = matched_word.lower()
        replacement = final_token_mapping[lower_word]
        return replacement
    
    return pattern.sub(replace_match, text)

df["Text"] = df["Text"].progress_apply(correct_final_mapping)

100%|██████████| 16851/16851 [00:00<00:00, 27018.85it/s]


In [None]:
# keep only sentences with more than 4 words without counting <user> token
df = df[df["Text"].apply(lambda x: len([word for word in x.split() if word != "<user>"]) >= 4)]

In [734]:
df_cleaned = df.drop(['id', 'index', 'oh_label'], axis=1)
df_cleaned = df_cleaned.dropna(subset=['Annotation'])
df_cleaned = df_cleaned.rename(columns={'Annotation': 'label', 'Text': 'text'})

#reset index
df_cleaned.reset_index(drop=True, inplace=True)

In [735]:
df_cleaned['label'].value_counts()

label
none      10387
sexism     3258
racism     1965
Name: count, dtype: int64

In [None]:
# This dataset will be used for the training of our LaTextGAN model.

df_cleaned.to_pickle("path/Dataset/New_Preprocessed_Dataset_GAN.pkl")

## SBERT Embedding for cyberbullying detection

In [None]:
df_cleaned = pd.read_pickle("path/Dataset/New_Preprocessed_Dataset_GAN.pkl")

In [None]:
# completely remove punctuation and keep one space between words
def remove_punctuation(text):
    # Remove punctuation and replace with space
    text = re.sub(r'[^\w\s<>]', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
# usage example
text = "<user> tell you in no.context, lol !"
print(remove_punctuation(text)) 

<user> tell you in no context lol


In [748]:
df_cleaned['text'] = df_cleaned['text'].apply(remove_punctuation)

In [None]:
# import sbert
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

In [750]:
df_sbert = df_cleaned.copy()
df_sbert['embedding'] = df_sbert['text'].apply(lambda x: model.encode(x))

In [None]:
# this dataset will be used for the classification task

df_sbert.to_pickle("path/Dataset/New_Preprocessed_Dataset.pkl")