In [21]:
import pandas as pd
import re
import emoji
import unicodedata

In [None]:
# Read the six `train-balanced-sarcasm` chunk files and stack them into a
# single frame with a fresh 0..n-1 index.
chunk_paths = [
    'raw_data/train-balanced-sarcasm_chunk1.csv',
    'raw_data/train-balanced-sarcasm_chunk2.csv',
    'raw_data/train-balanced-sarcasm_chunk3.csv',
    'raw_data/train-balanced-sarcasm_chunk4.csv',
    'raw_data/train-balanced-sarcasm_chunk5.csv',
    'raw_data/train-balanced-sarcasm_chunk6.csv',
]
chunk1, chunk2, chunk3, chunk4, chunk5, chunk6 = map(pd.read_csv, chunk_paths)

df = pd.concat(
    [chunk1, chunk2, chunk3, chunk4, chunk5, chunk6],
    ignore_index=True,
)


Keep only sarcastic comments (label 1) that are longer than 30 characters

In [23]:
# Keep rows labelled 1 whose comment is longer than 30 characters.
is_sarcastic = df['label'] == 1
is_long_comment = df['comment'].str.len() > 30
df = df[is_sarcastic & is_long_comment]

Keep only comments posted in gaming subreddits

In [24]:
# Subreddits treated as game-related.
# NOTE(review): `isin` compares values exactly, so the casing here must match
# the casing stored in the `subreddit` column (e.g. 'games' will not match
# 'Games') — confirm against the data.
game_subreddits = [
    'gaming',
    'pcgaming',
    'games',
    'leagueoflegends',
    'Overwatch',
    'GlobalOffensive',
    'FortNiteBR',
    'PS4',
    'xboxone',
    'wow',
    'nintendo',
    'Minecraft',
]

is_game_subreddit = df['subreddit'].isin(game_subreddits)
df_game = df[is_game_subreddit]

Select only the `comment` column

In [25]:
# Narrow the frame to the text column; the list keeps the result a DataFrame.
kept_columns = ['comment']
df_game = df_game[kept_columns]

Handle duplicates

In [26]:
# Count the rows that repeat an earlier row (the first occurrence is not counted).
df_game.duplicated(keep='first').sum()

np.int64(27)

In [27]:
# Keep the first occurrence of every row and preserve the original index labels.
df_game = df_game.drop_duplicates(keep='first', ignore_index=False)

Check for missing values

In [28]:
# Number of missing values in each column.
df_game.isnull().sum()

comment    0
dtype: int64

Normalize text

In [29]:
def normalize_text(text):
    """Normalize a raw comment into lowercase, ASCII-only text.

    Steps, in order: lowercase; drop a leading "early access review" marker;
    remove URLs, @mentions, #hashtags and digits; remove emojis; cap runs of
    one repeated character at three; turn every whitespace run into a single
    space; drop control / non-printable characters and double quotes; drop
    non-ASCII characters; collapse whitespace again; collapse consecutively
    repeated words; strip both ends.

    Args:
        text: The raw comment. Non-string values (e.g. ``None`` or the NaN
            pandas uses for a missing cell) are treated as empty.

    Returns:
        The normalized string, or ``""`` when the input is not a string or
        nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()  # lowercase

    # remove 'early access review' at the beginning
    # (the text is already lowercase, so no case-insensitive flag is needed)
    text = re.sub(r"^(early access review[\s:\-–—)]*)", "", text)

    # remove URLs, mentions, hashtags
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)

    # remove numbers
    text = re.sub(r"\d+", "", text)

    # remove emojis and icons
    text = emoji.replace_emoji(text, replace='')

    # Limit spam patterns to at most 3 consecutive identical characters
    # e.g., "goooood" → "goood", "aaaaawesome" → "aaawesome"
    text = re.sub(r'(.)\1{3,}', r'\1\1\1', text)

    # Turn every whitespace run into one plain space BEFORE filtering:
    # newlines and tabs are control characters and a no-break space is
    # non-printable, so filtering first would delete them and glue the
    # neighbouring words together.
    text = re.sub(r"\s+", " ", text)

    # remove non-printable or control characters + double quotes
    text = ''.join(
        c for c in text
        if unicodedata.category(c)[0] != 'C' and c.isprintable() and c != '"'
    )

    # keep only ASCII characters (English text only)
    text = text.encode("ascii", "ignore").decode()

    # The two removals above can leave gaps, so collapse whitespace again
    # before looking for repeated words.
    text = re.sub(r"\s+", " ", text)

    # remove duplicate consecutive words
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

    return text.strip()

# Apply to the DataFrame
df_game['comment'] = df_game['comment'].apply(normalize_text)

# Duplicates were dropped on the raw text, but normalization (lowercasing,
# stripping digits, URLs, ...) can make distinct comments identical or leave
# nothing at all, so drop blank comments and de-duplicate the cleaned text.
df_game = df_game[df_game['comment'] != ''].drop_duplicates()

In [30]:
# Display the normalized comments; pandas truncates the rendering of a long frame.
df_game

Unnamed: 0,comment
278,"no, galio stacks magic resistance but ryze dea..."
508,yea cause kobbe was so much better
948,"you know, if i were a pro, i'd practice hours ..."
985,you're probably not druid master race.
1090,"oh, well in that case... make fun of him all y..."
...,...
1010475,"yeah, but you had to spend all your money to u..."
1010525,mario gets sent to hell for being a godless he...
1010611,wow i bet that was a culture shock when he moved
1010723,that could be.. we all know that it's impossib...
