In [6]:
%pip install emoji

Collecting emoji
  Using cached emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Using cached emoji-2.14.1-py3-none-any.whl (590 kB)
Installing collected packages: emoji
Successfully installed emoji-2.14.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import re
import emoji
import unicodedata

In [2]:
# The balanced sarcasm dataset is stored as six CSV shards; read them in order
# and stack them into a single frame with a fresh 0..n-1 index.
# Building the paths in a loop avoids six copy-pasted read calls and does not
# keep the individual shard frames alive in the kernel after concatenation.
chunk_paths = [
    f'raw_data/train-balanced-sarcasm_chunk{i}.csv' for i in range(1, 7)
]

df = pd.concat((pd.read_csv(path) for path in chunk_paths), ignore_index=True)


Keep only sarcastic comments (`label == 1`) that are longer than 30 characters

In [3]:
# Keep rows that are labelled sarcastic (label == 1) and whose comment text is
# longer than 30 characters; both conditions are applied in a single mask.
is_sarcastic = df['label'] == 1
is_long = df['comment'].str.len() > 30
df = df[is_sarcastic & is_long]

Keep only comments from gaming subreddits

In [4]:
# Subreddits treated as video-game related; matching is exact (case-sensitive).
game_subreddits = [
    'gaming',
    'pcgaming',
    'games',
    'leagueoflegends',
    'Overwatch',
    'GlobalOffensive',
    'FortNiteBR',
    'PS4',
    'xboxone',
    'wow',
    'nintendo',
    'Minecraft',
]

# Rows whose subreddit appears in the list above.
in_game_subreddit = df['subreddit'].isin(game_subreddits)
df_game = df[in_game_subreddit]

Select only the `comment` column

In [5]:
# Reduce the frame to a single-column DataFrame holding the comment text.
df_game = df_game.loc[:, ['comment']]

Handle duplicates

In [6]:
# Count rows that exactly repeat an earlier row of df_game.
df_game.duplicated().sum()

27

In [7]:
# Keep the first occurrence of every row and discard later exact repeats.
df_game = df_game[~df_game.duplicated()]

Check for missing values

In [8]:
# Count missing values in each column of df_game.
df_game.isna().sum()

comment    0
dtype: int64

Normalize text

In [9]:
def normalize_text(text):
    """Clean one comment into lowercase, ASCII-only, single-spaced text.

    Steps, in order:
      1. lowercase and drop a leading "early access review" marker;
      2. remove URLs, @mentions and #hashtags;
      3. remove emojis;
      4. cap any run of one repeated character at three;
      5. turn every whitespace run (including newlines and tabs) into a single
         space, so that removing control characters in the next step cannot
         glue neighbouring words together;
      6. drop control / non-printable characters and double quotes;
      7. drop non-ASCII characters;
      8. collapse whitespace again, because steps 6-7 can leave gaps;
      9. collapse consecutively repeated words and trim the ends.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example a
            NaN produced by pandas for a missing value) yields ``""``.

    Returns:
        The normalized string; it may be empty.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # remove 'early access review' at the beginning
    text = re.sub(r"^(early access review[\s:\-–—)]*)", "", text, flags=re.IGNORECASE)

    # remove URLs, mentions, hashtags
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)

    # remove emojis and icons
    text = emoji.replace_emoji(text, replace='')

    # Cap runs of an identical character at three,
    # e.g. "goooood" -> "goood", "aaaaawesome" -> "aaawesome"
    text = re.sub(r'(.)\1{3,}', r'\1\1\1', text)

    # Convert newlines/tabs/other whitespace to plain spaces BEFORE stripping
    # control characters; otherwise "line one\nline two" loses its separator.
    text = re.sub(r"\s+", " ", text)

    # remove non-printable or control characters + double quotes
    text = ''.join(
        c for c in text
        if unicodedata.category(c)[0] != 'C' and c.isprintable() and c != '"'
    )

    # keep only ASCII characters (English text only)
    text = text.encode("ascii", "ignore").decode()

    # The removals above can leave several spaces in a row; collapse them so
    # the duplicate-word pattern below (which expects single spaces) works.
    text = re.sub(r"\s+", " ", text)

    # remove duplicate consecutive words
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

    return text.strip()

# Run the normalizer over every comment and store the cleaned text back in place.
df_game['comment'] = df_game['comment'].map(normalize_text)

In [10]:
# Persist the cleaned comments; the frame's index is written as the first
# (unnamed) column of the CSV.
df_game.to_csv('sarcasm_raw.csv')

# A bare expression is only rendered when it is the last line of a cell, so the
# preview goes here, limited to the first rows instead of the whole frame.
df_game.head()

In [1]:
from openai import OpenAI
import os
import pandas as pd
import numpy as np
import json
from dotenv import load_dotenv
import csv


Label comments

In [2]:
load_dotenv()

RAW_PATH = "sarcasm_raw.csv"
LABELED_PATH = "sarcasm_labeled.csv"


def parse_sentiment_label(reply):
    """Convert the model's reply into the integer label 0 or 1.

    The prompt shows the expected answers inside double quotes, so a reply such
    as '"1"' is accepted in addition to a bare '1'.

    Raises:
        ValueError: if the reply is missing, is not an integer, or is an
            integer other than 0 or 1.
    """
    if reply is None:
        raise ValueError("model returned no content")
    label = int(reply.strip().strip('"\''))
    if label not in (0, 1):
        raise ValueError(f"label out of range: {label}")
    return label


# Read comments as strings so the text sent to the model (and used as the
# resume key below) is exactly what is stored in the file.
df = pd.read_csv(RAW_PATH, dtype={"comment": str})

# Resume support: remember which comments already have a label. Keying on the
# comment text (rather than on "number of labeled rows") stays correct even
# when earlier rows were skipped because their reply could not be parsed.
if os.path.exists(LABELED_PATH) and os.path.getsize(LABELED_PATH) > 0:
    labeled = pd.read_csv(LABELED_PATH, dtype={"comment": str})
    labeled_comments = set(labeled["comment"].dropna())
else:
    labeled_comments = set()
    with open(LABELED_PATH, "w", newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['comment', 'user_suggestion'])

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv("api_key"))

# Label every comment that does not have a label yet, one request per comment.
# Each result is appended and flushed immediately so an interrupted run loses
# at most the row in flight.
with open(LABELED_PATH, "a", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for idx, row in df.iterrows():
        comment = row["comment"]

        # Nothing to label for an empty comment; already-labeled text is skipped.
        if pd.isna(comment) or comment in labeled_comments:
            continue

        prompt = (
            'Your role is do sentiment analysis on sarcastic comments from social medias on topic relating to video games.\n'

            'If the comment is positive, return the number "1"\n'

            'If the comment is negative, return the number "0"\n'

            'Return only the sentiment result and nothing else.\n'

            'Here is the comment:\n'

            f'{comment}'
        )

        # API/network errors are deliberately not caught: they stop the cell,
        # and re-running it resumes from the comments that are still unlabeled.
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}]
        )
        raw_reply = completion.choices[0].message.content

        # Only an unparseable reply is skipped; the row will be retried on the
        # next run because it never enters labeled_comments.
        try:
            response_data = parse_sentiment_label(raw_reply)
        except ValueError:
            print(f"{idx} Error. Skipping. Unparseable reply: {raw_reply!r}")
            continue

        print(idx, response_data)
        writer.writerow([comment, response_data])
        file.flush()
        labeled_comments.add(comment)

print("Processing complete! Only new rows were added.")


Error. Skipping.
19954 0
19955 0
19956 0
19957 0
19958 0
19959 0
19960 0
19961 0
19962 0
19963 0
19964 0
19965 0
19966 0
19967 0
19968 0
19969 0
19970 0
19971 0
19972 0
19973 0
19974 0
19975 0
19976 0
19977 0
19978 0
19979 0
19980 0
19981 0
19982 0
19983 0
19984 0
19985 0
19986 0
19987 0
19988 0
19989 1
19990 0
19991 0
19992 0
19993 0
19994 0
19995 0
19996 0
19997 1
19998 0
19999 0
20000 0
20001 0
20002 0
20003 0
20004 0
20005 0
20006 0
20007 0
20008 0
20009 1
20010 1
20011 0
20012 0
20013 0
20014 0
20015 0
20016 0
20017 0
20018 0
20019 0
20020 0
20021 1
20022 0
20023 0
20024 0
20025 0
20026 0
20027 0
20028 0
20029 0
20030 0
20031 0
20032 0
20033 0
20034 0
20035 0
20036 0
20037 0
20038 0
20039 0
20040 0
20041 0
20042 0
20043 0
20044 0
20045 0
20046 0
20047 0
20048 0
20049 0
20050 0
20051 0
20052 0
20053 0
20054 0
20055 0
20056 0
20057 0
20058 0
20059 0
20060 0
20061 0
20062 0
20063 0
20064 0
20065 0
20066 0
20067 0
20068 0
20069 1
20070 0
20071 0
20072 0
20073 0
20074 0
20075 1
20076 0

In [4]:
# Reload the labeled comments, keep those labeled positive (1),
# and report how many there are.
df = pd.read_csv("sarcasm_labeled.csv").query("user_suggestion == 1")
print(len(df))

994
