In [1]:
import pandas as pd
import time
import requests
import json
import os
from dotenv import load_dotenv
from googleapiclient import discovery
from sklearn.metrics import confusion_matrix

In [2]:
# Load the sample of comments from the Excel workbook and preview its first rows
df = pd.read_excel(io='manual_label_sample.xlsx')
df.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation,original text
0,972545,1057760,americans hold beer oh wait ive finished drinking,2022-01-24 05:25:27,MrFantasticallyNerdy,/r/SingaporeRaw/comments/sao76t/we_are_spendin...,t3_sao76t,t3_sao76t,htzg3uf,t5_xnx04,"{'controversiality': 0, 'collapsed_reason_code...",Americans: Hold my beer. Oh wait. I've already...
1,13347,14500,boss relies heavily pass holders start asking ...,2022-05-06 10:50:15,SimpleReadingSG90,/r/SingaporeRaw/comments/ujhtgu/rants_of_a_19y...,t3_ujhtgu,t1_i7j8gnf,i7jextx,t5_xnx04,"{'controversiality': 0, 'collapsed_reason_code...",And the boss who relies heavily on S Pass hold...
2,461901,502383,mean youre paid much wipe youre tears money ea...,2022-05-08 05:02:44,OldFee3150,/r/SingaporeRaw/comments/ujddhe/chan_chun_sing...,t3_ujddhe,t3_ujddhe,i7rioq7,t5_xnx04,"{'controversiality': 0, 'collapsed_reason_code...",I mean if you’re being paid that much should j...
3,1094356,1190080,lol idea hes singing singapore practice sang w...,2022-09-27 06:09:30,kopisiutaidaily,/r/SingaporeRaw/comments/xonblv/communist_taxi...,t3_xonblv,t3_xonblv,iq2n4xp,t5_xnx04,"{'controversiality': 0, 'collapsed_reason_code...",Lol he has no idea what he’s singing. If Singa...
4,1153986,1254859,would probably change sgkl railway line opens,2023-08-09 04:33:32,TaylorFritz,/r/SingaporeRaw/comments/15m2a52/why_do_singap...,t3_15m2a52,t1_jveg4qj,jveglrl,t5_xnx04,"{'banned_at_utc': None, 'mod_reason_by': None,...",That would probably change once the SG-KL rail...


In [3]:
#Rule-Based Labelling
# Lower-case terms and phrases that mark a comment as toxic when found in its text.
# Each entry appears exactly once (the earlier duplicates of 'cb' and 'idiot' were removed).
singlish_toxic_dict = [
    'ahbeng', 'ahlian', 'baka', 'bloody hell', 'bloody idiot', 'bodoh', 'bo liao',
    'buay pai seh', 'buay tahan', 'cb', 'cb kia', 'cb knn', 'cb lao jia', 'cb lao knn',
    'cb lor', 'cb sia', 'cb sia kia', 'ccb', 'chbye kia', 'chao chbye', 'chao chee bye', 'chow chibai',
    'chow kar', 'chow tu lan', 'cibai', 'dumb ass', 'dumb', 'fuck', 'fuck you', 'fking', 'fucker',
    'fucker sia', 'gila babi', 'gundu', 'hao lian kia', 'hopeless', 'idiot', 'ji bai', 'jiat lat',
    'jialat kia', 'jibai', 'joker', 'kan', 'kan ni na', 'kena sai', 'kia si lang', 'knn', 'knn cb kia', 'knnccb',
    'knnbccb', 'kns', 'kns cb', 'lampa', 'lan pa', 'lanjiao', 'lanjiao kia', 'lj', 'loser', 'nabei', 'no use kia',
    'noob', 'pok gai', 'pui', 'sabo kia', 'sibei jialat', 'sibei sian', 'si gina', 'siol', 'slut', 'siao lang', 'stupid',
    'suck', 'sua gu', 'tmd', 'tiok knn', 'tiok tiam', 'useless', 'what knn', 'what the fuck', 'wtf', 'wu liao kia', 'you die ah', 'you die',
]

#Function for checking text against toxic dictionary
def check_toxic(text, toxic_terms=None):
    """Flag a comment as toxic when it contains any term from the vocabulary.

    Matching is a plain, case-insensitive substring test, so a short term
    such as 'cb' or 'kan' also matches inside longer words.

    Parameters
    ----------
    text : str
        Comment text to scan. Non-string values (None, NaN from empty
        spreadsheet cells, numbers) are treated as "no match".
    toxic_terms : iterable of str, optional
        Lower-case terms to look for. Defaults to the module-level
        ``singlish_toxic_dict``.

    Returns
    -------
    str or None
        ``'toxic'`` if at least one term occurs in ``text``, otherwise ``None``.
    """
    # `term in text` raises TypeError for floats/None, so bail out early
    if not isinstance(text, str):
        return None

    if toxic_terms is None:
        toxic_terms = singlish_toxic_dict

    # The vocabulary is lower-case; normalise the text so casing cannot hide a match
    lowered = text.lower()
    if any(term in lowered for term in toxic_terms):
        return 'toxic'
    return None

# Run the dictionary check on every comment and keep the outcome in a 'result' column
rule_based_labels = df['text'].apply(check_toxic)
df['result'] = rule_based_labels

In [4]:
# Load variables from a local .env file (if any), then read the Perspective API key
load_dotenv()
API_KEY = os.getenv("PERSPECTIVE_API_KEY")

# Fail fast: os.getenv returns None for an unset variable, and the request
# code would otherwise run with an unusable key.
if not API_KEY:
    raise RuntimeError(
        "PERSPECTIVE_API_KEY is not set; define it in the environment or in a .env file."
    )

In [5]:
# API call for TOXICITY and IDENTITY_ATTACK scores
def get_toxicity(text, timeout=10):
    """Score one comment with the Perspective comment analyzer.

    Parameters
    ----------
    text : str
        Comment text to analyse.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).

    Returns
    -------
    tuple or None
        ``(toxicity_score, identity_attack_score)``, each read from the
        attribute's ``summaryScore`` value. ``None`` is returned when the
        text is missing or blank, the request fails, the API answers with a
        non-200 status, or the response lacks the expected fields.
    """
    # Missing or blank text (e.g. NaN from an empty spreadsheet cell) cannot be scored
    if not isinstance(text, str) or not text.strip():
        return None

    url = "https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze"

    # Data to send to the API
    payload = {
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {
            'TOXICITY': {},
            'IDENTITY_ATTACK': {}
        }
    }

    # Send request to API; a timeout keeps one stalled call from hanging the run
    try:
        response = requests.post(
            url,
            params={'key': API_KEY},
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can contain
        # the request URL, which includes the API key.
        print(f"Error: request failed ({type(exc).__name__})")
        return None

    # Check for success and return result
    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    try:
        scores = response.json()['attributeScores']
        toxicity_score = scores['TOXICITY']['summaryScore']['value']
        identity_attack_score = scores['IDENTITY_ATTACK']['summaryScore']['value']
    except (KeyError, TypeError, ValueError) as exc:
        print(f"Error: unexpected response format ({exc!r})")
        return None

    return toxicity_score, identity_attack_score
    
# To avoid hitting rate limits
# Scores already fetched in this session, keyed by comment text. Re-running
# this cell resets the cache.
_toxicity_cache = {}


def toxicity_delay(text, delay=1):
    """Fetch scores for ``text``, pausing before each real API call.

    Successful results are remembered per text, so scoring the same comment
    again (for example when labelling the same data with another threshold)
    neither sleeps nor calls the API a second time. Failed calls (``None``)
    are not cached and will be retried on the next request.

    Parameters
    ----------
    text : str
        Comment text to score.
    delay : float, optional
        Seconds to sleep before contacting the API (default 1).

    Returns
    -------
    Whatever ``get_toxicity(text)`` returns.
    """
    # Only strings are used as cache keys; anything else goes straight through
    cacheable = isinstance(text, str)
    if cacheable and text in _toxicity_cache:
        return _toxicity_cache[text]

    if delay > 0:
        time.sleep(delay)
    scores = get_toxicity(text)

    if cacheable and scores is not None:
        _toxicity_cache[text] = scores
    return scores

# Label toxicity
def label_toxicity(df, t):
    """Fill in missing 'result' labels using API scores and threshold ``t``.

    Rows whose 'result' is already set are left untouched. For every other
    row the 'text' is scored through ``toxicity_delay``; the row becomes
    'toxic' when either score is strictly greater than ``t`` and
    'non-toxic' otherwise. Rows for which no scores come back keep their
    missing label. The input frame is not modified; a labelled copy is
    returned.
    """
    labelled = df.copy()

    # Rows the rule-based pass left without a label
    pending = labelled['result'].isna()

    for idx, comment in labelled.loc[pending, 'text'].items():
        outcome = toxicity_delay(comment)
        if outcome is None:
            # No scores available: leave the label missing
            continue

        toxicity, identity_attack = outcome
        over_threshold = toxicity > t or identity_attack > t
        labelled.at[idx, 'result'] = 'toxic' if over_threshold else 'non-toxic'

    return labelled


In [6]:
# Score thresholds to compare
ticker = [0.2, 0.3, 0.4, 0.5]

# Map each threshold to the DataFrame labelled with it
df_results = dict(
    (threshold, label_toxicity(df, threshold))
    for threshold in ticker
)

In [7]:
# Write one workbook per threshold. The loop variable is deliberately not named
# `df`, so the DataFrame loaded at the top of the notebook is not overwritten
# and earlier cells can still be re-run on the raw sample.
for t, labelled_df in df_results.items():
    filename = f"label_toxicity_results_{t}.xlsx"
    labelled_df.to_excel(filename, index=False)
    print(f"Saved: {filename}")

Saved: label_toxicity_results_0.2_test.xlsx
Saved: label_toxicity_results_0.3_test.xlsx
Saved: label_toxicity_results_0.4_test.xlsx
Saved: label_toxicity_results_0.5_test.xlsx


In [8]:
# Ground truth: manual annotations, recoded from 0/1 to the strings used in 'result'
actual_labels = pd.read_excel('manual_label_sample_0_1.xlsx')
manual_names = {0: 'non-toxic', 1: 'toxic'}
actual_labels['manual'] = actual_labels['manual'].map(manual_names)

# One metrics record per threshold is collected here
results = []

for t in ticker:
    # Predictions produced with the current threshold
    model_labels = df_results[t]

    y_true = actual_labels['manual']
    y_pred = model_labels['result']

    # With labels ordered ['non-toxic', 'toxic'] the flattened matrix unpacks as TN, FP, FN, TP
    matrix = confusion_matrix(y_true, y_pred, labels=['non-toxic', 'toxic'])
    tn, fp, fn, tp = matrix.ravel()

    # Recall = TP / (TP + FN), reported as 0 when the denominator is 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0

    # Precision = TP / (TP + FP), reported as 0 when the denominator is 0
    if (tp + fp) > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0

    results.append(dict(
        threshold=t,
        TP=tp,
        FP=fp,
        FN=fn,
        TN=tn,
        precision=precision,
        recall=recall,
    ))


In [9]:
# Build a table with one row per threshold from the collected metric records
results_df = pd.DataFrame(data=results)

# Print the table as plain text
print(results_df)

   threshold  TP  FP  FN   TN  precision  recall
0        0.2  29  29  11  131   0.500000   0.725
1        0.3  24  15  16  145   0.615385   0.600
2        0.4  16   9  24  151   0.640000   0.400
3        0.5  11   5  29  155   0.687500   0.275
