In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import math
from google.colab import drive

In [None]:
# Hugging Face Hub identifier of the single checkpoint used below for both the
# tokenizer and the text-classification pipeline.
model_name = 'unitary/toxic-bert'

In [None]:
# Function to split text into near-equal token chunks and average the per-chunk summed toxicity scores
def classify_toxicity_by_dynamic_chunks(text, tokenizer, classifier, max_length=512):
    """Score ``text`` for toxicity, chunking it when it exceeds the model limit.

    The text is tokenized once, split into the smallest number of roughly
    equal-sized chunks that each fit in ``max_length`` tokens, and every chunk
    is classified separately. For each chunk the scores of all returned labels
    are summed; the function returns the mean of those sums across chunks.

    Args:
        text: The string to score.
        tokenizer: Tokenizer matching the classifier's model. Must support
            ``tokenizer(text, ...)["input_ids"]``, ``decode`` and
            ``num_special_tokens_to_add``.
        classifier: Callable returning, for a single string, a list whose first
            element is a list of ``{"label": ..., "score": ...}`` dicts (the
            shape this notebook's ``top_k=None`` pipeline is indexed with).
        max_length: Maximum sequence length of the model, *including* the
            special tokens added around each chunk.

    Returns:
        The average (over chunks) of the summed label scores, or ``0.0`` when
        the text contains no tokens.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    # Tokenize WITHOUT special tokens. The chunks are decoded back to text and
    # re-tokenized by the classifier, which adds the special tokens again; if
    # they were counted here as content, a full-size chunk would overflow
    # max_length once they are re-added.
    input_ids = tokenizer(text, add_special_tokens=False, truncation=False)["input_ids"]
    total_tokens = len(input_ids)

    # Nothing to classify: avoid a pointless model call on an empty string.
    if total_tokens == 0:
        return 0.0

    # Room left for real content once the classifier adds its special tokens.
    content_budget = max_length - tokenizer.num_special_tokens_to_add()
    if content_budget <= 0:
        raise ValueError(
            f"max_length={max_length} leaves no room for content tokens"
        )

    # Use the fewest chunks that fit, then even out their sizes so the last
    # chunk is not a tiny remainder.
    num_chunks = math.ceil(total_tokens / content_budget)
    chunk_size = math.ceil(total_tokens / num_chunks)

    toxicity_scores = []
    for start in range(0, total_tokens, chunk_size):
        chunk = tokenizer.decode(input_ids[start:start + chunk_size], skip_special_tokens=True)

        # Decoding and re-encoding is not guaranteed to reproduce the exact
        # token count, so truncation is kept on as a safety net.
        result = classifier(chunk, truncation=True, max_length=max_length)

        # Sum the scores of every label returned for this chunk.
        toxicity_scores.append(sum(res['score'] for res in result[0]))

    # Average of the summed scores across all chunks.
    return sum(toxicity_scores) / len(toxicity_scores)

In [None]:
# Map a numeric toxicity score onto a binary 'toxic' / 'non-toxic' label
def classify_toxicity_score(score, threshold):
    """Return "toxic" when ``score`` is strictly greater than ``threshold``.

    A score equal to the threshold (or any score that does not compare as
    greater) yields "non-toxic".
    """
    if score > threshold:
        return "toxic"
    return "non-toxic"

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset can be read.
drive.mount('/content/drive')
# Location of the input CSV on the mounted drive.
file_path = '/content/drive/My Drive/Colab Notebooks/cleaned_data_2223.csv'
# Load the full dataset; later cells read its 'text' column.
df = pd.read_csv(file_path)

In [None]:
# Part 1: select the rows to label (everything from start_index onward)
start_index = 0
# end_index=1600000
# .copy() gives df_sample its own data, so adding the column below neither
# triggers SettingWithCopyWarning nor depends on pandas' view-vs-copy rules.
df_sample = df[start_index:].copy()
# Placeholder column; filled in by the labelling loop.
df_sample['predicted_label'] = None
# Preview only the first rows instead of rendering the whole frame.
df_sample.head()

In [None]:
# Hand-curated terms that short-circuit the model: the labelling loop marks a
# text "toxic" as soon as any of these occurs in it as a substring.
# Despite the `_dict` suffix this is a plain list; 'cb' and 'idiot' each appear
# twice, which is harmless for the membership test it is used in.
singlish_toxic_dict = ['ahbeng', 'ahlian', 'baka', 'bloody hell', 'bloody idiot', 'bodoh', 'bo liao',
                       'buay pai seh', 'buay tahan', 'cb', 'cb kia', 'cb knn', 'cb', 'cb lao jia', 'cb lao knn',
                       'cb lor', 'cb sia', 'cb sia kia', 'ccb', 'chbye kia', 'chao chbye', 'chao chee bye', 'chow chibai',
                       'chow kar', 'chow tu lan', 'cibai', 'dumb ass', 'dumb', 'fuck', 'fuck you', 'fking', 'fucker',
                       'fucker sia', 'gila babi', 'gundu', 'hao lian kia', 'hopeless', 'idiot', 'idiot', 'ji bai', 'jiat lat',
                       'jialat kia', 'jibai', 'joker', 'kan', 'kan ni na', 'kena sai', 'kia si lang', 'knn', 'knn cb kia', 'knnccb',
                       'knnbccb', 'kns', 'kns cb', 'lampa', 'lan pa', 'lanjiao', 'lanjiao kia', 'lj', 'loser', 'nabei', 'no use kia',
                       'noob', 'pok gai', 'pui', 'sabo kia', 'sibei jialat', 'sibei sian', 'si gina', 'siol', 'slut', 'siao lang', 'stupid',
                       'suck', 'sua gu', 'tmd', 'tiok knn', 'tiok tiam', 'useless', 'what knn', 'what the fuck', 'wtf', 'wu liao kia', 'you die ah', 'you die']

In [None]:
# Load the tokenizer used to count tokens and split long texts into chunks
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create the classification pipeline; top_k=None asks for a score per label
# and device=0 places the model on the first GPU.
classifier = pipeline('text-classification', model=model_name, top_k=None, device=0, batch_size=8)

# Collect one label per row, in row order, and attach them in a single
# column assignment after the loop (cheaper than per-row .at writes).
labels = []
for raw_text in df_sample['text']:
    # Missing cells are read from CSV as NaN (a float); `word in text` would
    # raise TypeError on them and abort the run before anything is saved.
    text = "" if pd.isna(raw_text) else str(raw_text)

    # Dictionary shortcut: any listed term occurring as a substring marks the
    # text toxic without calling the model.
    # NOTE(review): matching is case-sensitive and not word-bounded, so short
    # entries such as 'cb' or 'kan' also match inside longer words - confirm
    # this is intended.
    if any(word in text for word in singlish_toxic_dict):
        predicted_label = "toxic"
    else:
        # Get toxicity score for the text
        toxicity_score = classify_toxicity_by_dynamic_chunks(text, tokenizer, classifier)

        # Classify the toxicity score based on the threshold
        predicted_label = classify_toxicity_score(toxicity_score, threshold=0.1)

    labels.append(predicted_label)

df_sample['predicted_label'] = labels

# Save the updated DataFrame with labels to a new CSV file
# NOTE(review): the name has no separator before start_index and nothing after
# the '-' (e.g. '...labelled0-.csv'); left unchanged so existing consumers of
# that path keep working.
output_file_path = '/content/drive/My Drive/Colab Notebooks/cleaned_data_2223_labelled' + str(start_index) + '-' + '.csv'
df_sample.to_csv(output_file_path)

In [None]:
# Keep only the rows that carry one of the two final labels, then write them out
valid_labels = ['toxic', 'non-toxic']
has_label = df_sample['predicted_label'].isin(valid_labels)
df_temp = df_sample[has_label]
df_temp
output_file_path = '/content/drive/My Drive/Colab Notebooks/cleaned_data_2223_labelled_0-308212.csv'
df_temp.to_csv(output_file_path)