# Notebook

In [139]:
import pandas as pd
import numpy as np
import os
import re

## Data import

In [140]:
dataset = pd.read_csv('indonesian_chat.csv')

In [141]:
dataset.head()

Unnamed: 0,id,chat,label
0,1,main mu kek tai cok,violence
1,2,user telat ngasih tau elu edan sarap gue berga...,violence
2,3,kadang berfikir percaya tuhan jatuh berkalikal...,neutral
3,4,user user aku\n\nku tau matamu sipit diliat,racist
4,5,capek deh ketemu kaum cina kapir gini match,racist


In [142]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10702 entries, 0 to 10701
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10702 non-null  int64 
 1   chat    10702 non-null  object
 2   label   10702 non-null  object
dtypes: int64(1), object(2)
memory usage: 251.0+ KB


## Data Cleaning (WIP)

In [143]:
dataset.drop("id", axis=1, inplace=True)

remove \n and \x__ from chat

In [144]:
dataset["chat"] = dataset["chat"].str.replace(r"\\n", "", regex=True)
dataset["chat"] = dataset["chat"].str.replace(r"\\x[0-9a-fA-F]{2}", "", regex=True)

kata "user" juga sering muncul tanpa arti yang jelas, mungkin pada awalnya merepresentasikan username orang yang dituju, sehingga di-drop karena kami anggap noise

In [145]:
dataset["chat"] = dataset["chat"].str.replace("user", "", regex=False)

# Label Encode Target

In [146]:
from sklearn.preprocessing import LabelEncoder

In [147]:
le = LabelEncoder()
dataset['labels'] = le.fit_transform(dataset['label'])
dataset.drop("label", axis=1, inplace=True)

regex stuff

## BERT Import

In [148]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [149]:
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("indolem/indobert-base-uncased",num_labels=len(le.classes_))  # Number of unique labels

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Preproc

In [150]:
from sklearn.model_selection import train_test_split

In [151]:
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

In [152]:
df_train.size, df_test.size

(17122, 4282)

In [153]:
df_train.head()

Unnamed: 0,chat,labels
2312,15 terimakasih selamat jalan ila jiwari robbih...,1
2516,askmf sedih gaksi orang2 ngeledekin doi gitu n...,3
3467,ape beneeeer dapet jadwal shift pagi 4 besok o...,1
8573,pantes yatim gak ngewe bareng loe,0
9814,congor nya kaum sumbu pendek,2


convert to huggingface dataset

In [154]:
from datasets import Dataset

In [155]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

## Tokenizer

In [172]:
def preprocess_text(data, is_train=True):
    result =  tokenizer(data["chat"],truncation=True, padding="max_length", max_length=512)
    if is_train:
        result["labels"] = data["labels"]
    return result

In [157]:
tokenized_train = train_dataset.map(preprocess_text, batched=True)

Map:   0%|          | 0/8561 [00:00<?, ? examples/s]

Map: 100%|██████████| 8561/8561 [00:02<00:00, 3812.35 examples/s]


In [158]:
tokenized_test = test_dataset.map(preprocess_text, batched=True)

Map: 100%|██████████| 2141/2141 [00:00<00:00, 4050.29 examples/s]


In [159]:
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

## Training

In [160]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

In [161]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [162]:
import evaluate

In [163]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [164]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics

)


  trainer = Trainer(


In [165]:
trainer.train()

Step,Training Loss
1071,0.9229
2142,0.5282
3213,0.3822
4284,0.2816


TrainOutput(global_step=4284, training_loss=0.528735170622611, metrics={'train_runtime': 6327.1708, 'train_samples_per_second': 5.412, 'train_steps_per_second': 0.677, 'total_flos': 9010136773607424.0, 'train_loss': 0.528735170622611, 'epoch': 4.0})

In [166]:
trainer.save_model('filter_model')

## Evaluation

In [178]:
for idx, class_name in enumerate(le.classes_):
    print(f"{idx}: {class_name}")

0: harassment
1: neutral
2: racist
3: violence


In [167]:
from sklearn.metrics import classification_report

In [168]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_train['labels'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2013
           1       0.97      0.91      0.94      2182
           2       0.97      0.98      0.97      2011
           3       0.93      0.96      0.94      2355

    accuracy                           0.96      8561
   macro avg       0.96      0.96      0.96      8561
weighted avg       0.96      0.96      0.96      8561



In [169]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_test['labels'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       471
           1       0.81      0.75      0.78       547
           2       0.85      0.90      0.88       495
           3       0.81      0.81      0.81       628

    accuracy                           0.84      2141
   macro avg       0.84      0.84      0.84      2141
weighted avg       0.83      0.84      0.83      2141



In [175]:
test_sentences = ["ini adalah contoh chat yang baik",
                  "woy goblok banget jadi orang",
                  "ngntot lu",
                  "iya deh yang sipit diem aj deh"]
custom_dataset = pd.DataFrame(test_sentences, columns=["chat"])
custom_hugg_sample = Dataset.from_pandas(custom_dataset)
custom_tokenized_sample = custom_hugg_sample.map(lambda data: preprocess_text(data, is_train=False), batched=True)
predictions = trainer.predict(custom_tokenized_sample)
predictions = np.argmax(predictions[0], axis=1)
for i, pred in enumerate(predictions):
    print(f"Chat: {test_sentences[i]}")
    print(f"Predicted Label: {le.inverse_transform([pred])[0]}")
    print()

Map: 100%|██████████| 4/4 [00:00<00:00, 613.18 examples/s]


Chat: ini adalah contoh chat yang baik
Predicted Label: neutral

Chat: woy goblok banget jadi orang
Predicted Label: violence

Chat: ngntot lu
Predicted Label: harassment

Chat: iya deh yang sipit diem aj deh
Predicted Label: racist

