# Notebook

In [76]:
import pandas as pd
import numpy as np
import os

## Data import

In [77]:
dataset = pd.read_csv('indonesian_chat.csv')

In [78]:
dataset.head()

Unnamed: 0,id,chat,label
0,1,main mu kek tai cok,violence
1,2,user telat ngasih tau elu edan sarap gue berga...,violence
2,3,kadang berfikir percaya tuhan jatuh berkalikal...,neutral
3,4,user user aku\n\nku tau matamu sipit diliat,racist
4,5,capek deh ketemu kaum cina kapir gini match,racist


In [79]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10702 entries, 0 to 10701
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10702 non-null  int64 
 1   chat    10702 non-null  object
 2   label   10702 non-null  object
dtypes: int64(1), object(2)
memory usage: 251.0+ KB


## Data Cleaning (WIP)

In [80]:
dataset.drop("id", axis=1, inplace=True)

# Label Encode Target

In [81]:
from sklearn.preprocessing import LabelEncoder

In [82]:
le = LabelEncoder()
dataset['labels'] = le.fit_transform(dataset['label'])
dataset.drop("label", axis=1, inplace=True)

regex stuff

## BERT Import

In [83]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [84]:
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("indolem/indobert-base-uncased",num_labels=len(le.classes_))  # Number of unique labels

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Preproc

In [85]:
from sklearn.model_selection import train_test_split

In [86]:
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

In [87]:
df_train.size, df_test.size

(17122, 4282)

In [88]:
df_train.head()

Unnamed: 0,chat,labels
2312,15 terimakasih selamat jalan ila jiwari robbih...,1
2516,askmf sedih gaksi orang2 ngeledekin doi gitu n...,3
3467,ape beneeeer dapet jadwal shift pagi 4 besok o...,1
8573,pantes yatim gak ngewe bareng loe,0
9814,congor nya kaum sumbu pendek,2


convert to huggingface dataset

In [89]:
from datasets import Dataset

In [90]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

## Tokenizer

In [91]:
def preprocess_text(data):
    return tokenizer(data["chat"],truncation=True, padding="max_length", max_length=512)

In [92]:
tokenized_train = train_dataset.map(preprocess_text, batched=True)

Map: 100%|██████████| 8561/8561 [00:02<00:00, 3888.71 examples/s]


In [93]:
tokenized_test = test_dataset.map(preprocess_text, batched=True)

Map:   0%|          | 0/2141 [00:00<?, ? examples/s]

Map: 100%|██████████| 2141/2141 [00:00<00:00, 3956.61 examples/s]


## Training

In [94]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

In [95]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [96]:
import evaluate

In [97]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [98]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics

)


  trainer = Trainer(


In [99]:
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [None]:
trainer.save_model('filter_model')

## Evaluation

In [None]:
from sklearn.metrics import classification_report

In [None]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_train['labels'].tolist()
print(classification_report(GT,preds))

In [None]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_test['labels'].tolist()
print(classification_report(GT,preds))