# Notebook

In [1]:
import pandas as pd
import numpy as np
import os

## Data import

In [2]:
dataset = pd.read_csv('indonesian_chat.csv')

In [3]:
dataset.head()

Unnamed: 0,id,chat,label
0,1,main mu kek tai cok,violence
1,2,user telat ngasih tau elu edan sarap gue berga...,violence
2,3,kadang berfikir percaya tuhan jatuh berkalikal...,neutral
3,4,user user aku\n\nku tau matamu sipit diliat,racist
4,5,capek deh ketemu kaum cina kapir gini match,racist


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10702 entries, 0 to 10701
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10702 non-null  int64 
 1   chat    10702 non-null  object
 2   label   10702 non-null  object
dtypes: int64(1), object(2)
memory usage: 251.0+ KB


## Data Cleaning (WIP)

In [5]:
dataset.drop("id", axis=1, inplace=True)

# Label Encode Target

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
le = LabelEncoder()
dataset['labels'] = le.fit_transform(dataset['label'])
dataset.drop("label", axis=1, inplace=True)

regex stuff

## BERT Import

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [50]:
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("indolem/indobert-base-uncased",num_labels=len(le.classes_))  # Number of unique labels

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Preproc

In [10]:
from sklearn.model_selection import train_test_split

In [51]:
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

In [52]:
df_train.size, df_test.size

(17122, 4282)

In [53]:
df_train.head()

Unnamed: 0,chat,labels
2312,15 terimakasih selamat jalan ila jiwari robbih...,1
2516,askmf sedih gaksi orang2 ngeledekin doi gitu n...,3
3467,ape beneeeer dapet jadwal shift pagi 4 besok o...,1
8573,pantes yatim gak ngewe bareng loe,0
9814,congor nya kaum sumbu pendek,2


convert to huggingface dataset

In [54]:
from datasets import Dataset

In [55]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

## Tokenizer

In [None]:
def preprocess_text(data):
    result =  tokenizer(data["chat"],truncation=True, padding="max_length", max_length=512)
    result["labels"] = data["labels"]
    return result

In [57]:
tokenized_train = train_dataset.map(preprocess_text, batched=True)

Map: 100%|██████████| 8561/8561 [00:02<00:00, 4152.89 examples/s]


In [58]:
tokenized_test = test_dataset.map(preprocess_text, batched=True)

Map: 100%|██████████| 2141/2141 [00:00<00:00, 4065.62 examples/s]


In [61]:
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

## Training

In [62]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

In [63]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [64]:
import evaluate

In [65]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [69]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics

)


  trainer = Trainer(


In [70]:
trainer.train()

Step,Training Loss
1071,1.392
2142,1.3916


KeyboardInterrupt: 

In [25]:
trainer.save_model('filter_model')

## Evaluation

In [26]:
from sklearn.metrics import classification_report

In [27]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_train['labels'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2013
           1       0.00      0.00      0.00      2182
           2       0.00      0.00      0.00      2011
           3       0.28      1.00      0.43      2355

    accuracy                           0.28      8561
   macro avg       0.07      0.25      0.11      8561
weighted avg       0.08      0.28      0.12      8561



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_test['labels'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       471
           1       0.00      0.00      0.00       547
           2       0.00      0.00      0.00       495
           3       0.29      1.00      0.45       628

    accuracy                           0.29      2141
   macro avg       0.07      0.25      0.11      2141
weighted avg       0.09      0.29      0.13      2141



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
sample_dataset = dataset[:10]
hugg_sample = Dataset.from_pandas(sample_dataset)
tokenized_sample = hugg_sample.map(preprocess_text, batched=True)
predictions = trainer.predict(tokenized_sample)
predictions = np.argmax(predictions[0], axis=1)
for i, pred in enumerate(predictions):
    print(f"Chat: {sample_dataset['chat'].iloc[i]}")
    print(f"Predicted Label: {le.inverse_transform([pred])[0]}")
    print()

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map: 100%|██████████| 10/10 [00:00<00:00, 1249.76 examples/s]


Chat: main mu kek tai cok
Predicted Label: neutral

Chat: user telat ngasih tau elu edan sarap gue bergaul elu
Predicted Label: neutral

Chat: kadang berfikir percaya tuhan jatuh berkalikali kadang tuhan ninggalkan orangtuaku berencana berpisah kakakku memilih kristen
Predicted Label: neutral

Chat: user user aku\n\nku tau matamu sipit diliat
Predicted Label: neutral

Chat: capek deh ketemu kaum cina kapir gini match
Predicted Label: neutral

Chat: skill cacad ngide jungler cok
Predicted Label: neutral

Chat: gg main lo keren bro
Predicted Label: neutral

Chat: gue aja kelar rewatch aldnoah zero kampret emang endingnya 2 karakter utama cowonya kena friendzone bray xd url
Predicted Label: neutral

Chat: admin belanja port terbaik nak makan ai kepal milo ai kepal horlicks cendol toping kaw doket mano tu
Predicted Label: neutral

Chat: user enak lg klo smbil ngewe
Predicted Label: neutral

