In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hate-speech-classification/CaseFolded_PunctRemoved_RTUserRemovedTest.csv
/kaggle/input/hate-speech-classification/CaseFolded_PunctRemoved_RTUserRemovedTrain.csv


In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
train_df = pd.read_csv('/kaggle/input/hate-speech-classification/CaseFolded_PunctRemoved_RTUserRemovedTrain.csv')
test_df = pd.read_csv('/kaggle/input/hate-speech-classification/CaseFolded_PunctRemoved_RTUserRemovedTest.csv')

tweets = train_df['Tweet'].tolist()
targets = train_df[['HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other', 
                    'HS_Individual', 'HS_Group', 'HS_Weak', 'HS_Moderate', 'HS_Strong']].values.tolist()

test_tweets = test_df['Tweet'].tolist()

In [4]:
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')
max_length = 128
lr = 2e-5
epochs = 20
batch_size = 32

In [28]:
# Pembuatan Dataset
class HateSpeechDataset(Dataset):
    def __init__(self, tweets, targets, tokenizer, max_length):
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.tweets)
    
    def __getitem__(self, index):
        tweet = self.tweets[index]
        target = self.targets[index]
        
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target': torch.tensor(target, dtype=torch.float)
        }

In [6]:
class HateSpeechClassifier(nn.Module):
    def __init__(self, num_classes):
        super(HateSpeechClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('indolem/indobert-base-uncased')
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)
        self.softmax = nn.Sigmoid()
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)[0]
        pooled_output = outputs[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        
        linear_output1 = self.relu(self.fc1(pooled_output))
        linear_output2 = self.relu(self.fc2(linear_output1))
        linear_output3 = self.fc3(linear_output2)
        
        logits = self.softmax(linear_output3)
        
        return logits


In [7]:
train_tweets, val_tweets, train_targets, val_targets = train_test_split(tweets, targets, test_size=0.2, random_state=42)

In [8]:
train_dataset = HateSpeechDataset(train_tweets, train_targets, tokenizer, max_length)
val_dataset = HateSpeechDataset(val_tweets, val_targets, tokenizer, max_length)
test_dataset = HateSpeechDataset(test_tweets, [], tokenizer, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [9]:
model = HateSpeechClassifier(num_classes=len(train_targets[0]))
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
criterion = nn.BCELoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion.to(device)
device

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


device(type='cuda')

In [10]:
def checkpoint(model, filename):
    torch.save(model.state_dict(), filename)
    
def resume(model, filename):
    model.load_state_dict(torch.load(filename))

In [11]:
from sklearn.metrics import accuracy_score

In [12]:
# Convert logits to corresponding prediction
def finalize_prediction(out):    
    # Handle category
    category = out[:5]
    for i in range(5):
        if category[i] > 0.5:
            out[i] = 1
        else:
            out[i] = 0
    
    # Handle individual/group target
    i = 5
    if out[i] >= out[i+1]:
        out[i] = 1
        out[i+1] = 0
    elif out[i] < out[i+1]:
        out[i] = 0
        out[i+1] = 1

    # Handle level of hate
    i = 7
    max = out[i]
    max_idx = i
    for j in range(1, 3):
        if out[i+j] > max:
            max = out[i+j]
            max_idx = i+j
    for i in range(7, 10):
        if i != max_idx:
            out[i] = 0
        else:
            out[i] = 1

In [13]:
best_accuracy = 0.0
for epoch in range(epochs):
    model.train()
    print(f"Epoch {epoch+1}: Training start")
    train_loss = 0.0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, targets)
        loss.backward(retain_graph=True)
        optimizer.step()
        
        train_loss += loss.item() * input_ids.size(0)
        
    train_loss /= len(train_dataloader.dataset)
    print(f"Epoch {epoch+1}: Training Done")
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)
            
            outputs = model(input_ids, attention_mask)
            for output in outputs:
                finalize_prediction(output)
            loss = criterion(outputs, targets)
            
            val_loss += loss.item() * input_ids.size(0)
            
            val_preds.extend(targets.cpu().detach().numpy().tolist())
            val_targets.extend(outputs.cpu().detach().numpy().tolist())
    acc = accuracy_score(val_targets, val_preds) * 100
    val_loss /= len(val_dataloader.dataset)
    
    print(f'Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}')
    print(f"Epoch {epoch+1}: validation accuracy = {acc:.2f}%")
    if acc > best_accuracy:
        best_accuracy = acc
        checkpoint(model, "best_model.pth")
    print(f"Epoch {epoch+1}: Validation end")

resume(model, "best_model.pth")

Epoch 1: Training start
Epoch 1: Training Done
Epoch 1/20 - Train Loss: 0.5108 - Val Loss: 21.7282
Epoch 1: validation accuracy = 44.28%
Epoch 1: Validation end
Epoch 2: Training start
Epoch 2: Training Done
Epoch 2/20 - Train Loss: 0.3974 - Val Loss: 16.7057
Epoch 2: validation accuracy = 45.81%
Epoch 2: Validation end
Epoch 3: Training start
Epoch 3: Training Done
Epoch 3/20 - Train Loss: 0.3442 - Val Loss: 15.9406
Epoch 3: validation accuracy = 45.54%
Epoch 3: Validation end
Epoch 4: Training start
Epoch 4: Training Done
Epoch 4/20 - Train Loss: 0.2951 - Val Loss: 12.8533
Epoch 4: validation accuracy = 56.71%
Epoch 4: Validation end
Epoch 5: Training start
Epoch 5: Training Done
Epoch 5/20 - Train Loss: 0.2455 - Val Loss: 12.2322
Epoch 5: validation accuracy = 57.07%
Epoch 5: Validation end
Epoch 6: Training start
Epoch 6: Training Done
Epoch 6/20 - Train Loss: 0.2002 - Val Loss: 12.5203
Epoch 6: validation accuracy = 57.70%
Epoch 6: Validation end
Epoch 7: Training start
Epoch 7: T

In [14]:
resume(model, "best_model.pth")

In [15]:
test_df.head()

Unnamed: 0,No,Tweet
0,Test-1,pemerintah sekarang pro asing sudah tidak bisa...
1,Test-2,cebong dungu picek sudah kalah malah gila
2,Test-3,namanya juga simpang susun bukan bundaran sema...
3,Test-4,yang tidak pakai jilbab komunis megawati sri m...
4,Test-5,ramos yang aku pandang idola dahulu sekarang s...


In [16]:
submission_sample = pd.read_csv('/kaggle/input/hate-speech-classification/sample.csv')
submission_sample.head()

Unnamed: 0,No,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,Test-1,0,0,0,0,0,0,0,0,0,0
1,Test-2,0,0,0,0,0,0,0,0,0,0
2,Test-3,0,0,0,0,0,0,0,0,0,0
3,Test-4,0,0,0,0,0,0,0,0,0,0
4,Test-5,0,0,0,0,0,0,0,0,0,0


In [46]:
# Extract dummy target from submission sample & concat it to cleaned training data
target_list = [
    'HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender',
    'HS_Other', 'HS_Individual', 'HS_Group', 'HS_Weak', 
    'HS_Moderate', 'HS_Strong'
]

In [61]:
df_test = pd.read_csv('/kaggle/input/hate-speech-classification/CaseFolded_PunctRemoved_RTUserRemovedTest.csv')


In [48]:
df_test.head()

Unnamed: 0,No,Tweet
0,Test-1,pemerintah sekarang pro asing sudah tidak bisa...
1,Test-2,cebong dungu picek sudah kalah malah gila
2,Test-3,namanya juga simpang susun bukan bundaran sema...
3,Test-4,yang tidak pakai jilbab komunis megawati sri m...
4,Test-5,ramos yang aku pandang idola dahulu sekarang s...


In [49]:
dummy_target = pd.DataFrame(submission_sample[target_list].values, columns=target_list)
dummy_target

Unnamed: 0,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
145,0,0,0,0,0,0,0,0,0,0
146,0,0,0,0,0,0,0,0,0,0
147,0,0,0,0,0,0,0,0,0,0
148,0,0,0,0,0,0,0,0,0,0


In [60]:
df_test = pd.concat([df_test, dummy_target], axis=1)
df_test.head()

Unnamed: 0,No,Tweet,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,Test-1,pemerintah sekarang pro asing sudah tidak bisa...,0,0,0,0,0,0,0,0,0,0
1,Test-2,cebong dungu picek sudah kalah malah gila,0,0,0,0,0,0,0,0,0,0
2,Test-3,namanya juga simpang susun bukan bundaran sema...,0,0,0,0,0,0,0,0,0,0
3,Test-4,yang tidak pakai jilbab komunis megawati sri m...,0,0,0,0,0,0,0,0,0,0
4,Test-5,ramos yang aku pandang idola dahulu sekarang s...,0,0,0,0,0,0,0,0,0,0


In [51]:
df_test

Unnamed: 0,No,Tweet,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,Test-1,pemerintah sekarang pro asing sudah tidak bisa...,0,0,0,0,0,0,0,0,0,0
1,Test-2,cebong dungu picek sudah kalah malah gila,0,0,0,0,0,0,0,0,0,0
2,Test-3,namanya juga simpang susun bukan bundaran sema...,0,0,0,0,0,0,0,0,0,0
3,Test-4,yang tidak pakai jilbab komunis megawati sri m...,0,0,0,0,0,0,0,0,0,0
4,Test-5,ramos yang aku pandang idola dahulu sekarang s...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
145,Test-146,banyak kader kader partai demokrasi indonesia ...,0,0,0,0,0,0,0,0,0,0
146,Test-147,ternyata buddha pun teroris,0,0,0,0,0,0,0,0,0,0
147,Test-148,biasa mereka itulah para antek antek zionis da...,0,0,0,0,0,0,0,0,0,0
148,Test-149,astaga bolot banget dia,0,0,0,0,0,0,0,0,0,0


In [52]:
test_tweets = df_test['Tweet'].tolist()
test_dataset = HateSpeechDataset(test_tweets, df_test[target_list].values.tolist(), tokenizer, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [53]:
output_labels = []
with torch.no_grad():
    for batch_data in test_dataloader:
        # Unpack batch_data
        ids = batch_data['input_ids'].to(device)
        mask = batch_data['attention_mask'].to(device)
        
        # Saving output
        outputs = model(ids, mask)
        for output in outputs:
            finalize_prediction(output)
        output_labels.extend(outputs.cpu().detach().numpy().tolist())

In [54]:
model_preds = pd.DataFrame(output_labels, columns=target_list)
model_preds.head()

Unnamed: 0,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [55]:
model_preds.to_csv('prediction.csv')

In [62]:
df_test = pd.concat([df_test, model_preds], axis=1)
df_test.head()

Unnamed: 0,No,Tweet,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,Test-1,pemerintah sekarang pro asing sudah tidak bisa...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,Test-2,cebong dungu picek sudah kalah malah gila,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,Test-3,namanya juga simpang susun bukan bundaran sema...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,Test-4,yang tidak pakai jilbab komunis megawati sri m...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,Test-5,ramos yang aku pandang idola dahulu sekarang s...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [65]:
df_test = df_test.drop("Tweet", axis=1)
df_test.head()

Unnamed: 0,No,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,Test-1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,Test-2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,Test-3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,Test-4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,Test-5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [66]:
df_test.to_csv('final_pred.csv', index=False)