# Data Preprocessing

In [1]:
import pandas as pd
import sqlite3

In [2]:
conn = sqlite3.connect("./news_data_2.db")

In [3]:
sql = "select x.*, y.content from (select * from news_2 where rubrik2 not null) x inner join extracted_text y on x.url=y.url"

In [4]:
df = pd.read_sql(sql, con=conn)

In [5]:
df2 = df[['rubrik2', 'content']].copy()

In [6]:
df2['content'] = df2['content'].str.replace(r'^.*?jpnn\.com', '', regex=True)
df2['content'] = df2['content'].str.replace(r'[^\w.,\s]', '', regex=True)
df2['content'] = df2['content'].str.replace(r'\b\w*jpnn\w*\b', ' ', regex=True)
df2['content'] = df2['content'].str.replace(r' , ', ' ', regex=True)
df2['content'] = df2['content'].str.replace(r'\s+', ' ', regex=True)
df2['content'] = df2['content'].str.lower()

In [7]:
df2.columns = ["label", 'text']

In [8]:
df2.value_counts('label')

label
POLITIK                  6543
SOSIAL DAN BUDAYA        6045
OLAHRAGA                 3226
EKONOMI DAN BISNIS       2713
KRIMINAL                 2047
TEKNOLOGI DAN INOVASI     421
PENDIDIKAN                175
Name: count, dtype: int64

In [9]:
label_code = {
    "POLITIK": 0,
    "SOSIAL DAN BUDAYA": 1,
    "OLAHRAGA": 2,
    "EKONOMI DAN BISNIS": 3,
    "KRIMINAL": 4,
    "TEKNOLOGI DAN INOVASI": 5,
    "PENDIDIKAN": 6
}

def label_encoding(x):
    return label_code[x]

In [10]:
label_encoding("TEKNOLOGI DAN INOVASI")

5

In [11]:
df2['label'] = df2['label'].apply(label_encoding)

In [12]:
df2

Unnamed: 0,label,text
0,1,jakarta bupati tapanuli utara dr. drs. nikson...
1,0,jakarta pendukung dan pemilih pasangan prabow...
2,2,tim voli jakarta pertamina siap mengarungi ko...
3,5,jakarta produkproduk soundcore seperti open e...
4,2,jakarta persija jakarta berhasil memenangkan ...
...,...,...
21165,2,london arsenal harus mengubur mimpi bertakhta...
21166,3,kupang kabid propam polda nusa tenggara timur...
21167,1,jakarta kapolri jenderal listyo sigit prabowo...
21168,1,"manado badan meteorologi, klimatologi dan geo..."


# Data Modelling

In [13]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
num_labels = len(df2['label'].unique())

In [15]:
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2')



In [16]:
# model = AutoModel.from_pretrained("indobenchmark/indobert-base-p2", num_labels=7)

In [17]:
from datasets import Dataset

In [18]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df2, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [19]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)  # Sesuaikan `max_length` dengan kebutuhan

In [20]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████| 16936/16936 [00:17<00:00, 985.74 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 4234/4234 [00:03<00:00, 1071.22 examples/s]


In [21]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(d['input_ids']) for d in batch]),
        'attention_mask': torch.stack([torch.tensor(d['attention_mask']) for d in batch]),
        'labels': torch.tensor([d['label'] for d in batch])
    }

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)


In [22]:
model = AutoModelForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p2', num_labels=len(df2['label'].unique()))
# model.to('cuda' if torch.cuda.is_available() else 'cpu')
model.to('cpu')
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
num_epochs = 5

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')



KeyboardInterrupt

