In [None]:
# Ad-Soyad:İrem ÇATAL
# Son Revize Tarihi: 15.02.2024

In [2]:
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [29]:
import numpy as np
import pandas as pd
import random
import time
import datetime
import torch
import transformers
import tensorflow as tf

from transformers import BertTokenizer
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

In [30]:
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  details = tf.config.experimental.get_device_details(gpus[0])
  print("GPU details: ", details)


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'compute_capability': (8, 0), 'device_name': 'NVIDIA A100-SXM4-40GB'}


In [31]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP_task/topics.csv')

In [32]:
# Makine öğrenmesi için kategori isimlerini sayısal değerlere dönüştürdüm.
df['encoded_categories'] = LabelEncoder().fit_transform(df['category'])

In [33]:
df.head()

Unnamed: 0,category,text,encoded_categories
0,siyaset,3 milyon ile ön seçim vaadi mhp nin 10 olağan...,4
1,siyaset,mesut_yılmaz yüce_divan da ceza alabilirdi pr...,4
2,siyaset,disko lar kaldırılıyor başbakan_yardımcısı ar...,4
3,siyaset,sarıgül anayasa_mahkemesi ne gidiyor mustafa_...,4
4,siyaset,erdoğan idamın bir haklılık sebebi var demek ...,4


In [34]:
df['category'].value_counts()

siyaset       700
dunya         700
ekonomi       700
kultur        700
saglik        700
spor          700
teknoloji     700
Name: category, dtype: int64

In [35]:
df.sample(10)

Unnamed: 0,category,text,encoded_categories
2933,saglik,çocuklarda tiroid yetersizliği büyümeyi engel...,3
365,siyaset,erdoğan kararı bugün ! bu ayın sonunda görev ...,4
185,siyaset,izmir de kimse hayatından memnun değil yerel ...,4
241,siyaset,erdoğan perinçek ten tazminat kazandı başbaka...,4
140,siyaset,bakanlıktan recep_akdağ yalanlaması sağlık_ba...,4
1220,dunya,abd ve ingiltere afganistan ı tekrar iç savaş...,0
4692,teknoloji,twitter kullanıcıları tehlike altında ! twitt...,6
3083,saglik,sigara beyni çürütüyor sigara tüketiminin bey...,3
1510,ekonomi,komşu türk turist çekmek için camileri bile o...,1
4810,teknoloji,android kuşatma altında ! android kuşatma alt...,6


In [36]:
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', do_lower_case=True)
sentences = df.text.values
max_len = 250

In [37]:
# Veri setini %80 eğitim ve %20 test olarak ayırdım
training = df.groupby('category').apply(lambda x : x.sample(frac = 0.8))
test = pd.concat([df,training]).drop_duplicates(keep=False)

print("Training: ", len(training))
print("Test: ", len(test))

training_texts = training.text.values
training_labels = training.encoded_categories.values

Training:  3920
Test:  856


In [38]:
input_ids = []
attention_masks = []

for text in training_texts:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = max_len,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(training_labels)

print('Original: ', training_texts[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:   suriye de iki taraf da sonuna kadar savaşmakta kararlı rusya_dışişleri_bakanı_sergey_lavrov suriye de taraflardan hiç birinin savaşı durdurmak istemediğini herkesin sonuna kadar savaşmaya niyetli olduğunu söyledi rus basınına değerlendirmede bulunan lavrov suriye cumhurbaşkanı beşşar_esed görevi bırakmayı reddediyor o na kim ne söylerse söylesin o ayrılmayacak o ikna edilemez batılı liderler ve arap komşularından gelen tehditlerin de farkında … muhalefet gibi o da kazanıncaya kadar savaşmakta kararlı … uyarısında bulundu batı muhalefeti savaşa devam etmesi için teşvik ediyor eleştirisi yapan lavrov esed in herhangi bir zafer olmasa da kazanıncaya kadar savaşmakta ısrarcı olacağını söyledi bm_güvenlik_konseyi nin suriye özel temsilcisi lakhdar ibrahimi yi dinlediğini hatırlatan lavrov ibrahimi herhangi bir zafer olacağına inanmıyor bir yıpratma savaşı sürüyor insanların kültürel değerlerin tarihi anıtların hatta unesco nun korumasındaki halep in yıkıldığı bir savaş bu çok kö

In [39]:
train_dataset = TensorDataset(input_ids, attention_masks, labels)

batch_size = 32

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

number_of_categories = len(df['encoded_categories'].unique())

model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-128k-uncased",
    num_labels = number_of_categories,
    output_attentions = False,
    output_hidden_states = False,
)
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(128000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [40]:
epochs = 4

optimizer = AdamW(model.parameters(),
                  lr = 5e-5,
                  eps = 1e-8
                )

total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)



In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(device)


cuda


In [42]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

seed_val = 1903

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []
total_t0 = time.time()

for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 10 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        output = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels)
        loss = output['loss']
        logits = output['logits']
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("Average training loss: {0:.2f}".format(avg_train_loss))
    print("Training epoch took: {:}".format(training_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )

print("Training completed in {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

Batch    10  of    123.    Elapsed: 0:00:03.
Batch    20  of    123.    Elapsed: 0:00:06.
Batch    30  of    123.    Elapsed: 0:00:09.
Batch    40  of    123.    Elapsed: 0:00:12.
Batch    50  of    123.    Elapsed: 0:00:15.
Batch    60  of    123.    Elapsed: 0:00:18.
Batch    70  of    123.    Elapsed: 0:00:21.
Batch    80  of    123.    Elapsed: 0:00:24.
Batch    90  of    123.    Elapsed: 0:00:28.
Batch   100  of    123.    Elapsed: 0:00:31.
Batch   110  of    123.    Elapsed: 0:00:34.
Batch   120  of    123.    Elapsed: 0:00:37.
Average training loss: 0.52
Training epoch took: 0:00:37
Batch    10  of    123.    Elapsed: 0:00:03.
Batch    20  of    123.    Elapsed: 0:00:06.
Batch    30  of    123.    Elapsed: 0:00:09.
Batch    40  of    123.    Elapsed: 0:00:12.
Batch    50  of    123.    Elapsed: 0:00:15.
Batch    60  of    123.    Elapsed: 0:00:18.
Batch    70  of    123.    Elapsed: 0:00:21.
Batch    80  of    123.    Elapsed: 0:00:24.
Batch    90  of    123.    Elapsed: 0:00:27

In [45]:
test_texts = test.text.values
test_labels = test.encoded_categories.values

input_ids = []
attention_masks = []

for text in test_texts:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = max_len,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(test_labels)

batch_size = 32

prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [46]:
print('Prediction started on test data')
model.eval()
predictions , true_labels = [], []

for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch

  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)

print('Prediction completed')

prediction_set = []

for i in range(len(true_labels)):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  prediction_set.append(pred_labels_i)

prediction_scores = [item for sublist in prediction_set for item in sublist]

Prediction started on test data
Prediction completed


In [47]:
f_score = f1_score(test_labels, prediction_scores, average='macro')
precision = precision_score(test_labels, prediction_scores, average='macro')
recall = recall_score(test_labels, prediction_scores, average='macro')

print("F-Score: ", f_score)
print("Recall: ", recall)
print("Precision: ", precision)

report = pd.DataFrame(classification_report(test_labels, prediction_scores, output_dict=True))
report = report.rename(columns={'0':'dunya',
                          '1':'ekonomi',
                          '2':'kultur',
                          '3':'saglik',
                          '4':'siyaset',
                          '5':'spor',
                          '6':'teknoloji'})

print(report)

F-Score:  0.9289798318371825
Recall:  0.9301674101053105
Precision:  0.9281191065560446
                dunya     ekonomi     kultur      saglik     siyaset  \
precision    0.929688    0.890511   0.910000    0.954955    0.918519   
recall       0.894737    0.890511   0.957895    0.946429    0.911765   
f1-score     0.911877    0.890511   0.933333    0.950673    0.915129   
support    133.000000  137.000000  95.000000  112.000000  136.000000   

                 spor   teknoloji  accuracy   macro avg  weighted avg  
precision    0.975806    0.917355   0.92757    0.928119      0.927526  
recall       1.000000    0.909836   0.92757    0.930167      0.927570  
f1-score     0.987755    0.913580   0.92757    0.928980      0.927400  
support    121.000000  122.000000   0.92757  856.000000    856.000000  


In [48]:
# Modelin kaydedilmesi
torch.save(model.state_dict(), "/content/drive/My Drive/model.pth")