code link: https://www.kaggle.com/code/neerajmohan/fine-tuning-bert-for-text-classification

In [1]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
import string

import torch
import gc
gc.collect()
torch.cuda.empty_cache()
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split

from transformers import BertForSequenceClassification, AdamW,BertTokenizer,get_linear_schedule_with_warmup

### Veri Okuma

In [2]:
df_dataset = pd.read_csv("../dataset/dataset_without_stopwords.csv").drop(columns=["baslik"])

##### [CLS] ve [SEP] tokenlarının eklenmesi

In [4]:
df_dataset["sarki_sozu"] = ["[CLS]" + lyrics.replace("\n", "[SEP]") for lyrics in df_dataset.loc[:,"sarki_sozu"]]

In [6]:
lyrics = df_dataset.loc[:,"sarki_sozu"].values
labels = df_dataset.loc[:,"sarki_turu"].values

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

# Submission

In [8]:
model = torch.load('../models/bert_model')

In [9]:
df_test = df_dataset.copy()
test_lyrics = df_test['sarki_sozu'].values

In [10]:
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased', do_lower_case=True)

In [11]:
test_input_ids = []
test_attention_masks = []
for lyrics in test_lyrics:
    encoded_dict = tokenizer.encode_plus(
                        lyrics,                     
                        add_special_tokens = True, 
                        max_length = 512,           
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = 2 # Evaluate with this batch size.
        )

In [13]:
vector_preds = []
predictions = []
for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():        
            output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask)
            logits = output.logits
            logits = logits.detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()
            
            predictions.extend(list(pred_flat))
            temp_list = [logit.tolist() for logit in logits]
            vector_preds.extend(temp_list)

In [14]:
vector_preds[:10]

[[6.865117073059082,
  -0.23348672688007355,
  -1.5631963014602661,
  -1.5083428621292114,
  -1.3236541748046875,
  -1.542231798171997],
 [6.854511737823486,
  -0.13916641473770142,
  -1.474483847618103,
  -1.4636982679367065,
  -1.3609213829040527,
  -1.610752820968628],
 [6.86132287979126,
  -0.2804540991783142,
  -1.6744719743728638,
  -1.4116134643554688,
  -1.2205173969268799,
  -1.5300652980804443],
 [6.235579967498779,
  -0.044917602092027664,
  -1.9454907178878784,
  -0.13787297904491425,
  -1.3412141799926758,
  -1.8223973512649536],
 [6.606011867523193,
  0.421371191740036,
  -1.6814998388290405,
  -1.2813637256622314,
  -1.5418347120285034,
  -1.91470468044281],
 [0.4375429153442383,
  -0.10549703240394592,
  -1.715102195739746,
  4.687652111053467,
  -1.3462865352630615,
  -2.453789234161377],
 [6.765493869781494,
  -0.7332380414009094,
  -0.5144410133361816,
  -1.2446964979171753,
  -1.37889564037323,
  -1.6764421463012695],
 [6.8312225341796875,
  -0.6773218512535095,
  -

In [15]:
df_output = pd.DataFrame()

df_output['sarki_turu'] =predictions
df_output.to_csv('../submissions/submission.csv',index=True)