In [1]:
import pandas as pd
import numpy as np
import operator
import re
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

# Register tqdm's pandas integration (progress-bar variants of apply-style methods).
# NOTE(review): none of the visible cells call those methods — confirm this is still needed.
tqdm.pandas()




In [2]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and convergence warnings from the libraries imported below.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import optuna
import random
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.preprocessing import LabelEncoder

from concurrent.futures import ThreadPoolExecutor

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from pprint import pprint
import os

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None
# pd.options.display.max_rows = None  # uncomment to lift the row limit as well

# Identifier for this experiment (not referenced by the visible cells).
experiment_name = 'pytorch_bert'

In [3]:
# Load the train and test CSVs from the working directory.
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Two separate statements: joining the prints with a comma builds a tuple, which the
# notebook then echoes as a stray `(None, None)` cell result.
print(f'Train shape: {df.shape}')
print(f'Test shape: {df_test.shape}')

Train shape: (19579, 3)
Test shape: (8392, 2)


(None, None)

In [4]:
# Name of the label column; the train/validation split below stratifies on it.
TARGET = 'author'

In [5]:
# Hold out a stratified 10% of the rows as a validation set.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=5)

# The splitter yields five (train, test) pairs of *positional* indices. Only the last
# pair is used (the same one the previous overwrite-in-a-loop ended up with), so the
# frames are built once instead of five times.
# NOTE(review): n_splits=1 would be cheaper but would select different rows.
_, test_indx = list(sss.split(df, df[TARGET]))[-1]

valid_df = df.iloc[test_indx]
# Translate positions to index labels before dropping: `df.drop(test_indx)` treated the
# positions as labels, which is only correct while `df` has a default RangeIndex.
train_df = df.drop(index=df.index[test_indx])

len(valid_df), len(train_df)

(1958, 17621)

In [6]:
%%time

# Set model and tokenizer for finetuning. Ensure num_labels matches the number of labels
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-large-uncased')
model = AutoModelForSequenceClassification.from_pretrained('google-bert/bert-large-uncased', num_labels=3)

Some weights of the model checkpoint at google-bert/bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-lar

CPU times: total: 3.98 s
Wall time: 10.9 s


In [7]:
# Rebind `df` to a copy of the validation split so the cells below, which all read `df`,
# work on the held-out rows without further code changes.
# NOTE(review): after this cell the full training frame is no longer reachable as `df`;
# cells above that expect the full data need train.csv reloaded before being re-run.

df = valid_df.copy()

In [8]:
# Learn the author -> integer mapping, then store the integer codes next to the text.
label_encoder = LabelEncoder().fit(df['author'])
df['encoded_labels'] = label_encoder.transform(df['author'])

# Show one random row as a spot check of the new column.
df.sample()

Unnamed: 0,id,text,author,encoded_labels
12281,id14874,But the memory of past sorrow is it not presen...,EAP,0


In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
from tqdm import tqdm

class CustomDataset(Dataset):
    """Map-style dataset that tokenises one DataFrame row per item.

    Args:
        data: DataFrame with a ``text`` column and an ``encoded_labels`` column.
        tokenizer: Callable applied to each text with ``return_tensors='pt'``. Its
            result must provide ``input_ids`` and ``attention_mask`` tensors whose
            leading (batch) dimension has size 1.
        max_len: Length each example is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenise the row at position ``idx``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (batch dimension removed)
            and ``labels`` as a ``torch.long`` scalar tensor.
        """
        row = self.data.iloc[idx]
        # Use the tokenizer passed to the constructor. The previous code called a
        # module-level `tokenizer`, silently ignoring the argument and failing with
        # NameError whenever no such global existed.
        encoding = self.tokenizer(
            row['text'],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),  # Remove batch dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(row['encoded_labels'], dtype=torch.long),
        }


# One seed shared by the dataset split and the Trainer, so a fresh run reproduces the
# same train/validation partition as well as the same training randomness.
SEED = 5

dataset = CustomDataset(df, tokenizer, max_len=512)

# 90/10 train/validation split. random_split draws from the global RNG unless a
# generator is supplied, and it runs before the Trainer is built, so without the
# explicit generator the partition changed on every kernel restart.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)

# Training arguments
# NOTE(review): learning_rate=3e-4 is roughly an order of magnitude above the 2e-5..5e-5
# range usually used to fine-tune BERT — confirm this is intentional.
# NOTE(review): with eval_strategy commented out, eval_dataset is presumably never
# evaluated during training — confirm.
# NOTE(review): dataloader_num_workers=4 requires worker processes to be able to load
# CustomDataset; a notebook-defined class may not be picklable on spawn-based
# platforms — verify if training stalls at step 0.
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    # eval_strategy='epoch',
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # warmup_steps=500,
    # weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=3e-4,
    save_steps=5000,
    dataloader_num_workers=4,
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
trainer.train()

***** Running training *****
  Num examples = 1762
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 84
  Number of trainable parameters = 335144963


  0%|          | 0/84 [00:00<?, ?it/s]

In [None]:
# Save the fine-tuned model and its tokenizer for later use (rename the directory as appropriate).
# Both are written to the same directory, which the next cell reloads from.

model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

In [None]:
# Use the pretrained model for embedding the training and test data

%%time

tokenizer = AutoTokenizer.from_pretrained('./trained_model')
model = AutoModel.from_pretrained('./trained_model')

In [None]:
def get_bert_embeddings(sentences, batch_size=64):
    """Embed each sentence as the first-token vector of the model's last hidden state.

    Uses the notebook-level ``model`` and ``tokenizer`` globals.

    Args:
        sentences: Iterable of strings to embed. It is materialised into a list
            first, so a list, tuple or pandas Series are all accepted.
        batch_size: Number of sentences tokenised and passed through the model
            per forward pass; lower it if memory is tight. Defaults to 64, the
            value that was previously hard-coded.

    Returns:
        A 2-D NumPy array with one row per sentence, in input order. For empty
        input the array has shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    sentences = list(sentences)
    model.eval()  # Put the model in evaluation mode

    if not sentences:
        # np.concatenate raises on an empty list, so return an explicitly empty matrix.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    # Wrap the range generator with tqdm for a progress bar
    for start in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        # Keep only position 0 of every sequence: one vector per sentence.
        embeddings.append(outputs.last_hidden_state[:, 0, :].detach().numpy())

    # Concatenate all batch embeddings
    return np.concatenate(embeddings, axis=0)

In [None]:
# Convert the text column into a plain Python list.
# NOTE(review): `df` was rebound to the validation split in an earlier cell, so despite
# the `_train` suffix these are presumably the validation texts — confirm the naming.

documents_train = df['text'].tolist()

In [None]:
%%time

bert_embeddings = get_bert_embeddings(documents_train)
bert_df_train = pd.DataFrame(bert_embeddings)
bert_df_train.columns = ['bert_' + str(col) for col in bert_df_train.columns]
bert_df_train.head()

In [None]:
# Save the BERT embeddings to a CSV (without the index column) so they can be used as training data.

bert_df_train.to_csv('pytorch_bert_large_uncased_finetuned_valid.csv', index=False)