In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, TFAutoModel
from tqdm import tqdm




In [2]:
# Read the training and test CSV files into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Display (rows, columns) for both frames.
(train.shape, test.shape)

((19579, 3), (8392, 2))

In [3]:
# Name of the column in `train` that holds the class label to predict.
TARGET = 'author'

In [5]:
# Get the number of unique labels in the training data target.
# This value is later passed as `num_labels` when the classification model is created.

train_num_labels = train[TARGET].nunique()
train_num_labels

3

In [None]:
%%time

# Set model and tokenizer for finetuning
# Ensure num_lables matches the number of labels

tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-large-uncased', fast_tokenize=True)
model = AutoModelForSequenceClassification.from_pretrained('google-bert/bert-large-uncased', num_labels=train_num_labels)

In [None]:
def count_tokens(text):
    """Return how many tokens the module-level `tokenizer` splits `text` into.

    NOTE(review): `tokenizer.tokenize` presumably does not add special tokens
    (e.g. [CLS]/[SEP]), so the encoded length may be slightly larger — confirm.
    """
    return len(tokenizer.tokenize(text))

In [None]:
%%time

# Count the number of tokens in each row and make it its own feature (to be deleted)
train['token_length'] = train['title'].apply(count_tokens)
test['token_length'] = test['title'].apply(count_tokens)

In [None]:
# Find the row with the biggest token length in train
train.sort_values('token_length', ascending=False).iloc[:1]

In [None]:
# Find the row with the biggest token length in test
test.sort_values('token_length', ascending=False).iloc[:1]

In [10]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one DataFrame row per item for sequence classification.

    Args:
        data: pandas DataFrame holding the text and label columns.
        tokenizer: callable tokenizer (Hugging Face style) used to encode each text.
        max_len: fixed sequence length; every item is padded and truncated to it.
        text_col: name of the column containing the raw text (default 'title').
        label_col: name of the column containing the integer class label (default 'vps').
    """

    def __init__(self, data, tokenizer, max_len=512, text_col='title', label_col='vps'):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len
        self.text_col = text_col
        self.label_col = label_col

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode row `idx` and return a dict with input_ids, attention_mask and labels."""
        row = self.data.iloc[idx]  # positional lookup, done once for both columns
        # Use the tokenizer handed to this dataset rather than a module-level global.
        # truncation=True guarantees every item has exactly `max_len` positions, so
        # items can always be stacked into a batch even if a text is unexpectedly long.
        encoding = self.tokenizer(
            row[self.text_col],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),  # drop the batch dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(row[self.label_col], dtype=torch.long),
        }

# Build the full dataset from `train`, then hold out 10% of it for validation.
# max_len=55: the longest example was measured at 46 tokens, leaving roughly 10 positions of headroom.
# NOTE(review): CustomDataset reads the 'title' and 'vps' columns here, while the rest of this
# notebook uses 'text' and TARGET — confirm `train` really has those columns.
dataset = CustomDataset(train, tokenizer, max_len=55)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split explicitly (same value as the training seed) so the
# train/validation partition is reproducible across kernel restarts.
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=torch.Generator().manual_seed(5)
)

# Hyper-parameters and bookkeeping settings handed to the Trainer
training_args = TrainingArguments(
    # output and logging locations
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # schedule
    num_train_epochs=3,
    warmup_steps=500,
    # optimisation
    # NOTE(review): 3e-4 looks high for fine-tuning a BERT-sized model — confirm this is intended.
    learning_rate=3e-4,
    weight_decay=0.01,
    # per-device batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # gradient checkpointing enabled
    gradient_checkpointing=True,
    seed=5,
)


# Hand the model, the arguments and both splits to a Trainer, then run training.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)
trainer.train()

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory for further use
# (rename the directory as appropriate).
# Loss went down from 7.4 to 0.5.

model.save_pretrained('./bert-large-uncased_hair_trained_model')
tokenizer.save_pretrained('./bert-large-uncased_hair_trained_model')

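- Use the new fine-tuned model for embeddings (note: the next cell currently loads the baseline `google-bert/bert-large-uncased` checkpoint; the lines that load the fine-tuned model are commented out)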

In [6]:
%%time

# Load the tokenizer and encoder used for embedding the training and test data.

# Fine-tuned variant (currently disabled).
# NOTE(review): this path differs from the directory the fine-tuned model is saved to
# ('./bert-large-uncased_hair_trained_model') — confirm which one is correct before enabling.
# tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased_trained_model')
# model = AutoModel.from_pretrained('bert-large-uncased_trained_model')

# Baseline: the stock pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-large-uncased')
model = AutoModel.from_pretrained('google-bert/bert-large-uncased')

Some weights of the model checkpoint at google-bert/bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU times: total: 3.7 s
Wall time: 8.01 s


In [7]:
def get_bert_embeddings(sentences, batch_size=64):
    """Embed each sentence as the final-layer hidden state of its first token.

    Uses the module-level `tokenizer` and `model`. Sentences are encoded in
    batches (padded to the longest item in the batch, truncated by the
    tokenizer), run through the model without gradient tracking, and the
    hidden state at position 0 of `last_hidden_state` is kept per sentence
    (the [CLS] position for BERT-style tokenizers).

    Args:
        sentences: sequence of strings to embed.
        batch_size: number of sentences per forward pass; lower it if memory is tight.

    Returns:
        numpy array with one row per sentence, in input order. An empty input
        yields an array with zero rows.
    """
    model.eval()  # put the model in evaluation mode
    sentences = list(sentences)  # accept any iterable of strings, not only lists
    # Run inputs on whatever device the model already lives on (CPU or GPU).
    device = next(model.parameters()).device

    if not sentences:
        # np.concatenate raises on an empty list, so return an empty (0, hidden) array instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    # tqdm wraps the batch start offsets to show a progress bar.
    for start in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, padding=True,
                           truncation=True,
                           return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Keep position 0 of every sequence; move to CPU before converting to numpy.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    # Stack all batch results into one (num_sentences, hidden) array.
    return np.concatenate(embeddings, axis=0)

In [8]:
# Pull the 'text' column out of each frame as a plain Python list

documents_train, documents_test = (frame['text'].tolist() for frame in (train, test))

In [9]:
%%time

bert_embeddings = get_bert_embeddings(documents_train)
bert_df_train = pd.DataFrame(bert_embeddings)
bert_df_train.columns = ['bert_' + str(col) for col in bert_df_train.columns]
bert_df_train.head()

Processing batches: 100%|██████████| 306/306 [4:51:56<00:00, 57.25s/it]   


CPU times: total: 12h 7min 35s
Wall time: 4h 52min 3s


Unnamed: 0,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,bert_9,...,bert_1014,bert_1015,bert_1016,bert_1017,bert_1018,bert_1019,bert_1020,bert_1021,bert_1022,bert_1023
0,-0.622036,-0.783141,-0.768789,-0.661886,0.028125,0.504465,-0.207343,-0.109943,-0.072224,0.794893,...,0.029174,-0.245753,-0.150378,0.802312,0.281644,0.201897,0.260581,-1.174859,0.127157,-0.27381
1,-0.701897,0.374661,-0.704999,-0.269579,0.633197,0.398946,0.595634,0.220773,0.074318,0.455393,...,0.075826,-0.567909,0.174853,-0.206451,-0.00113,0.299368,-0.053149,-0.23563,0.489391,-0.722882
2,-0.066654,-0.447025,-0.446701,0.109352,0.140808,0.046788,-0.435363,0.35756,0.830479,0.860996,...,0.875746,-0.17748,-0.641226,0.650413,0.383797,0.605456,-0.196379,-0.515951,0.297116,0.061242
3,-0.102057,-0.651902,-0.687253,0.218423,0.070657,0.326936,0.12001,0.004033,0.599281,-0.026284,...,0.326366,-0.657595,-0.287512,0.891334,0.251273,0.203012,-0.006166,-0.761718,-0.435652,0.204584
4,-0.723254,-0.508487,-0.519215,0.261606,0.076165,0.418882,-0.252534,0.405623,0.65315,0.391357,...,0.223673,-0.098319,-0.285708,0.296431,0.532552,0.990138,-0.120088,-1.080273,0.182172,0.078177


In [10]:
# Write the training embeddings to CSV (row index omitted).
bert_df_train.to_csv('bert_large_uncased_baseline_train.csv', index=False)

In [11]:
%%time
# Vectorize the test data

bert_embeddings = get_bert_embeddings(documents_test)
bert_df_test = pd.DataFrame(bert_embeddings)
bert_df_test.columns = ['bert_' + str(col) for col in bert_df_test.columns]
bert_df_test.head()

Processing batches: 100%|██████████| 132/132 [1:58:28<00:00, 53.85s/it]   


CPU times: total: 5h 25min 29s
Wall time: 1h 58min 28s


Unnamed: 0,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,bert_9,...,bert_1014,bert_1015,bert_1016,bert_1017,bert_1018,bert_1019,bert_1020,bert_1021,bert_1022,bert_1023
0,-0.237395,-1.099664,-0.073115,0.011293,0.217628,0.800265,-0.337267,-0.240212,-0.078137,0.26603,...,0.324019,-0.386438,-0.742862,1.121769,0.309127,0.823491,0.390766,-0.884747,-0.377234,-0.214326
1,-0.966147,-0.420513,-0.857052,0.046974,0.568425,0.946378,-0.172185,-0.506385,0.672026,0.417897,...,0.68517,-0.477663,-0.984418,1.594175,-0.076365,1.21348,0.119592,-1.103788,-0.447266,0.409644
2,-0.281656,-0.408963,-0.722742,-0.256569,0.387469,0.430409,-0.060564,0.197533,0.254076,0.491151,...,0.287451,-0.456337,-1.118185,0.344271,-0.04469,0.719634,-0.304294,-1.084771,-0.141309,-0.464817
3,-1.034117,-0.712914,-0.519002,0.088926,0.187793,0.306727,0.189207,0.23895,0.309982,0.56378,...,-0.02342,0.405448,-0.544392,1.045816,0.059901,0.686347,-0.31136,-0.188767,0.372833,-0.63484
4,-0.446976,-0.504851,-0.614594,-0.866996,-0.157824,0.780376,0.328414,0.241635,0.21727,0.800562,...,0.184186,-0.050992,-0.419704,0.29503,0.179823,0.512639,-0.280976,-0.473329,-0.14618,-0.09458


In [12]:
# Write the test embeddings to CSV (row index omitted).
bert_df_test.to_csv('bert_large_uncased_baseline_test.csv', index=False)