In [42]:
import pandas as pd
import numpy as np
import pandas as pd
from transformers import BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch

# Raw labelled comments are read from an Excel workbook in the working directory.
COMMENTS_XLSX = 'comments.xlsx'
df = pd.read_excel(COMMENTS_XLSX)

In [43]:
# Give the two columns of the sheet stable names used by every later cell.
df.columns = ['comment', 'label']

# Clean the frame in one chain:
#   1. drop rows missing a comment or a label *before* the string cast --
#      otherwise a missing comment becomes the literal text "nan" (3 chars,
#      so it survives the length filter) and a missing label cannot be turned
#      into a valid class index when the label tensor is built;
#   2. force every remaining comment to `str` (cells may hold numbers);
#   3. keep only comments of at most 225 characters.
df = (
    df.dropna(subset=['comment', 'label'])
      .assign(comment=lambda frame: frame['comment'].astype(str))
      .loc[lambda frame: frame['comment'].str.len() <= 225]
)

In [56]:
# Fixed seed so the train/validation partition is the same on every run.
SPLIT_SEED = 42
# Upper bound (in tokens) passed to the tokenizer; longer inputs are truncated.
MAX_LENGTH = 225
BATCH_SIZE = 16

# `stratify` keeps the label proportions identical in both partitions, which
# matters because the two classes are not equally frequent.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['comment'],
    df['label'],
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=df['label'],
)

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# padding=True pads each call to its own longest sequence, so the train and
# validation tensors may have different widths; that is fine because they are
# batched by separate loaders.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=MAX_LENGTH)


def _to_tensor_dataset(encodings, labels):
    """Bundle tokenizer output and labels as (input_ids, attention_mask, label) tensors.

    `encodings` is the tokenizer result (indexed by 'input_ids' and
    'attention_mask'); `labels` is the matching pandas Series. Labels are cast
    to int64 because they are used as class indices by the model's loss.
    """
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(labels.values).long(),
    )


train_dataset = _to_tensor_dataset(train_encodings, train_labels)
val_dataset = _to_tensor_dataset(val_encodings, val_labels)

# Only the training data is shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [57]:
# Seed torch's global RNG before the model is built: the new classification
# head is randomly initialised and the shuffling of `train_loader` draws from
# the same generator, so without a seed every run trains a different model.
torch.manual_seed(42)

NUM_EPOCHS = 3
LEARNING_RATE = 2e-5

# Pretrained multilingual BERT encoder with a fresh 2-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

# NOTE(review): the AdamW shipped with transformers is deprecated upstream in
# favour of torch.optim.AdamW (whose default eps / weight_decay differ) --
# migrate deliberately rather than by a blind rename.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        # Passing `labels` makes the model compute the classification loss itself.
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # max(..., 1) guards against an empty loader.
    mean_train_loss = running_loss / max(len(train_loader), 1)

    # ---- validation pass: no gradients, dropout disabled via eval() ----
    model.eval()
    val_predictions, val_targets = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            val_predictions.extend(logits.argmax(dim=-1).tolist())
            val_targets.extend(labels.tolist())

    val_accuracy = accuracy_score(val_targets, val_predictions) if val_targets else float('nan')
    print(
        f'epoch {epoch + 1}/{NUM_EPOCHS}  '
        f'train_loss={mean_train_loss:.4f}  val_accuracy={val_accuracy:.4f}'
    )

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual

In [None]:
# Persist the fine-tuned model and its tokenizer, each to its own directory,
# so both can be reloaded later with `from_pretrained`.
for artifact, target_dir in (
    (model, './comments_reply_model'),
    (tokenizer, './comments_tokenizer'),
):
    artifact.save_pretrained(target_dir)

In [59]:
# Class balance check: number of rows in `df` for each label value.
df['label'].value_counts()

1.0    1307
0.0     579
Name: label, dtype: int64