In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm

In [15]:
data = pd.read_csv('../raw data/combined_data.csv')
data['Label'] = data['Label'].apply(lambda x: 0 if x == 'Human' else 1)
data.head()

Unnamed: 0,Text,Label,Original dataset,Row in original dataset
0,The idea of graduating high school in three ye...,1,essays,26613
1,"Hey, I'm so excited to write this essay about ...",1,essays,26326
2,Introduction\n\nSelf-reliance is a concept tha...,1,essays,30579
3,"Sure, here's my attempt at writing an essay as...",1,essays,33547
4,The legalization of marijuana is a highly deba...,1,essays,33768


In [4]:
sampled_data = data.sample(n=10000, random_state=42)

In [5]:
train_X, test_X, train_Y, test_Y = train_test_split(
    sampled_data['Text'], 
    sampled_data['Label'], 
    train_size=0.7, 
    random_state=42
)

# Print the shapes to verify
print(f"Train X shape: {train_X.shape}")
print(f"Test X shape: {test_X.shape}")
print(f"Train Y shape: {train_Y.shape}")
print(f"Test Y shape: {test_Y.shape}")

Train X shape: (7000,)
Test X shape: (3000,)
Train Y shape: (7000,)
Test Y shape: (3000,)


In [6]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")



In [7]:
train_tokens = tokenizer(list(train_X), padding = True, truncation=True)
test_tokens = tokenizer(list(test_X), padding = True, truncation=True)

In [8]:
train_tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [9]:
print(train_tokens['input_ids'][0])
print(tokenizer.decode(train_tokens['input_ids'][0]))

[101, 16501, 4386, 5132, 113, 107, 1632, 1493, 107, 114, 1110, 1126, 8256, 1705, 1104, 1415, 117, 186, 18413, 20910, 14604, 1233, 1610, 2605, 12198, 2285, 23570, 1115, 2077, 1219, 1103, 19605, 16477, 119, 1109, 1148, 13104, 1104, 1142, 1372, 1127, 1276, 1107, 6411, 1118, 2594, 1987, 10223, 1200, 26673, 117, 1105, 1152, 1127, 1549, 1147, 1271, 1118, 152, 1582, 5213, 1233, 1889, 11463, 1107, 5825, 119, 16501, 4386, 5132, 1127, 1103, 2026, 1657, 3551, 1106, 1518, 2647, 1103, 2746, 117, 1114, 1199, 9985, 16584, 1146, 1106, 1160, 2937, 10992, 1105, 2288, 1120, 1166, 1565, 2759, 3543, 1120, 1103, 2342, 119, 1220, 1127, 18532, 1118, 4672, 24387, 24891, 4832, 1105, 1493, 1115, 1127, 1146, 1106, 1160, 1105, 170, 1544, 2759, 1263, 117, 1543, 1172, 1621, 1103, 2026, 1105, 11112, 1657, 3551, 1104, 1147, 1159, 119, 16501, 4386, 5132, 7672, 1113, 1415, 117, 22245, 15435, 13149, 23570, 117, 1105, 1112, 1216, 1132, 4485, 1112, 6631, 4793, 15334, 1107, 1103, 16969, 2094, 5127, 119, 102, 0, 0, 0, 0, 0, 

In [10]:
class TokenData(Dataset):
    def __init__(self, train = False):
        if train:
            self.text_data = train_X
            self.tokens = train_tokens
            self.labels = list(train_Y)
        else:
            self.text_data = test_X
            self.tokens = test_tokens
            self.labels = list(test_Y)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [11]:
batch_size = 4
train_dataset = TokenData(train = True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = TokenData(train = False)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [12]:
bert_model = BertForSequenceClassification.from_pretrained('bert-base-cased') # Pre-trained model
optimizer = AdamW(bert_model.parameters(), lr=1e-5) # Optimization function
loss_fn = torch.nn.CrossEntropyLoss() # Loss function

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
num_epochs = 1
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
bert_model.to(device) # Transfer model to GPU if available

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [14]:
for epoch in tqdm(range(num_epochs)):
    print("Epoch: ",(epoch + 1))
    # TRAINING BLOCK STARTS
    bert_model.train()
    for i,batch in enumerate(train_loader):    
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Setting the gradients to zero
        optimizer.zero_grad()
        
        # Passing the data to the model
        outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        
        # The logits will be used for measuring the loss
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        # Calculating the gradient for the loss function
        loss.backward()
        
        # Optimizing the parameters of the bert model
        optimizer.step()

        # Calculating the running loss for logging purposes
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss / batch_size

        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))
    # Logging epoch-wise training loss
    print(f"\nTraining epoch {epoch + 1} loss: ",train_last_loss)
    # TRAINING BLOCK ENDS 

    # TESTING BLOCK STARTS
    bert_model.eval()
    correct = 0
    test_pred = []
    for i, batch in enumerate(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # We don't need gradients for testing
        with torch.no_grad():
            outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        
        # Logits act as predictions
        logits = outputs.logits
        
        # Calculating total batch loss using the logits and labels
        loss = loss_fn(logits, batch['labels'])
        test_batch_loss = loss.item()
        
        # Calculating the mean batch loss
        test_last_loss = test_batch_loss / batch_size
        print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))
        
        # Comparing the predicted target with the labels in the batch
        correct += (logits.argmax(1) == batch['labels']).sum().item()
        print("Testing accuracy: ",correct/((i + 1) * batch_size))
    
    print(f"\nTesting epoch {epoch + 1} last loss: ",test_last_loss)
    # TESTING BLOCK ENDS

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:  1
Training batch 1 last loss: 0.18734529614448547
Training batch 2 last loss: 0.1866150051355362
Training batch 3 last loss: 0.14533594250679016
Training batch 4 last loss: 0.147320955991745
Training batch 5 last loss: 0.18233612179756165
Training batch 6 last loss: 0.187950000166893
Training batch 7 last loss: 0.15937501192092896
Training batch 8 last loss: 0.17763705551624298
Training batch 9 last loss: 0.16404762864112854
Training batch 10 last loss: 0.16660204529762268
Training batch 11 last loss: 0.1807909607887268
Training batch 12 last loss: 0.19744455814361572
Training batch 13 last loss: 0.15474480390548706
Training batch 14 last loss: 0.16219180822372437
Training batch 15 last loss: 0.13685107231140137
Training batch 16 last loss: 0.20292365550994873
Training batch 17 last loss: 0.20930646359920502
Training batch 18 last loss: 0.16511917114257812
Training batch 19 last loss: 0.19898006319999695
Training batch 20 last loss: 0.188750222325325
Training batch 21 last loss

100%|██████████| 1/1 [08:44<00:00, 524.93s/it]

Testing batch 750 loss: 0.02818208932876587
Testing accuracy:  0.803

Testing epoch 1 last loss:  0.02818208932876587





In [None]:
torch.save(bert_model.state_dict(), 'bert.pt')
