In [2]:
!pip install torch
!pip install transformers
!pip install pandas
!pip install scikit-learn



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

In [5]:
# Load the combined corpus from the current working directory and preview
# the first rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a
# row index saved into the CSV -- consider `index_col=0`; confirm against the file.
data = pd.read_csv('./combined_data.csv')
data.head()

Unnamed: 0,Text,Label,Model,Original dataset,Row in original dataset
0,While driverless cars present many promising b...,Machine,darragh_claude_v7,essays,13355
1,Homework Clubs: The Key to Unlocking Academic ...,Machine,llama2_chat,essays,7249
2,"""The legalization of marijuana has been a cont...",Machine,falcon_180b_v1,essays,2603
3,Taking the opportunity to learn new things can...,Machine,mistral7binstruct_v1,essays,3993
4,Working with a partner is an effective way fo...,Machine,mistral7binstruct_v2,essays,3773


In [6]:
# Encode the target as integers: 'Human' -> 0, every other value -> 1.
# The numeric-dtype guard makes this cell idempotent: without it, re-running
# the cell on already-encoded labels would map every row to 1, because the
# integer 0 is not equal to the string 'Human'.
if not pd.api.types.is_numeric_dtype(data['Label']):
    data['Label'] = (data['Label'] != 'Human').astype('int64')

In [10]:
# Draw a random 200-row subset of `data`; the fixed seed (42) makes the same
# rows come back on every run. Raises if `data` has fewer than 200 rows,
# since sampling is without replacement by default.
sampled_data = data.sample(n=200, random_state=42)

In [11]:
# Split the sampled rows into training (70%) and evaluation (remaining 30%)
# parts. The seed is fixed so the split is the same on every run.
sample_texts = sampled_data['Text']
sample_labels = sampled_data['Label']
train_X, test_X, train_Y, test_Y = train_test_split(
    sample_texts, sample_labels, train_size=0.7, random_state=42
)

# Report the size of each part as a quick sanity check
print(f"Train X shape: {train_X.shape}")
print(f"Test X shape: {test_X.shape}")
print(f"Train Y shape: {train_Y.shape}")
print(f"Test Y shape: {test_Y.shape}")

Train X shape: (140,)
Test X shape: (60,)
Train Y shape: (140,)
Test Y shape: (60,)


In [12]:
# Load the tokenizer that belongs to the "bert-base-cased" checkpoint
# (downloaded on first use, as the progress output below shows).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [13]:
# Tokenize both splits with the same padding/truncation settings so the
# training and evaluation inputs are prepared identically.
tokenizer_options = {"padding": True, "truncation": True}
train_tokens = tokenizer(list(train_X), **tokenizer_options)
test_tokens = tokenizer(list(test_X), **tokenizer_options)

In [14]:
# List the fields produced by the tokenizer (these become the per-sample
# tensors served by the dataset class defined later in the notebook).
train_tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [15]:
# Inspect the first training example: the raw token ids, followed by the
# text those ids decode back to.
first_input_ids = train_tokens['input_ids'][0]
print(first_input_ids)
print(tokenizer.decode(first_input_ids))

[101, 6510, 1431, 1129, 1682, 1106, 1902, 1147, 1319, 2247, 1933, 1272, 117, 1122, 1660, 1172, 170, 2640, 1106, 1138, 1380, 1106, 1440, 1977, 1106, 1107, 1103, 2247, 119, 2907, 1122, 1156, 1494, 1172, 1243, 170, 2640, 1120, 13992, 1147, 1319, 17980, 119, 6510, 1156, 1129, 1682, 1106, 1138, 1115, 2296, 1115, 1152, 1132, 1682, 1106, 1321, 1113, 1103, 4812, 119, 1220, 1156, 1129, 1682, 1106, 3858, 1167, 1191, 1152, 1169, 2011, 1122, 1155, 1113, 1147, 1319, 119, 1370, 1538, 1651, 1152, 1274, 1204, 1541, 1440, 1977, 1106, 1833, 1933, 1133, 1191, 1128, 1660, 1172, 170, 2640, 1106, 1202, 1122, 1113, 1147, 1319, 1152, 1547, 1243, 1154, 1122, 119, 1752, 117, 13795, 1240, 1319, 1933, 1112, 170, 2377, 1122, 1660, 1128, 1380, 1106, 1440, 1977, 1106, 1107, 1103, 2247, 119, 2082, 1651, 2215, 1313, 1111, 1103, 2247, 1114, 1720, 1106, 1202, 119, 1573, 117, 1191, 1128, 1660, 170, 2377, 170, 2640, 1106, 1902, 1147, 1319, 1933, 1111, 1103, 2247, 1122, 1660, 1172, 170, 2640, 1106, 1202, 1380, 119, 6510, 1

In [16]:
class TokenData(Dataset):
    """Map-style dataset that serves tokenized examples together with their labels.

    Each item is a dict holding one tensor per tokenizer field (for example
    ``input_ids`` and ``attention_mask``) plus a ``labels`` tensor.

    Args:
        train: When ``tokens`` and ``labels`` are not supplied, selects which
            notebook-level split to wrap: ``True`` uses ``train_X`` /
            ``train_tokens`` / ``train_Y``; ``False`` uses the ``test_*``
            counterparts.
        texts: Optional raw texts, stored on ``text_data`` for reference only.
            Ignored when falling back to the notebook-level split.
        tokens: Optional mapping of field name -> per-example sequences.
        labels: Optional iterable with one class id per example.

    Raises:
        ValueError: If only one of ``tokens`` / ``labels`` is supplied.
    """

    def __init__(self, train = False, texts = None, tokens = None, labels = None):
        if tokens is None and labels is None:
            # Original behaviour: wrap the split prepared by earlier cells.
            if train:
                texts, tokens, labels = train_X, train_tokens, train_Y
            else:
                texts, tokens, labels = test_X, test_tokens, test_Y
        elif tokens is None or labels is None:
            raise ValueError("tokens and labels must be provided together")

        self.text_data = texts
        self.tokens = tokens
        # Materialise as a list so items can be fetched by position even when
        # the labels arrive as a pandas Series with a non-contiguous index.
        self.labels = list(labels)

    def __len__(self):
        """Return the number of examples (one label per example)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors, including ``labels``."""
        sample = {field: torch.tensor(values[idx]) for field, values in self.tokens.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [18]:
# Number of examples per optimisation / evaluation step.
batch_size = 2

# Training batches are reshuffled on every pass over the data.
train_dataset = TokenData(train = True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Evaluation keeps a fixed order: shuffling does not change the final metrics,
# but it makes the per-batch evaluation logs differ from run to run.
test_dataset = TokenData(train = False)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [19]:
# Loss: cross-entropy computed from the classifier logits and integer labels
loss_fn = torch.nn.CrossEntropyLoss()

# Model: pre-trained BERT checkpoint wrapped with a sequence-classification head
bert_model = BertForSequenceClassification.from_pretrained('bert-base-cased')

# Optimizer: AdamW over every model parameter
optimizer = AdamW(bert_model.parameters(), lr=1e-5)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [20]:
# Number of full passes over the training data
num_epochs = 3

# Pick the best available backend: CUDA first, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Move the model's weights onto the selected device
bert_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Fine-tune for `num_epochs` passes; after each pass, evaluate on the test split.
#
# Logging notes:
# * `loss_fn` is CrossEntropyLoss with its default reduction, so `loss.item()`
#   is already the mean loss over the batch and must not be divided by
#   `batch_size` again.
# * Epoch-level numbers are sample-weighted means over the whole epoch, not the
#   value of whichever batch happened to come last.
# * Accuracy is divided by the number of samples actually seen, so a final
#   batch smaller than `batch_size` does not skew it.
for epoch in range(num_epochs):
    print("Epoch: ",(epoch + 1))

    # TRAINING BLOCK STARTS
    bert_model.train()
    train_loss_sum = 0.0  # sum of per-sample losses over the epoch
    train_seen = 0        # number of training samples processed so far
    train_last_loss = float("nan")  # stays NaN only if the loader is empty
    for i, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Setting the gradients to zero
        optimizer.zero_grad()

        # Passing the data to the model
        outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])

        # The logits will be used for measuring the loss
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        # Calculating the gradient for the loss function
        loss.backward()

        # Optimizing the parameters of the bert model
        optimizer.step()

        # Bookkeeping for logging: mean loss of this batch plus running totals
        samples_in_batch = batch['labels'].size(0)
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss
        train_loss_sum += train_batch_loss * samples_in_batch
        train_seen += samples_in_batch

        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))

    # Logging epoch-wise training loss (mean over all samples in the epoch)
    train_epoch_loss = train_loss_sum / max(train_seen, 1)
    print(f"\nTraining epoch {epoch + 1} loss: ", train_epoch_loss)
    # TRAINING BLOCK ENDS

    # TESTING BLOCK STARTS
    bert_model.eval()
    correct = 0
    # Never filled by this loop; kept so cells that reference the name still work.
    test_pred = []
    test_loss_sum = 0.0  # sum of per-sample losses over the test split
    test_seen = 0        # number of test samples processed so far
    test_last_loss = float("nan")  # stays NaN only if the loader is empty
    for i, batch in enumerate(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # We don't need gradients for testing
        with torch.no_grad():
            outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])

            # Logits act as predictions
            logits = outputs.logits

            # Mean loss of this batch, from the logits and labels
            loss = loss_fn(logits, batch['labels'])

        samples_in_batch = batch['labels'].size(0)
        test_batch_loss = loss.item()
        test_last_loss = test_batch_loss
        test_loss_sum += test_batch_loss * samples_in_batch
        test_seen += samples_in_batch
        print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))

        # Comparing the predicted target with the labels in the batch
        correct += (logits.argmax(1) == batch['labels']).sum().item()
        print("Testing accuracy: ", correct / test_seen)

    # Logging epoch-wise test metrics (means over all evaluated samples)
    test_epoch_loss = test_loss_sum / max(test_seen, 1)
    test_epoch_accuracy = correct / max(test_seen, 1)
    print(f"\nTesting epoch {epoch + 1} loss: ", test_epoch_loss)
    print(f"Testing epoch {epoch + 1} accuracy: ", test_epoch_accuracy)
    # TESTING BLOCK ENDS

Epoch:  1
