In [71]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [72]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [73]:
print("device: ", device)

device:  cpu


#### Load the Dataset:

In [74]:
hmn_df = pd.read_csv("../Clean_data/human_wrttn_text.csv")
ai_df = pd.read_csv("../Clean_data/ai_gen_text.csv")

#### Concatenate the Datasets and Change the Feature Type of the Labela:

In [75]:
from datasets import Dataset, ClassLabel, Features, Value
dataset_df = pd.concat([hmn_df, ai_df], ignore_index=True)
print("daatset_df:\n", dataset_df.head())
features_chng = Features({"Text": Value("string"),"label": ClassLabel(num_classes=2,names=[0,1])})
dataset = Dataset.from_pandas(dataset_df, features=features_chng)

daatset_df:
                                                 Text  label
0  12 Years a Slave An Analysis of the Film Essay...      0
1  20 Social Media Post Ideas to Radically Simpli...      0
2  2022 Russian Invasion of Ukraine in Global Med...      0
3  533 US 27 2001 Kyllo v United States The Use o...      0
4  A Charles Schwab Corporation Case Essay\n\nCha...      0


In [76]:
type(dataset[0]["label"])
dataset.features

{'Text': Value(dtype='string', id=None),
 'label': ClassLabel(names=[0, 1], id=None)}

In [77]:
# dataset.class_encode_column('label')
# dataset

#### Split the Datasets into Train and Test Dataset:

In [78]:
dataset_dict = dataset.train_test_split(test_size=0.2, stratify_by_column="label")
small_train_dataset = dataset_dict["train"].shuffle(seed=40).select(range(200))
small_test_dataset = dataset_dict["test"].shuffle(seed=40).select(range(200))

In [79]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['Text', 'label'],
        num_rows: 205268
    })
    test: Dataset({
        features: ['Text', 'label'],
        num_rows: 51318
    })
})

#### Tokenize the Datasets:

In [80]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")



In [81]:
def tokenize_function(examples):
    return tokenizer(examples["Text"], padding="max_length", truncation=True)

In [82]:
tokenized_sml_trn_ds = small_train_dataset.map(tokenize_function, batched=True)
tokenized_sml_tst_ds = small_test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 200/200 [00:00<00:00, 1564.43 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1677.00 examples/s]


In [83]:
tokenizer.decode(tokenized_sml_trn_ds["input_ids"][0])

'[CLS] based on the given information it is not possible to determine which city has fewer people catching acute bronchitis however it is mentioned that acute bronchitis is caused by bacteria and illnesses caused by bacteria can be treated with antibiotics therefore if large city has more people infected by bacteria and fungi it is possible that they have better access to antibiotics and medical treatment which could result in fewer cases of acute bronchitis however this is just a speculation and cannot be confirmed without further information [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [84]:
# [len(tokenized_sml_trn_ds["input_ids"][i]) for i in range(1000)]

### Train the Model using Native PyTorch:

In [85]:
# Remove the columns named "Text" since the model won't accept the text strings.
tokenized_sml_trn_ds = tokenized_sml_trn_ds.remove_columns(["Text"])
tokenized_sml_tst_ds = tokenized_sml_tst_ds.remove_columns(["Text"])

In [86]:
tokenized_sml_trn_ds
tokenized_sml_tst_ds

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

In [87]:
# Rename the column name "label" to "labels" as this is expected for the model input.
tokenized_sml_trn_ds = tokenized_sml_trn_ds.rename_column("label","labels")
tokenized_sml_tst_ds = tokenized_sml_tst_ds.rename_column("label","labels")

In [88]:
tokenized_sml_trn_ds

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

In [89]:
tokenized_sml_tst_ds

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

#### Set the Data Format to Torch:

In [90]:
tokenized_sml_trn_ds.set_format("torch")
tokenized_sml_tst_ds.set_format("torch")

In [91]:
tokenized_sml_trn_ds["input_ids"][0][:5]

tensor([ 101, 2241, 2006, 1996, 2445])

#### Prepare the DataLoader:


In [92]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_sml_trn_ds, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_sml_tst_ds, shuffle=True, batch_size=8)

In [93]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [94]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [95]:
from transformers import get_scheduler
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [96]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

#### Training Loop:

In [97]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
model.train()
count = 0
for epoch in range(num_epochs): 
    for batch in train_dataloader:
        count += 1
        batch = {k: v.to(device) for k, v in batch.items()}
        # if epoch == 0 and count == 1:
        #     print("batch: ", batch)  
            # break
        # import pdb; pdb.set_trace() 
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 13/13 [30:47<00:00, 142.12s/it]


In [43]:
batch['token_type_ids'].size()

torch.Size([8, 512])

In [38]:
for key in batch.keys():
    print("key:", key) 

key: labels
key: input_ids
key: token_type_ids
key: attention_mask
