In [1]:
# pip install evaluate

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
print("device: ", device)

device:  cpu


#### Load the Dataset:

In [5]:
hmn_df = pd.read_csv("../Clean_data/human_wrttn_text.csv")
ai_df = pd.read_csv("../Clean_data/ai_gen_text.csv")

#### Concatenate the Datasets and Change the Feature Type of the Labela:

In [6]:
from datasets import Dataset, ClassLabel, Features, Value
dataset_df = pd.concat([hmn_df, ai_df], ignore_index=True)
print("daatset_df:\n", dataset_df.head())
features_chng = Features({"Text": Value("string"),"label": ClassLabel(num_classes=2,names=[0,1])})
dataset = Dataset.from_pandas(dataset_df, features=features_chng)

daatset_df:
                                                 Text  label
0  12 Years a Slave An Analysis of the Film Essay...      0
1  20 Social Media Post Ideas to Radically Simpli...      0
2  2022 Russian Invasion of Ukraine in Global Med...      0
3  533 US 27 2001 Kyllo v United States The Use o...      0
4  A Charles Schwab Corporation Case Essay\n\nCha...      0


In [7]:
type(dataset[0]["label"])
dataset.features

{'Text': Value(dtype='string', id=None),
 'label': ClassLabel(names=[0, 1], id=None)}

In [8]:
# dataset.class_encode_column('label')
# dataset

#### Split the Datasets into Train and Test Dataset:

In [9]:
train_points = 10
val_points = int(int(train_points/0.8) * 0.2)
print("train_points: ", train_points, "\nval_points: ", val_points)
dataset_dict = dataset.train_test_split(test_size=0.2, stratify_by_column="label")
small_train_dataset = dataset_dict["train"].shuffle(seed=40).select(range(train_points))
small_test_dataset = dataset_dict["test"].shuffle(seed=40).select(range(val_points))

train_points:  10 
val_points:  2


In [37]:
small_train_dataset["Text"]

['Air France Company’s Costs and Revenue Structure Report\n\nTable of Contents\n 1 Introduction\n 2 Direct Operating Costs\n 3 Indirect Airline Operating Costs\n 4 Revenues\n 5 Conclusion\n 6 References\n\nIntroduction\n\nFor a long period the airline business has been categorized as ‘lucrative’ The notion that the airline business is lucrative is partly perpetuated by the fact that it is the most costly form of transport Also the huge capital base of the airline business makes it one of the most stable types of business For instance the assets of an airline at any given time include planes hangers and cargo bays which translate into a lot of capital base for an airline\n\nConsequently various airlines utilize a set of factors when they are coming up with different costs for their products and services However sources of revenue for an airline remain the most prominent determinant of cost This is also the reason why there are big disparities in costs between different airlines This rep

In [38]:
small_train_dataset["label"]

[0, 0, 0, 0, 0, 0, 1, 1, 1, 0]

In [10]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['Text', 'label'],
        num_rows: 205268
    })
    test: Dataset({
        features: ['Text', 'label'],
        num_rows: 51318
    })
})

#### Tokenize the Datasets:

In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")



In [12]:
def tokenize_function(examples):
    return tokenizer(examples["Text"], padding="max_length", truncation=True)

In [13]:
tokenized_sml_trn_ds = small_train_dataset.map(tokenize_function, batched=True)
tokenized_sml_tst_ds = small_test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map: 100%|██████████| 10/10 [00:00<00:00, 191.44 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 133.32 examples/s]


In [14]:
tokenizer.decode(tokenized_sml_trn_ds["input_ids"][0])

'[CLS] air france company ’ s costs and revenue structure report table of contents 1 introduction 2 direct operating costs 3 indirect airline operating costs 4 revenues 5 conclusion 6 references introduction for a long period the airline business has been categorized as ‘ lucrative ’ the notion that the airline business is lucrative is partly perpetuated by the fact that it is the most costly form of transport also the huge capital base of the airline business makes it one of the most stable types of business for instance the assets of an airline at any given time include planes hangers and cargo bays which translate into a lot of capital base for an airline consequently various airlines utilize a set of factors when they are coming up with different costs for their products and services however sources of revenue for an airline remain the most prominent determinant of cost this is also the reason why there are big disparities in costs between different airlines this report seeks to id

In [15]:
# [len(tokenized_sml_trn_ds["input_ids"][i]) for i in range(1000)]

### Train the Model using Native PyTorch:

In [16]:
# Remove the columns named "Text" since the model won't accept the text strings.
tokenized_sml_trn_ds = tokenized_sml_trn_ds.remove_columns(["Text"])
tokenized_sml_tst_ds = tokenized_sml_tst_ds.remove_columns(["Text"])

In [17]:
tokenized_sml_trn_ds
tokenized_sml_tst_ds

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2
})

In [18]:
# Rename the column name "label" to "labels" as this is expected for the model input.
tokenized_sml_trn_ds = tokenized_sml_trn_ds.rename_column("label","labels")
tokenized_sml_tst_ds = tokenized_sml_tst_ds.rename_column("label","labels")

In [19]:
tokenized_sml_trn_ds

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10
})

In [20]:
tokenized_sml_tst_ds

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2
})

#### Set the Data Format to Torch:

In [21]:
tokenized_sml_trn_ds.set_format("torch")
tokenized_sml_tst_ds.set_format("torch")

In [22]:
tokenized_sml_trn_ds["input_ids"][0][:5]

tensor([ 101, 2250, 2605, 2194, 1521])

#### Prepare the DataLoader:


In [23]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_sml_trn_ds, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_sml_tst_ds, shuffle=True, batch_size=8)

In [24]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [26]:
from transformers import get_scheduler
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [27]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

#### Training Loop:

In [28]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
model.train()
count = 0
for epoch in range(num_epochs): 
    for batch in train_dataloader:
        count += 1
        batch = {k: v.to(device) for k, v in batch.items()}
        # if epoch == 0 and count == 1:
        #     print("batch: ", batch)  
            # break
        # import pdb; pdb.set_trace() 
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 2/2 [00:06<00:00,  2.93s/it]

In [29]:
batch['token_type_ids'].size()

torch.Size([2, 512])

In [30]:
for key in batch.keys():
    print("key:", key) 

key: labels
key: input_ids
key: token_type_ids
key: attention_mask


### Evaluation Phase:

In [31]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.5}

In [32]:
logits

tensor([[ 0.2564, -0.2105],
        [ 0.9349, -0.1783]])

In [33]:
torch.argmax(logits, dim=-1)

tensor([0, 0])

In [34]:
a = torch.randn(4, 5)
print("a: ", a)
torch.argmax(a, dim=-1)

a:  tensor([[-1.2056,  0.6011, -0.0492,  0.8874,  0.6192],
        [ 0.1468,  0.8607, -0.1618, -1.0110, -0.7793],
        [ 1.3339, -0.4117,  0.6538, -1.8481, -0.6404],
        [ 1.6501, -1.2440,  0.8982, -0.9876,  0.9745]])


tensor([3, 1, 0, 0])

### To-do: Code Verification

In [35]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.7}