In [35]:
#!pip install -q transformers
#!pip3 install torch torchvision torchaudio

In [36]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [37]:
# Choose the compute device: use the GPU when PyTorch reports CUDA as
# available, otherwise fall back to the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [38]:
# Report which device string was selected above ('cuda' or 'cpu').
print("device: ", device)

device:  cpu


In [39]:
# Load the two cleaned corpora from ../Clean_data: the human-written texts
# first, then the AI-generated texts. Each CSV is read with pandas defaults.
hmn_df, ai_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Clean_data/human_wrttn_text.csv",
        "../Clean_data/ai_gen_text.csv",
    )
)

In [40]:
# Preview the first rows of the frame loaded from ai_gen_text.csv.
# The recorded output shows label == 1 for the rows displayed.
ai_df.head()
# len(ai_df.index)

Unnamed: 0,Text,label
0,The review is neutral The reviewer did not hav...,1
1,Okay lets solve this math problem together \n\...,1
2,As an AI I understand you are asking for a twe...,1
3,The sentence is acceptable It means that the s...,1
4,The article does not provide the last name of ...,1


In [41]:
# Preview the first rows of the frame loaded from human_wrttn_text.csv.
# The recorded output shows label == 0 for the rows displayed.
hmn_df.head()
# len(hmn_df.index)

Unnamed: 0,Text,label
0,12 Years a Slave An Analysis of the Film Essay...,0
1,20 Social Media Post Ideas to Radically Simpli...,0
2,2022 Russian Invasion of Ukraine in Global Med...,0
3,533 US 27 2001 Kyllo v United States The Use o...,0
4,A Charles Schwab Corporation Case Essay\n\nCha...,0


In [42]:
##Drop the index columns:
# ai_df_nw = ai_df.drop(["Unnamed: 0"], axis = 1)
# hmn_df_nw = hmn_df.drop(["Unnamed: 0"], axis = 1)
## Rename the column names:
# ai_df_nw = ai_df.rename({"output":"Text"}, axis = 1)
# hmn_df_nw = hmn_df.rename({"TEXT":"Text"}, axis = 1)

In [43]:
# ai_df_nw.head()
# hmn_df_nw.head()

In [44]:
# ai_df_nw.to_csv("../Clean_data/ai_gen_text_v3.csv", index = False)
# hmn_df_nw.to_csv("../Clean_data/human_wrttn_text_v3.csv", index = False)

In [45]:
# Training variables.
# MAX_LEN is the fixed token length handed to CustomDataset; the two batch
# sizes feed the train / test DataLoader parameter dicts further down.
MAX_LEN = 200
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 4
# EPOCHS and LEARNING_RATE are not used in the cells shown here; presumably
# they are consumed by the training loop later in the notebook.
EPOCHS, LEARNING_RATE = 1, 1e-5
# Pretrained WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [46]:
# https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb#scrollTo=PkDGqarcPowL
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text row per item.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame exposing a ``Text`` column (raw text) and a ``label`` column
        (numeric class label). Rows are addressed by position, so the frame's
        index does not need to be a default 0..n-1 range.
    tokenizer
        Object providing ``encode_plus(text, text_pair, **kwargs)`` that
        returns a mapping with ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` (e.g. a Hugging Face ``BertTokenizer``).
    max_len : int
        Sequence length requested from the tokenizer; each sample is padded
        or truncated to this length.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.Text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of text rows."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns
        -------
        dict
            ``ids``, ``mask`` and ``token_type_ids`` as ``torch.long`` tensors
            built from the tokenizer output, and ``targets`` as a
            ``torch.float`` scalar tensor holding the row's label.
        """
        # Positional lookup (.iloc): samplers hand out positions 0..len-1,
        # which only coincide with index *labels* for a default RangeIndex.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        comment_text = " ".join(comment_text.split())

        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length` flag and no longer relies on truncation being
        # enabled implicitly by `max_length`.
        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [47]:
# Fraction of each corpus that goes into the training split.
train_size = 0.8

# Create training datasets: sample the same fraction from each corpus with a
# fixed seed so the split is reproducible, then stack them.
# ignore_index=True already yields a fresh 0..n-1 index, so no separate
# reset_index() call is needed afterwards.
train_dataset_hmn = hmn_df.sample(frac = train_size, random_state = 200)
train_dataset_ai = ai_df.sample(frac = train_size, random_state = 200)
train_dataset = pd.concat([train_dataset_hmn, train_dataset_ai], ignore_index = True)

# Create test datasets: the rows of each corpus that were NOT sampled above
# (dropped by their original index labels), stacked the same way.
test_dataset_hmn = hmn_df.drop(train_dataset_hmn.index)
test_dataset_ai = ai_df.drop(train_dataset_ai.index)
test_dataset = pd.concat([test_dataset_hmn, test_dataset_ai], ignore_index = True)

# Sanity check: every source row must land in exactly one of the two splits.
assert len(train_dataset) + len(test_dataset) == len(hmn_df) + len(ai_df), \
    "train/test split lost or duplicated rows"

print("FULL human Text Dataset: {}".format(hmn_df.shape),"FULL AI Text Dataset: {}".format(ai_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both splits so a DataLoader can pull tokenized samples from them.
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL human Text Dataset: (128293, 2) FULL AI Text Dataset: (128293, 2)
TRAIN Dataset: (205268, 2)
TEST Dataset: (51318, 2)


In [None]:
# DataLoader settings for training: batches are reshuffled every epoch.
# num_workers=0 loads data in the main process.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# DataLoader settings for evaluation: shuffling is disabled so batches are
# served in test_dataset row order, which keeps evaluation deterministic and
# lets predictions be lined up with the original rows.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)