## Imports

In [1]:
import pandas as pd
import numpy as np
import os
import re
import pickle
import time
import nltk
import itertools
import torch
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers

## OLD VERSIOn

In [None]:
original_articles = pd.read_csv('medium.csv')

In [None]:
def preprocess(article):
    # delete urls
    url_pattern = re.compile(
        r"(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*")
    article = re.sub(url_pattern, '', article)

    # reorder paragraphs
    paragraphs = article.split('\n')
    processed_paragraphs = []
    temp_paragraph = ""
    for number, paragraph in enumerate(paragraphs):
        # remove heading
        if number == 0:
            if len(nltk.tokenize.sent_tokenize(paragraph)) == 1:
                continue

        if paragraph.lower().startswith('image source') \
                or paragraph.lower().startswith('source') \
                or paragraph.lower().startswith('photo by'):
            continue
        elif paragraph == '':
            continue

        if len(paragraph) < 55:  # probabliy it is a subheading
            continue

        if len(paragraph) < 450 and temp_paragraph == "":  # the paragraph is too small
            temp_paragraph = paragraph
        elif temp_paragraph != "":
            temp_paragraph += paragraph
            temp_paragraph += " "
            if len(temp_paragraph) >= 500:
                processed_paragraphs.append(temp_paragraph)
                temp_paragraph = ""
        else:
            processed_paragraphs.append(paragraph)

    return processed_paragraphs

In [None]:
def merge_corresponding_authors(row):
    number = row[1]
    text = row[0]
    true_paragraphs = preprocess(original_articles['0'][number])

    artificial_paragraphs = text.split('\n')

    artificial_paragraphs_temp = []
    for i in range(len(artificial_paragraphs)):
        if len(artificial_paragraphs[i]) > 5:
            artificial_paragraphs_temp.append(artificial_paragraphs[i])
    artificial_paragraphs = artificial_paragraphs_temp

    with open(f"{path}/picks_{number}_1.txt", "r") as picks_file:
        picks = picks_file.read()
        picks = [int(i) for i in picks.split(" ")[:-1]]

    # we put in this array first element of every pair of adjacent picks
    if_there_adjacent_picks = []
    for j in range(len(picks)):
        if j < len(picks) - 1 and picks[j + 1] == picks[j] + 1:
            if_there_adjacent_picks.append(picks[j])

    human_paragraphs = []
    for idx in range(len(true_paragraphs)):
        if idx not in picks:
            human_paragraphs.append(true_paragraphs[idx])

    human_idx = 0
    artifical = 0

    authors_in_artificial = []
    # 0 - human
    # 1 - machine

    punktuation = re.compile(r"[’\'\"“]")
    while artifical < len(artificial_paragraphs):
        text1 = re.sub(punktuation, '', artificial_paragraphs[artifical].lower().strip())
        if human_idx < len(human_paragraphs):
            text2 = re.sub(punktuation, '', human_paragraphs[human_idx].lower().strip())
        if human_idx < len(human_paragraphs) and text1[:10] == text2[:10]:
            authors_in_artificial.append(0)
            human_idx += 1
        else:
            authors_in_artificial.append(1)
        artifical += 1

    idx = 0
    adjacent_idx = 0
    llm_paragraphs = []
    authors = []
    if len(artificial_paragraphs) > len(true_paragraphs) and if_there_adjacent_picks == []:
        while idx < len(artificial_paragraphs):
            if authors_in_artificial[idx] == 0:
                llm_paragraphs.append(artificial_paragraphs[idx])
                authors.append(0)
            else:
                paragraph = artificial_paragraphs[idx]
                while idx < len(artificial_paragraphs) - 1 and authors_in_artificial[idx + 1] == 1:
                    idx += 1
                    paragraph += artificial_paragraphs[idx]
                llm_paragraphs.append(paragraph)
                authors.append(1)
            idx += 1
    elif len(artificial_paragraphs) == len(true_paragraphs):
        llm_paragraphs = artificial_paragraphs
        authors = authors_in_artificial
    elif len(artificial_paragraphs) > len(true_paragraphs) and if_there_adjacent_picks != []:
        while idx < len(artificial_paragraphs):
            if authors_in_artificial[idx] == 0:
                llm_paragraphs.append(artificial_paragraphs[idx])
                authors.append(0)
            else:
                if idx == if_there_adjacent_picks[adjacent_idx]:
                    paragraph = artificial_paragraphs[idx]
                    llm_paragraphs.append(paragraph)
                    authors.append(1)
                    idx += 1
                    paragraph = artificial_paragraphs[idx]
                    while idx < len(artificial_paragraphs) - 1 and authors_in_artificial[idx + 1] == 1:
                        idx += 1
                        paragraph += artificial_paragraphs[idx]
                    llm_paragraphs.append(paragraph)
                    authors.append(1)
            idx += 1
    return llm_paragraphs, authors


In [None]:
corpus = []
path = "./homevozniukgenerated_documents"
for file in os.listdir(path):
    filename = os.fsdecode(file)
    if filename.startswith("output"):
        number = int(filename.split("_", maxsplit=1)[1].split("_")[0])
        if number > 1000:
            continue
        with open(f"{path}/{filename}", 'r') as read_file:
            paragraphs = read_file.readlines()
        text = "".join(paragraphs)
        corpus.append({"text": text, "number": number})

In [None]:
df = pd.DataFrame(data=corpus, columns=['text', 'number'])

In [None]:
import torch
# check if we have cuda installed
if torch.cuda.is_available():
    # to use GPU
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 4 GPU(s) available.
GPU is: NVIDIA A100-SXM4-40GB


In [None]:
df['paragraphs'] = df.apply(merge_corresponding_authors, axis=1)

In [None]:
df['authors'] = df.apply(lambda x: x['paragraphs'][1], axis=1)

In [None]:
df['paragraphs'] = df.apply(lambda x: x['paragraphs'][0], axis=1)

In [None]:
df.sort_values('number')

Unnamed: 0,text,number,paragraphs,authors
25,Data visualizations are key players in data sc...,1,[Data visualizations are key players in data s...,"[0, 1, 0, 0, 1, 0]"
9,Microsoft recently released a New Dialogue Ran...,2,[Microsoft recently released a New Dialogue Ra...,"[0, 0, 1, 1, 0, 0]"
178,"I remember a time in my childhood, I was about...",3,"[I remember a time in my childhood, I was abou...","[0, 1, 0, 0, 1, 0]"
10,Winter is coming and theres a good chance you ...,4,[Winter is coming and theres a good chance you...,"[0, 0, 1, 0, 1, 0]"
202,How Ernesto Miranda’s case came before the Sup...,5,[How Ernesto Miranda’s case came before the Su...,"[0, 1, 0, 0, 0, 1]"
...,...,...,...,...
130,Ive wanted to be a published author for as lon...,248,[Ive wanted to be a published author for as lo...,"[0, 1, 0, 1, 0, 1]"
239,Creativity and joy make part of my life. I lov...,250,[Creativity and joy make part of my life. I lo...,"[0, 1, 0, 1, 1]"
66,Recently I did something I shouldnt have had t...,251,[Recently I did something I shouldnt have had ...,"[0, 1, 0, 1, 1, 0]"
90,"Im never going to be that kind of writer, I ex...",252,"[Im never going to be that kind of writer, I e...","[0, 0, 1, 1]"


In [None]:
failed = [17,
18,
61,
117,
126,
130,
139,
126,
136,
156,
171,
208,
213,
214,
220,
228,
232,
239,
241,
249,]

In [None]:
for i in failed:
    df = df[df.number != i]

In [None]:
splitted_df = df.explode(['paragraphs', 'authors'])

In [None]:
splitted_df['paragraphs'].apply(len)

0      587
0      633
0      553
0      499
0      694
      ... 
239    546
239    531
239    690
239    686
239    541
Name: paragraphs, Length: 1211, dtype: int64

## FINETUNING FOR PARAGRAPHS

In [3]:
splitted_df = pd.read_csv('df_for_finetuning_large.csv')

In [43]:
numbers = splitted_df.number.values.tolist()[:10000]

In [4]:
features = splitted_df.paragraphs.values.tolist()[:10000]
targets = splitted_df.authors.values.tolist()[:10000]

In [5]:
updated_features = ["[CLS] " + str(text) for text in features]

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:
MAX_LEN = 511
tokenized_feature = tokenizer.batch_encode_plus(
                            # Sentences to encode
                            updated_features,
                            # Add empty tokens if len(text)<MAX_LEN
                            padding = 'max_length',
                            # Truncate all sentences to max length
                            truncation=True,
                            # Set the maximum length
                            max_length = MAX_LEN,
                            # Return attention mask
                            return_attention_mask = True,
                            # Return pytorch tensors
                            return_tensors = 'pt'
                   )

In [None]:
# Use 80% for training and 20% for validation
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(tokenized_feature['input_ids'],
                                                                                                      targets, tokenized_feature['attention_mask'],
                                                                                                      random_state=2018, test_size=0.3)

In [None]:
# define batch_size
batch_size = 8

In [None]:
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, torch.tensor(train_labels))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our test set
validation_data = TensorDataset(validation_inputs, validation_masks, torch.tensor(validation_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [51]:
# let's get the same splitting but for documents number
numbers = splitted_df.number.values.tolist()[:10000]
train_df, test_df = train_test_split(splitted_df[:10000],random_state=2018, test_size=0.3)

numbers_sampler = SequentialSampler(test_df.number.to_numpy())
numbers_dataloader = DataLoader(test_df.number.to_numpy(), sampler=numbers_sampler, batch_size=batch_size)

defining XLMRoberta

In [25]:
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    # Specify number of classes
    num_labels = 2,
    # Whether the model returns attentions weights
    output_attentions = False,
    # Whether the model returns all hidden-states
    output_hidden_states = True
)

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

Using device: cuda



In [27]:
model = model.cuda()

In [28]:
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 1e-5,
                  eps = 1e-8
                )
# Number of training epochs
epochs = 4
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 3,
                                            num_training_steps = total_steps)

loss_fn = torch.nn.CrossEntropyLoss()

In [29]:
# Store the average loss after each epoch
loss_values = []
# number of total steps for each epoch
print('total steps per epoch: ',  len(train_dataloader) / batch_size)
# looping over epochs
for epoch_i in range(0, epochs):

    print('training on epoch: ', epoch_i)
    # set start time
    t0 = time.time()
    # reset total loss
    total_loss = 0
    # model in training
    model.train()
    # loop through batch
    for step, batch in enumerate(train_dataloader):
        # Progress update every 50 step
        if step % 50 == 0 and not step == 0:
            print('training on step: ', step)
            print('total time used is: {0:.2f} s'.format(time.time() - t0))
        # load data from dataloader
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # clear any previously calculated gradients
        model.zero_grad()
        # get outputs
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        # get loss
        loss = outputs[0]
        # total loss
        total_loss += loss.item()
        loss.backward()
        # clip the norm of the gradients to 1.0.
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update optimizer
        optimizer.step()
        # update learning rate
        scheduler.step()
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("average training loss: {0:.2f}".format(avg_train_loss))

total steps per epoch:  109.375
training on epoch:  0
training on step:  50
total time used is: 40.11 s
training on step:  100
total time used is: 78.86 s
training on step:  150
total time used is: 118.72 s
training on step:  200
total time used is: 159.64 s
training on step:  250
total time used is: 200.26 s
training on step:  300
total time used is: 241.02 s
training on step:  350
total time used is: 281.75 s
training on step:  400
total time used is: 322.44 s
training on step:  450
total time used is: 363.08 s
training on step:  500
total time used is: 403.75 s
training on step:  550
total time used is: 444.40 s
training on step:  600
total time used is: 485.03 s
training on step:  650
total time used is: 525.59 s
training on step:  700
total time used is: 566.19 s
training on step:  750
total time used is: 606.88 s
training on step:  800
total time used is: 647.48 s
training on step:  850
total time used is: 688.10 s
average training loss: 0.15
training on epoch:  1
training on ste

saving model with pickle

In [None]:
pickle.dump(model, open('/content/drive/MyDrive/model_new.pkl', 'wb'))

In [None]:
torch.cuda.empty_cache()

In [30]:
last_hidden_states = []

In [31]:
import numpy as np
t0 = time.time()
# model in validation mode
model.eval()
# save prediction
predictions,true_labels, logits_arr =[],[], []
# evaluate data for one epoch
for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # validation
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask, output_hidden_states=True)
    # get output
    logits = outputs[0]
    # move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()


    last_layer_hidden_states = outputs.hidden_states[-1]
    cls_embeddings = last_layer_hidden_states[:, 0]
    last_hidden_states.append(cls_embeddings)
    final_prediction = np.argmax(logits, axis=-1).flatten()
    predictions.append(final_prediction)
    true_labels.append(label_ids)

print('total time used is: {0:.2f} s'.format(time.time() - t0))

total time used is: 91.76 s


## CHECKING QUALITY

In [35]:
print(predictions)
final_prediction_list = np.concatenate(predictions)
final_truelabel_list = np.concatenate(true_labels)

In [36]:
cr = classification_report(final_truelabel_list,
                           final_prediction_list,
                           output_dict=False)
print(cr)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1597
           1       1.00      0.99      0.99      1403

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       0.99      0.99      0.99      3000



seems good!

## GETTING DISTANCES

In [131]:
last_hidden_states_cpu = [tensor.to('cpu') for tensor in last_hidden_states]

In [145]:
document_info = {}


for number_batches, embeds_batches, preds_batches in zip(numbers_dataloader, last_hidden_states_cpu, predictions):
  for number, embed, preds in zip(number_batches.numpy(), embeds_batches, preds_batches):
    info_pair = (embed, preds)
    if document_info.get(number):
      document_info[number].append(info_pair)
    else:
      document_info[number] = [(info_pair)]

In [167]:
cos = torch.nn.CosineSimilarity(dim=0)

In [157]:
distances = []

for doc_num, paragraphes_info in document_info.items():
  pairs = list(itertools.combinations(paragraphes_info, 2))
  for paragraph_pairs in pairs:
    current_distance = cos(paragraph_pairs[0][0], paragraph_pairs[1][0])
    distances.append({"number": doc_num, "cosine": current_distance.to('cpu').numpy(), "label1": paragraph_pairs[0][1], "label2": paragraph_pairs[1][1]})

In [159]:
distances_df = pd.DataFrame(distances, columns = ['number', 'cosine', 'label1', 'label2'])

In [168]:
distances_df.head(10)

Unnamed: 0,number,cosine,label1,label2
0,1125,0.9896549,1,1
1,1125,-0.43046382,1,0
2,1125,-0.49120232,1,0
3,1147,0.938515,1,1
4,4201,0.99117595,1,1
5,4779,0.9670364,1,1
6,2056,-0.54217404,0,1
7,740,0.9904757,0,0
8,1627,-0.40273845,1,0
9,1627,-0.41858622,1,0


In [166]:
distances_df.to_csv('paragraphs_cosine_distances.csv')