<a href="https://colab.research.google.com/github/julsCadenas/redditscrape/blob/master/training/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TRAIN

prepare the dataset

from google.colab import drive
drive.mount('/content/drive')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# pip install datasets

In [2]:
import json
dataset_path = '/content/drive/MyDrive/dataset/reddit_data.json'

with open(dataset_path, "r") as f:
    data = json.load(f)

for item in data:
    if isinstance(item.get("summary"), dict):
        item["summary"] = json.dumps(item["summary"], ensure_ascii=False)
    elif item.get("summary") is None:
        item["summary"] = ""

with open("reddit_data_cleaned.json", "w") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

load the dataset

In [4]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="reddit_data_cleaned.json")
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'selftext', 'comments', 'index', 'summary'],
        num_rows: 100
    })
})


preprocess the dataset

In [6]:
from transformers import AutoTokenizer

model_path = '/content/drive/MyDrive/models'

tokenizer = AutoTokenizer.from_pretrained(model_path)

def preprocess_dataset(examples):
    # all_text = examples["selftext"] + "\n" + "\n".join(examples["comments"])
    all_text = [selftext + "\n" + "\n".join(comments) for selftext, comments in zip(examples["selftext"], examples["comments"])]
    model_inputs = tokenizer(all_text, max_length=1024, truncation=True, padding=True)
    labels = tokenizer(examples["summary"], max_length=1024, truncation=True, padding=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_dataset, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

split the dataset

In [7]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

train_df = tokenized_dataset["train"].to_pandas()

train_split, val_split = train_test_split(train_df, test_size=0.2)

train_dataset = Dataset.from_pandas(train_split)
val_dataset = Dataset.from_pandas(val_split)

dataset2 = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

print(dataset2)

DatasetDict({
    train: Dataset({
        features: ['title', 'selftext', 'comments', 'index', 'summary', 'input_ids', 'attention_mask', 'labels', '__index_level_0__'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['title', 'selftext', 'comments', 'index', 'summary', 'input_ids', 'attention_mask', 'labels', '__index_level_0__'],
        num_rows: 20
    })
})


In [8]:
print(dataset2)

DatasetDict({
    train: Dataset({
        features: ['title', 'selftext', 'comments', 'index', 'summary', 'input_ids', 'attention_mask', 'labels', '__index_level_0__'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['title', 'selftext', 'comments', 'index', 'summary', 'input_ids', 'attention_mask', 'labels', '__index_level_0__'],
        num_rows: 20
    })
})


since the validation set has summaries in it, lets remove it

In [9]:
def remove_summary_from_validation(example):
    if 'summary' in example:
        del example['summary']
    return example

dataset2['validation'] = dataset2['validation'].map(remove_summary_from_validation)
print(dataset2)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'selftext', 'comments', 'index', 'summary', 'input_ids', 'attention_mask', 'labels', '__index_level_0__'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['title', 'selftext', 'comments', 'index', 'input_ids', 'attention_mask', 'labels', '__index_level_0__'],
        num_rows: 20
    })
})


prepare training stuff

In [11]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer
# model_path = '/content/drive/MyDrive/models'
logs_path = '/content/drive/MyDrive/logs'
results_path = '/content/drive/MyDrive/results'
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

training_args = TrainingArguments(
    output_dir=results_path,
    eval_strategy="epoch",
    logging_dir=logs_path,
    logging_steps=5,
    save_steps=0,
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    num_train_epochs=32,
    gradient_accumulation_steps=4,
    max_steps=1000,
    push_to_hub=False
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset2["train"],
    eval_dataset=dataset2["validation"]
)



training time

In [None]:
import torch
torch.cuda.empty_cache()

In [12]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,1.0215,0.181126
2,0.5379,0.178971
3,0.48,0.188038
4,0.5832,0.1896
5,0.4131,0.195241
6,0.3796,0.199607
7,0.2963,0.210155
8,0.2101,0.211914
9,0.1876,0.217952
10,0.1752,0.227991




TrainOutput(global_step=1000, training_loss=0.06557443465571851, metrics={'train_runtime': 8469.8876, 'train_samples_per_second': 0.945, 'train_steps_per_second': 0.118, 'total_flos': 1.7336836816896e+16, 'train_loss': 0.06557443465571851, 'epoch': 100.0})

In [13]:
model.save_pretrained('/content/drive/MyDrive/model2')


In [14]:
trainer.evaluate()

{'eval_loss': 0.3143388628959656,
 'eval_runtime': 6.2791,
 'eval_samples_per_second': 3.185,
 'eval_steps_per_second': 1.593,
 'epoch': 100.0}

In [15]:
tokenizer.save_pretrained('/content/drive/MyDrive/model2')

('/content/drive/MyDrive/model2/tokenizer_config.json',
 '/content/drive/MyDrive/model2/special_tokens_map.json',
 '/content/drive/MyDrive/model2/vocab.json',
 '/content/drive/MyDrive/model2/merges.txt',
 '/content/drive/MyDrive/model2/added_tokens.json',
 '/content/drive/MyDrive/model2/tokenizer.json')