In [3]:
! pip install datasets



In [4]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", '3.0.0')
dataset

  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [12]:
import random
import re
import pandas as pd 

def reverse_sentence(sentence):
    return sentence[::-1]

def clean_text(text):
    """ Clean the text by removing or replacing unwanted characters. """
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = text.replace('\t', ' ').replace('\n', ' ')  # Replace tabs and new lines
    return text

def prepare_data(dataset, num_samples=10000, train_split=0.8, val_split=0.1):
    data = []
    for article in dataset['train']:
        text = clean_text(article['article'])
        sentences = re.split(r'(?<=[.!?]) +', text)
        for sentence in sentences:
            if sentence:
                reversed_sentence = reverse_sentence(sentence)
                data.append((sentence, reversed_sentence))
    
    random.shuffle(data)
    data = data[:num_samples]  # Limit the total number of rows
    df = pd.DataFrame(data, columns=['sentence', 'reversed'])

    # Split the data
    train_size = int(len(df) * train_split)
    val_size = int(len(df) * val_split)
    train_df = df[:train_size]
    val_df = df[train_size:train_size + val_size]
    test_df = df[train_size + val_size:]

    return train_df, val_df, test_df

# Prepare the dataset and split
train_df, val_df, test_df = prepare_data(dataset)


In [13]:
train_df.to_csv('data/train.csv', sep='\t', index=False)
val_df.to_csv('data/val.csv', sep='\t', index=False)
test_df.to_csv('data/test.csv', sep='\t', index=False)

In [17]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

raw_datasets = load_dataset('csv', data_files={'train': 'data/train.csv', 'validation': 'data/val.csv', 'test': 'data/test.csv'}, delimiter='\t', column_names=['input', 'target'])

def preprocess_function(examples):
    inputs = [f"reverse: {sentence}" for sentence in examples['input']]
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target'], max_length=128, padding='max_length', truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/8001 [00:00<?, ? examples/s]



Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [52]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

trainer.train()

  0%|          | 0/1503 [00:00<?, ?it/s]

{'loss': 1.9849, 'learning_rate': 1.3346640053226881e-05, 'epoch': 1.0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.8315600156784058, 'eval_runtime': 5.0463, 'eval_samples_per_second': 198.364, 'eval_steps_per_second': 12.484, 'epoch': 1.0}
{'loss': 1.9165, 'learning_rate': 6.69328010645376e-06, 'epoch': 2.0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.7883822917938232, 'eval_runtime': 5.0412, 'eval_samples_per_second': 198.564, 'eval_steps_per_second': 12.497, 'epoch': 2.0}
{'loss': 1.8898, 'learning_rate': 3.992015968063872e-08, 'epoch': 2.99}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.7752636671066284, 'eval_runtime': 5.0264, 'eval_samples_per_second': 199.15, 'eval_steps_per_second': 12.534, 'epoch': 3.0}
{'train_runtime': 528.2259, 'train_samples_per_second': 45.441, 'train_steps_per_second': 2.845, 'train_loss': 1.929679704997354, 'epoch': 3.0}


TrainOutput(global_step=1503, training_loss=1.929679704997354, metrics={'train_runtime': 528.2259, 'train_samples_per_second': 45.441, 'train_steps_per_second': 2.845, 'train_loss': 1.929679704997354, 'epoch': 3.0})

In [53]:
def reverse_with_model(sentence, model, tokenizer, device='mps'):
    # Encode the input text and move tensor to the specified device
    input_ids = tokenizer.encode("reverse: " + sentence, return_tensors="pt").to(device)

    # Generate outputs and ensure the model is on the correct device
    model = model.to(device)
    outputs = model.generate(input_ids)

    # Decode the generated ids
    reversed_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return reversed_sentence

test_sentence = "Hello world"
reversed_sentence = reverse_with_model(test_sentence, model, tokenizer)
print(f"Original: {test_sentence}")
print(f"Reversed: {reversed_sentence}")



Original: Hello world
Reversed: .shtiw eht eht


In [19]:
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/spiece.model',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')