<a href="https://colab.research.google.com/github/kokorikos212/Odysseus_Elytis_model/blob/main/ELytis_finetune2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

from datasets import Dataset

import pandas as pd

In [None]:
import os
from transformers import AutoTokenizer

In [None]:
# Mount Google Drive at /content/drive so the poetry folder read in the next
# cell (under /content/drive/MyDrive) is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
folder_path = "/content/drive/MyDrive/poetry_segments"

# Read all text files from the folder; each file becomes one training sample.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the dataset's row order identical from one run to the next.
texts = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            texts.append(file.read())

# Fail here with a clear message instead of letting an empty dataset break the
# tokenization / split / training cells further down.
if not texts:
    raise FileNotFoundError(f"No .txt files found in {folder_path}")

# Create a DataFrame with the texts and wrap it in a Hugging Face Dataset
df = pd.DataFrame({'text': texts})
dataset = Dataset.from_pandas(df)
print(f"Number of samples: {len(dataset)}")

In [None]:
# Load the tokenizer that belongs to the pretrained 'gpt2' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Set the pad_token to be the eos_token: later cells call the tokenizer with
# padding enabled, which needs a pad token to be defined.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(examples):
    """Tokenize one batch of raw poem texts.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            ``examples['text']`` holds the strings to encode.

    Returns:
        The tokenizer's encoding of the batch, truncated to at most 1024
        tokens per sample and left unpadded.
    """
    # No padding at this stage: padding every sample to 1024 tokens makes each
    # training batch full-length no matter how short the poems are. The
    # language-modeling data collator used for training pads each batch to its
    # own longest sequence instead.
    return tokenizer(examples['text'], truncation=True, max_length=1024)

# Apply the tokenizer to the dataset and drop the raw text column
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])


In [None]:
# Split the dataset into a training part and a held-out evaluation part.
# The seed pins the shuffle behind the split, so the same samples end up in
# the evaluation set every time the notebook is re-run.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)  # Adjust test_size as needed

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [None]:
# Sanity check: display the columns of the training split as the cell output.
train_dataset.column_names

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForCausalLM

# Load the pretrained 'gpt2' checkpoint through the causal-LM auto class;
# this is the model object handed to the Trainer and used for generation below.
model = AutoModelForCausalLM.from_pretrained('gpt2')

In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./elytis_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir="./logs",
    logging_steps=100,
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",
    # Evaluation and checkpointing share the same 500-step interval.
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Half-precision training needs a GPU; asking for it on a CPU-only runtime
    # makes TrainingArguments raise, so it is enabled only when CUDA is present.
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False
)
# Show one tokenized sample as a quick check of what the Trainer will receive.
print(tokenized_dataset[0])


In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch builder for training. mlm=False selects causal (next-token) language
# modeling rather than masked-token modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
# Bundle the model, the run configuration, both data splits, the tokenizer and
# the batch collator into a single Trainer.
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=eval_dataset,
    tokenizer=tokenizer, data_collator=data_collator,
)

# Run the fine-tuning loop (the returned training summary is the cell output)
trainer.train()

In [None]:
from google.colab import files
import shutil

# Write the fine-tuned model to the directory that is archived below. Nothing
# earlier in the notebook creates /content/model_directory (training output
# goes to ./elytis_gpt2), so archiving it without this step fails.
trainer.save_model('/content/model_directory')

# Compress the directory
shutil.make_archive('/content/model_archive', 'zip', '/content/model_directory')

# Download the zip file
files.download('/content/model_archive.zip')

In [None]:
# Prepare input text
def preprocess_input(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` for use as model input.

    Args:
        text: The string (or batch of strings) to encode.
        tokenizer: Callable tokenizer; invoked with PyTorch tensor output,
            padding and truncation switched on.

    Returns:
        Whatever the tokenizer call returns.
    """
    encoding_options = {"return_tensors": "pt", "padding": True, "truncation": True}
    return tokenizer(text, **encoding_options)

# Short Greek prompt used to probe the model, encoded into PyTorch tensors
# by preprocess_input.
text = "Καλώς εχό"
inputs = preprocess_input(text, tokenizer)


In [None]:
def get_predictions(inputs, model):
    """Return the highest-scoring vocabulary id at every input position.

    The model here is a causal language model, so the logits have one score
    per vocabulary entry for each position; the arg-max over the last axis
    therefore yields one token id per input token, not a single class label.

    Args:
        inputs: Mapping of tensor inputs (e.g. the tokenizer output) that is
            unpacked into the model call.
        model: Callable model whose output exposes ``.logits``. If it has a
            ``device`` attribute, the inputs are moved there first.

    Returns:
        Tensor of arg-max ids with the logits' shape minus the last axis.
    """
    # After training the model may sit on the GPU while freshly tokenized
    # inputs are on the CPU; align them to avoid a device-mismatch error.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in inputs.items()
        }

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=-1)

# Feed the encoded prompt through the model and print the per-position
# arg-max token ids
predictions = get_predictions(inputs=inputs, model=model)
print(predictions)



tensor([[  198,   138, 32830,   138,   234,   138,  7377,   118,   157,   234,
         17394,   234, 35558]])


RuntimeError: a Tensor with 13 elements cannot be converted to Scalar

In [None]:
# Parameters for text generation; the generation cell below passes them to
# generate_text, which forwards them to model.generate.
max_length = 200  # Max length of the generated text, in tokens
temperature = 0.7  # Sampling temperature to control randomness
top_k = 50  # Top-K sampling: draw only from the 50 most likely next tokens
top_p = 0.9  # Top-p sampling (nucleus sampling)

In [None]:
def generate_text(prompt, model, tokenizer, max_length=2000, temperature=0.7, top_k=50, top_p=0.9):
    """Sample a continuation of ``prompt`` from a causal language model.

    Args:
        prompt: Text the generation starts from.
        model: Model exposing ``generate``. Its ``device`` attribute (if any)
            decides where the inputs are placed, and its
            ``config.max_position_embeddings`` (if any) caps ``max_length``.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        max_length: Upper bound passed to ``generate``; clamped to the model's
            position limit when that limit is known.
        temperature: Sampling temperature.
        top_k: Top-k sampling cutoff.
        top_p: Nucleus (top-p) sampling threshold.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Call the tokenizer directly (instead of .encode) so the attention mask is
    # produced too; generate() warns about unreliable results without it.
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # A model that was just trained may live on the GPU; put the inputs there.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    # The default max_length is larger than the position limit of small
    # checkpoints; never ask for more positions than the model has.
    context_limit = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    if isinstance(context_limit, int) and context_limit > 0:
        max_length = min(max_length, context_limit)

    # Generate text using the model
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        no_repeat_ngram_size=2,  # Avoid repeating phrases
        do_sample=True,  # Enable sampling
        # Same value generate() would otherwise fall back to, set explicitly
        # so the fallback warning is not emitted.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated tokens into a human-readable string
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

In [None]:
# Opening line that seeds the Elytis-style continuation
prompt = "Έτσι μιλώ για σένα και για μένα"

# Sample a continuation with the notebook-level generation settings and show it
elytis_like_text = generate_text(
    prompt,
    model,
    tokenizer,
    max_length=max_length,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
)
print(elytis_like_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Έτσι μιλώ για σένα και για μένα τοῦ πρόποτής διηταδυσαχείαν οὐτὰ εὼ λαρτος υπθξασάζοι ἐκΤβΛΡεΣΙσεἱ ηὦτι χγριον αὴ θεὶ Καττρεστες μοοᾶς τẓ Πολλαὸς ὅτων φνομισκύτψα


In [None]:
def format_as_poetry(text):
    """Lay ``text`` out as verse, one line per '.'-separated fragment.

    Args:
        text: The string to reformat.

    Returns:
        The fragments between full stops, stripped of surrounding whitespace
        and joined with newlines. Empty fragments are dropped, and the full
        stops themselves do not appear in the result.
    """
    verses = []
    for fragment in text.split('.'):
        trimmed = fragment.strip()
        if trimmed:
            verses.append(trimmed)
    return '\n'.join(verses)

# Break the generated text into verse lines and print the result
poetic_text = format_as_poetry(text=elytis_like_text)
print(poetic_text)