# <span style="color:cyan">DEFINITION OF THE WORKING DIRECTORY


In [1]:
pwd #Definition of the working directory

'c:\\Users\\Kosta\\Desktop\\Python_Projects\\JUPYTER_PROJECT\\NLP Model'

# <span style="color:cyan">IMPORT OF THE REQUIRED LIBRARIES

In [7]:
# File handling, paths, and JSON
import json  # Used for parsing JSON data.
import os    # Provides a way of using operating system dependent functionality like reading or writing to the file system.

# Text processing
import re  # Regular expressions for text processing.
import nltk  # Natural Language Toolkit for working with human language data.
from nltk.tokenize import word_tokenize  # Used for tokenizing text into words or sentences.
from nltk.stem import WordNetLemmatizer  # Used for reducing words to their base or root form.
from nltk.corpus import stopwords, wordnet  # Stopwords are commonly used words that are usually filtered out before processing. Wordnet is a lexical database for the English language.
from nltk import pos_tag  # Part-of-speech tagging for a given word.

# Statistical analysis
import statistics as stat  # Provides functions to perform mathematical statistics on numeric (Real-valued) data.

# Orthographic checking
from autocorrect import Speller  # Used for automatic spell correction.

# Random selections (sampling)
import random as rd  # Supports the generation of random numbers, sequences, and sampling.

# Data analysis and CSV files
import pandas as pd  # Provides high-performance, easy-to-use data structures, and data analysis tools.
import csv  # Used for reading and writing CSV files.
import warnings  # Used to control the warnings generated by different modules.

# Machine Learning and Text Generation
from sklearn.model_selection import train_test_split # Used to split arrays or matrices into random train and test subsets.
from transformers import GPT2Tokenizer, GPT2LMHeadModel # GPT-2 tokenizer and model for text generation and understanding.
import tensorflow as tf # An end-to-end open-source platform for machine learning.
import gradio as gr # Gradio is a Python library that allows you to quickly create customizable UI components for machine learning models.
from transformers import pipeline  # Pipelines are a simple way to use models for common tasks.
from transformers import Trainer # Trainer is a high-level class that encapsulates the training loop.
from transformers import GPT2LMHeadModel, GPT2Tokenizer # GPT-2 tokenizer and model for text generation and understanding.
from transformers import TFGPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling # TextDataset and DataCollatorForLanguageModeling are used for preparing the dataset and batching the data for training.
from transformers import TFAutoModelForCausalLM, AutoTokenizer
from transformers import TrainingArguments # TrainingArguments contains all the hyperparameters needed for training the model.




# <span style="color:cyan">REMOVAL OF PYTHON WARNINGS

In [8]:
# Deactivation of All Warnings
warnings.filterwarnings("ignore")

# <span style="color:cyan">DATA PROCESSING

<span style="color:orange">Step 1: DEFINITION OF DIRECTORIES AND ENSURING THE FILES

In [9]:
inputFolder = "../Dataset/IData/"  # Define the input directory where the files are located.
all_quotes = []  # Create a list to store all the quotes.

for index, fileName in enumerate(os.listdir(inputFolder)):  # Iterate over each file in the input directory.
    if fileName.endswith(".txt"):  # Check if the file is a .txt file.
        philosopher = os.path.splitext(fileName)[0].lower()  # Extract the philosopher's name from the file name and convert it to lowercase.
        joinedPath = os.path.join(inputFolder, fileName)  # Create the full path to the file.

        with open(joinedPath, "r", encoding="utf-8", errors="replace") as initialFile:  # Open the file with UTF-8 encoding and replace any errors.
            quoteString = initialFile.readlines()  # Read all lines from the file.

        # Print the first 5 lines of the file for inspection.
        print(f"File: {fileName}")
        print("First 5 lines:")
        for line in quoteString[:5]:
            print(line.strip())  # Print each line after stripping leading/trailing whitespace.
        print("-" * 50)  # Print a line of dashes for better readability.

        # Add the philosopher's name and the quotes to the overall list of quotes.
        all_quotes.append((philosopher, quoteString))

File: Aristotle.txt
First 5 lines:
"Be a free thinker and don't accept everything you hear as truth. Be critical and evaluate what you believe in."
"Excellence is never an accident. It is always the result of high intention, sincere effort, and intelligent execution; it represents the wise choice of many alternatives - choice, not chance, determines your destiny."
"To appreciate the beauty of a snowflake, it is necessary to stand out in the cold."
"He who cannot be a good follower cannot be a good leader."
"The weak are always anxious for justice and equality. The strong pay no heed to either."
--------------------------------------------------
File: Artemiou.txt
First 5 lines:
"Many, are so concerned for the future, that they forget the present - from which everything is defined."
"Finding a reason to die, is easier than finding a reason to live."
"The only reason we are alive, is the reason itself."
"When one is young, lives in bravery* but when old, lives in fear."
"Fame, is a rewar

<span style="color:orange">Step 2: CREATION OF A DICTIONARY WITH QUOTES AND THEIR AUTHORS

In [10]:
data = {}
for philosopher, quoteString in all_quotes:  # Traverse the list of quotes
    for k, text in enumerate(quoteString):
        quote = text.strip('\n').replace('"', "")  # Remove newline characters and quotes from the text
        data[len(data) + 1] = {"author": philosopher, "quote": quote}  # Add the author and quote to the dictionary with a unique ID

# Print results with the author at the beginning
for key, value in data.items():
    print(f"ID: {key}, Author: {value['author']}, Quote: {value['quote']}")

ID: 1, Author: aristotle, Quote: Be a free thinker and don't accept everything you hear as truth. Be critical and evaluate what you believe in.
ID: 2, Author: aristotle, Quote: Excellence is never an accident. It is always the result of high intention, sincere effort, and intelligent execution; it represents the wise choice of many alternatives - choice, not chance, determines your destiny.
ID: 3, Author: aristotle, Quote: To appreciate the beauty of a snowflake, it is necessary to stand out in the cold.
ID: 4, Author: aristotle, Quote: He who cannot be a good follower cannot be a good leader.
ID: 5, Author: aristotle, Quote: The weak are always anxious for justice and equality. The strong pay no heed to either.
ID: 6, Author: aristotle, Quote: Courage is the first virtue that makes all other virtues possible.
ID: 7, Author: aristotle, Quote: Laughter is a bodily exercise, precious to Health.
ID: 8, Author: aristotle, Quote: Greatness of spirit is accompanied by simplicity and sincerit

<span style="color:orange">Step 3: STATISTICAL FILTERING OF OUTLIERS

In [11]:
# Calculation of the mean and standard deviation of the quotes' lengths
lengthList = [len(quote["quote"]) for quote in data.values()]  # Create a list of lengths of all quotes in the dictionary
meanLength = stat.mean(lengthList)  # Calculate the mean length of the quotes
standardLength = stat.stdev(lengthList)  # Calculate the standard deviation of the quotes' lengths

# Definition of the allowed range
tolerance = 1.11  # Define a tolerance level to determine the range of acceptable quote lengths
minLength = meanLength - tolerance * standardLength  # Calculate the minimum length of quotes to be considered
maxLength = meanLength + tolerance * standardLength  # Calculate the maximum length of quotes to be considered

# Filtering of quotes that fall within the allowed range
filteredQuotes = [quote for quote in data.values() if minLength <= len(quote["quote"]) <= maxLength]  # Filter quotes based on their length
# Print results
print(f"Mean Length: {meanLength}")  # Print the mean length of the quotes
print(f"Standard Deviation: {standardLength}")  # Print the standard deviation of the quotes' lengths
print(f"Number of original quotes: {len(data)}")  # Print the total number of original quotes
print(f"Number of allowed quotes: {len(filteredQuotes)}")  # Print the number of quotes that fall within the allowed length range
print(f"Number of filtered quotes: {len(filteredQuotes)}")  # Print the number of quotes that have been filtered

# Print the filtered quotes
print("\nFiltered Quotes:")
for i, quote in enumerate(filteredQuotes, 1):
    print(f"{i}. Author: {quote['author']}, Quote: {quote['quote']}")  # Print each filtered quote along with its author

Mean Length: 122.70196078431373
Standard Deviation: 96.96864342177864
Number of original quotes: 2550
Number of allowed quotes: 2262
Number of filtered quotes: 2262

Filtered Quotes:
1. Author: aristotle, Quote: Be a free thinker and don't accept everything you hear as truth. Be critical and evaluate what you believe in.
2. Author: aristotle, Quote: Excellence is never an accident. It is always the result of high intention, sincere effort, and intelligent execution; it represents the wise choice of many alternatives - choice, not chance, determines your destiny.
3. Author: aristotle, Quote: To appreciate the beauty of a snowflake, it is necessary to stand out in the cold.
4. Author: aristotle, Quote: He who cannot be a good follower cannot be a good leader.
5. Author: aristotle, Quote: The weak are always anxious for justice and equality. The strong pay no heed to either.
6. Author: aristotle, Quote: Courage is the first virtue that makes all other virtues possible.
7. Author: aristotl

<span style="color:orange">Step 4:LOWERCASING OF FILTERED QUOTES

In [12]:
# Application of lowercasing to the filtered quotes
for quote in filteredQuotes:
    quote["quote"] = quote["quote"].lower()

print("Sample of quotes after lowercasing:")
for i, quote in enumerate(filteredQuotes[:5]):  # Display the first 5 quotes
    print(f"Author: {quote['author']}, Quote {i + 1}: {quote['quote']}")

Sample of quotes after lowercasing:
Author: aristotle, Quote 1: be a free thinker and don't accept everything you hear as truth. be critical and evaluate what you believe in.
Author: aristotle, Quote 2: excellence is never an accident. it is always the result of high intention, sincere effort, and intelligent execution; it represents the wise choice of many alternatives - choice, not chance, determines your destiny.
Author: aristotle, Quote 3: to appreciate the beauty of a snowflake, it is necessary to stand out in the cold.
Author: aristotle, Quote 4: he who cannot be a good follower cannot be a good leader.
Author: aristotle, Quote 5: the weak are always anxious for justice and equality. the strong pay no heed to either.


<span style="color:orange">Step 5:CLEANING MULTIPLE SPACES

In [13]:
# Removal of multiple spaces in the filtered and lowercased quotes
for quote in filteredQuotes:
    quote["quote"] = re.sub(r"\s+", " ", quote["quote"]).strip()  # Remove multiple spaces

# Print a sample for verification
print("Sample of quotes after removing multiple spaces:")
for i, quote in enumerate(filteredQuotes[:5]):  # Display the first 5 quotes
    print(f"Quote {i + 1}: Author: {quote['author']}, Quote: {quote['quote']}")

Sample of quotes after removing multiple spaces:
Quote 1: Author: aristotle, Quote: be a free thinker and don't accept everything you hear as truth. be critical and evaluate what you believe in.
Quote 2: Author: aristotle, Quote: excellence is never an accident. it is always the result of high intention, sincere effort, and intelligent execution; it represents the wise choice of many alternatives - choice, not chance, determines your destiny.
Quote 3: Author: aristotle, Quote: to appreciate the beauty of a snowflake, it is necessary to stand out in the cold.
Quote 4: Author: aristotle, Quote: he who cannot be a good follower cannot be a good leader.
Quote 5: Author: aristotle, Quote: the weak are always anxious for justice and equality. the strong pay no heed to either.


<span style="color:orange">Step 6:ADDING [Author] AT THE BEGINNING

In [14]:
# Adding [Author] at the beginning of each quote
for quote in filteredQuotes:
    author = quote["author"]
    text = quote["quote"]
    quote["quote"] = f"[{author}] {text}"

# Print a sample for verification
print("Sample of quotes after adding [Author]:")
for i, quote in enumerate(filteredQuotes[:5]):  # Display the first 5 quotes
    print(f"Quote {i + 1}: Author: {quote['author']}, Quote: {quote['quote']}")

Sample of quotes after adding [Author]:
Quote 1: Author: aristotle, Quote: [aristotle] be a free thinker and don't accept everything you hear as truth. be critical and evaluate what you believe in.
Quote 2: Author: aristotle, Quote: [aristotle] excellence is never an accident. it is always the result of high intention, sincere effort, and intelligent execution; it represents the wise choice of many alternatives - choice, not chance, determines your destiny.
Quote 3: Author: aristotle, Quote: [aristotle] to appreciate the beauty of a snowflake, it is necessary to stand out in the cold.
Quote 4: Author: aristotle, Quote: [aristotle] he who cannot be a good follower cannot be a good leader.
Quote 5: Author: aristotle, Quote: [aristotle] the weak are always anxious for justice and equality. the strong pay no heed to either.


<span style="color:orange">STEP 7: CREATION OF TRAINING AND EVALUATION DATASETS

In [15]:
# Splitting the quotes based on the author
authors = set(quote["author"] for quote in filteredQuotes)  # Create a set of unique authors
train_data, eval_data = [], []  # Initialize lists to store training and evaluation data

# Calculation of the number of quotes to include in the evaluation dataset for each author
total_quotes = len(filteredQuotes)  # Total number of filtered quotes
eval_size = int(total_quotes * 0.2)  # 20% of the total dataset for evaluation
quotes_per_author = eval_size // len(authors)  # Ensure an equal number of quotes for each author in the evaluation set

# Distributing quotes into train and evaluation datasets
for author in authors:
    author_quotes = [quote for quote in filteredQuotes if quote["author"] == author]  # Collect quotes for the current author
    train, eval_ = train_test_split(author_quotes, test_size=quotes_per_author, random_state=42)  # Split quotes into train and eval sets
    train_data.extend(train)  # Add train quotes to the overall train dataset
    eval_data.extend(eval_)  # Add evaluation quotes to the overall eval dataset

# Print the size of the training and evaluation datasets
print(f"Train dataset size: {len(train_data)}")
print(f"Eval dataset size: {len(eval_data)}")

Train dataset size: 1812
Eval dataset size: 450


<span style="color:orange">Step 8: SAVING AND LOADING DATA FOR TRAINING AND EVALUATION DATASETS

In [16]:
def save_to_txt(data, path):
    with open(path, "w", encoding="utf-8") as file:  # Open a file at the specified path for writing with UTF-8 encoding
        for quote in data:
            file.write(f"[{quote['author']}] {quote['quote']}\n")  # Write each quote prefixed with the author's name

train_path = r"C:\Users\Kosta\Python_Projects\JupyterProject\Dataset\PData\Training\train.txt"  # Define the path for the training dataset
eval_path = r"C:\Users\Kosta\Python_Projects\JupyterProject\Dataset\PData\Training\eval.txt"  # Define the path for the evaluation dataset

save_to_txt(train_data, train_path)  # Save the training data to the specified train_path
save_to_txt(eval_data, eval_path)  # Save the evaluation data to the specified eval_path

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Kosta\\Python_Projects\\JupyterProject\\Dataset\\PData\\Training\\train.txt'

In [53]:
# Save to CSV
def save_to_csv(data, path):
    with open(path, "w", newline="", encoding="utf-8") as file:  # Open the file in write mode with UTF-8 encoding
        writer = csv.writer(file)  # Create a writer object for CSV files
        writer.writerow(["Author", "Quote"])  # Write the header row with column names
        for quote in data:
            writer.writerow([quote["author"], quote["quote"]])  # Write each quote and its author as a row in the CSV file

In [54]:
# Save to JSON
def save_to_json(data, path):
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)  # Save the data to a JSON file using UTF-8 encoding, ensuring non-ASCII characters are preserved, and formatting the JSON with an indentation of 4 spaces for better readability.

In [55]:
# Definition of paths for CSV and JSON
train_csv_path = r"C:\Users\Kosta\Python_Projects\JupyterProject\Dataset\PData\CSV\train.csv"  # Path for the training CSV file
eval_csv_path = r"C:\Users\Kosta\Python_Projects\JupyterProject\Dataset\PData\CSV\eval.csv"   # Path for the evaluation CSV file
train_json_path = r"C:\Users\Kosta\Python_Projects\JupyterProject\Dataset\PData\JSON\train.json"  # Path for the training JSON file
eval_json_path = r"C:\Users\Kosta\Python_Projects\JupyterProject\Dataset\PData\JSON\eval.json"   # Path for the evaluation JSON file

# Save the data to CSV and JSON files
save_to_csv(train_data, train_csv_path)  # Save the training data to the specified train_csv_path
save_to_csv(eval_data, eval_csv_path)   # Save the evaluation data to the specified eval_csv_path
save_to_json(train_data, train_json_path) # Save the training data to the specified train_json_path
save_to_json(eval_data, eval_json_path)  # Save the evaluation data to the specified eval_json_path

print(f"Data has been saved to CSV and JSON files.")  # Print a confirmation message

Data has been saved to CSV and JSON files.


# <span style="color:cyan"> IMPLEMENTATION AND FINE-TUNING FOR PHILOSOPHER QUOTES


<span style="color:orange">LOADING THE GPT-2 MODEL AND TOKENIZER

In [56]:
# Load the GPT-2 tokenizer pre-trained on a large corpus
tokenizer = TFGPT2LMHeadModel.from_pretrained("gpt2")

# Load the GPT-2 model pre-trained for language modeling tasks
model = TFGPT2LMHeadModel.from_pretrained("gpt2")

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


<span style="color:orange">LOADING THE TRAINING DATASET

In [61]:
# Load the training dataset using the GPT-2 tokenizer
train_dataset = TextDataset(
    tokenizer=tokenizer,  # Use the GPT-2 tokenizer
    file_path=train_path,  # Specify the file path for the training data
    block_size=64)  # Define the block size for the text blocks

# Load the evaluation dataset using the GPT-2 tokenizer
eval_dataset = TextDataset(
    tokenizer=tokenizer,  # Use the GPT-2 tokenizer
    file_path=eval_path,  # Specify the file path for the evaluation data
    block_size=64)  # Define the block size for the text blocks

# Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,  # Use the GPT-2 tokenizer
    mlm=False)  # Disable masked language modeling

ImportError: 
TextDataset requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFTextDataset".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


<span style="color:orange">TRAINING SETTINGS

In [62]:
training_args = TrainingArguments(
    output_dir=r"..\Dataset\PData\Training\philosopher-gpt2",  # Set the output directory for the model checkpoints
    overwrite_output_dir=True,  # Overwrite the output directory if it already exists
    num_train_epochs=10,  # Increase the number of training epochs
    per_device_train_batch_size=4,  # Increase the batch size per device
    save_steps=500,  # Save the model every 500 steps
    save_total_limit=2,  # Keep only the last 2 saved checkpoints
    logging_steps=50,  # Log the training progress every 50 steps
    eval_strategy="steps",  # Evaluate the model every 'steps'
    eval_steps=500,  # Evaluate the model every 500 steps
    prediction_loss_only=True,  # Log only the prediction loss during evaluation
    learning_rate=5e-5,  # Reduce the learning rate
    weight_decay=0.01,  # Add weight decay for regularization
    warmup_steps=500,  # Add warmup steps to the learning rate scheduler
    gradient_accumulation_steps=2,  # Enable gradient accumulation to effectively increase the batch size
    fp16=True,)  # Enable mixed precision training if the GPU supports it

<span style="color:orange">TRAINING THE MODEL

In [59]:
# Initialize the Trainer object for training the model
trainer = Trainer(
    model=model,  # The GPT-2 model to be trained
    args=training_args,  # Training arguments defined earlier (e.g., epochs, batch size, etc.)
    train_dataset=train_dataset,  # Dataset used for training
    eval_dataset=eval_dataset,  # Dataset used for evaluation during training
    data_collator=data_collator)  # Data collator for batching and processing the data

# Start the training process
trainer.train()

NameError: name 'train_dataset' is not defined

<span style="color:orange">SAVING THE FINE-TUNED MODEL AND TOKENIZER

In [None]:
# Save the fine-tuned model
trainer.save_model(r"..\Dataset\PData\Training\philosopher-gpt2")

# Save the tokenizer
tokenizer.save_pretrained(r"..\Dataset\PData\Training\philosopher-gpt2")

<span style="color:orange">EVALUATION OF THE MODEL

In [None]:
# Evaluate the model using the evaluation dataset
eval_results = trainer.evaluate()  # This function runs the evaluation process and returns metrics like loss, accuracy, etc.

# Print the evaluation results
print(f"Evaluation results: {eval_results}")  # Display the evaluation metrics to assess the model's performance

<span style="color:orange">GENERATION OF THE PHILOSOPHICAL QUOTES

In [None]:
# Load the fine-tuned model for text generation
generator = pipeline("text-generation", model=r"..\Dataset\PData\Training\philosopher-gpt2", tokenizer=tokenizer)

# Define a function to generate a quote based on a philosopher and a topic
def generate_quote(philosopher, topic):
    prompt = f"[{philosopher}] {topic}"  # Create a prompt combining the philosopher and topic
    result = generator(prompt, max_length=50, num_return_sequences=1)  # Generate text using the model
    full_text = result[0]["generated_text"]  # Extract the generated text
    # Extract only the first sentence (based on the first period)
    first_sentence = full_text.split('.')[0] + '.'  # Split the text and return the first sentence
    return first_sentence  # Return the generated sentence

# Create a user interface using Gradio
iface = gr.Interface(
    fn=generate_quote,  # The function to call when generating a quote
    inputs=["text", "text"],  # Input fields: philosopher and topic
    outputs=["text"],  # Output field: generated quote
    title="Philosopher Quote Generator",  # Title of the interface
    description="Enter a philosopher and a topic to generate a new quote."  # Description of the interface
)

# Launch the interface
iface.launch()