Interlingua

Hossein Askari-2024


Install required packages

In [None]:
!pip install -r requirements.txt

Checking that all requirements are installed 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import transformers
from pyvis.network import Network

# Print installed versions to verify.
# matplotlib.pyplot has no __version__ attribute; the version string lives on
# the top-level matplotlib package, so import that here.
import matplotlib

print("NumPy version:", np.__version__)
print("Pandas version:", pd.__version__)
print("Matplotlib version:", matplotlib.__version__)
print("PyTorch version:", torch.__version__)
print("Transformers version:", transformers.__version__)


# Check if CUDA is available
cuda_available = torch.cuda.is_available()

print("CUDA Available:", cuda_available)
if cuda_available:
    # Only query device details when a CUDA device is actually present.
    print("CUDA Version:", torch.version.cuda)
    print("Number of GPUs:", torch.cuda.device_count())
    print("CUDA Device Name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Please check your installation.")

Checking GPT text-generation functionality

In [None]:
from transformers import pipeline

# Smoke test: generate a short continuation with a GPT-Neo model through the
# Hugging Face `pipeline` helper and print it.
# Choose a GPT-Neo model. (can also try "EleutherAI/gpt-neo-2.7B" for larger model)
model_name = "EleutherAI/gpt-neo-1.3B"

# Initialize the text-generation pipeline with the chosen model
generator = pipeline('text-generation', model=model_name)

# Define the prompt the model will continue
prompt = "This is a heartfelt love letter. My dear love, "

# Generate text with sampling enabled (temperature 0.8, nucleus top_p 0.95).
# NOTE(review): per the transformers generation docs, max_length counts the
# prompt tokens as well as the newly generated ones — confirm 100 is enough.
outputs = generator(prompt, max_length=100, temperature=0.8, do_sample=True, top_p=0.95)

# The pipeline result is indexed as a list of dicts; print the first generation
generated_text = outputs[0]["generated_text"]
print(generated_text)


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Smoke test: generate a continuation with GPT-2 using the tokenizer/model API
# directly (instead of the pipeline helper) and print it.
# Load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Define the prompt
prompt = "a love letter to my beloved sara"
# Calling the tokenizer (rather than .encode) also yields the attention mask,
# which generate() needs to interpret the input reliably.
encoded_prompt = tokenizer(prompt, return_tensors='pt')
input_ids = encoded_prompt['input_ids']

# Generate text.
# do_sample=True is required for `temperature` to have any effect; without it
# generation is greedy and the temperature setting is ignored.
# GPT-2 has no dedicated pad token, so the EOS token id is passed explicitly.
output = model.generate(
    input_ids,
    attention_mask=encoded_prompt['attention_mask'],
    max_length=300,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Checking the evaluation model function

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from scipy.spatial.distance import cosine
import os

# Initialize BERT tokenizer and model used to embed texts for evaluation.
# NOTE: this rebinds the notebook-level names `tokenizer` and `model`, which the
# previous cell bound to the GPT-2 tokenizer/model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to read texts from a folder and return as a list
def read_texts_from_folder(folder_path):
    """Read every regular file directly inside ``folder_path`` as UTF-8 text.

    Files are returned oldest-first by modification time; the path is used as
    a tie-breaker so the order is deterministic when mtimes are equal.
    Sub-directories (for example Jupyter's ``.ipynb_checkpoints``) are skipped
    because ``open()`` would raise on them.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list[str]: The contents of each file, in the order described above.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    candidates = (os.path.join(folder_path, name) for name in os.listdir(folder_path))
    file_paths = sorted(
        (path for path in candidates if os.path.isfile(path)),
        key=lambda path: (os.path.getmtime(path), path),
    )
    texts = []
    for filepath in file_paths:
        with open(filepath, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts

# Turn a text into a single fixed-size vector
def text_to_embedding(text, tokenizer, model):
    """Embed ``text`` by averaging the model's last hidden state over dim 1.

    Args:
        text: The text to embed.
        tokenizer: Callable invoked with ``return_tensors="pt"``, padding and
            truncation enabled and ``max_length=512``; its result is unpacked
            as keyword arguments into ``model``.
        model: Callable whose result exposes ``last_hidden_state``.

    Returns:
        numpy.ndarray: The averaged hidden state with size-1 dimensions removed.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: no gradients are needed for the embedding.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Cosine similarity of two embedding vectors
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity (1 - cosine distance) of two embeddings.

    Both inputs are converted to NumPy arrays and squeezed first, so a
    ``(1, d)`` array is treated the same as a ``(d,)`` vector.
    """
    first, second = (np.asarray(vec).squeeze() for vec in (embedding1, embedding2))
    distance = cosine(first, second)
    return 1 - distance

# Load the reference texts from disk and embed each one once up front, so that
# later evaluations only need to embed the new text.
reference_folder = './data/external/LetterRefTrue'
reference_texts = read_texts_from_folder(reference_folder)
# One embedding per reference text, in the same order as reference_texts
reference_embeddings = [text_to_embedding(text, tokenizer, model) for text in reference_texts]

# Score one text against the pre-computed reference embeddings
def evaluate_text(text, reference_embeddings, tokenizer, model, threshold=0.78):
    """Classify ``text`` by its mean similarity to the reference embeddings.

    Args:
        text: The text to evaluate.
        reference_embeddings: Iterable of embeddings to compare against.
        tokenizer: Tokenizer forwarded to ``text_to_embedding``.
        model: Model forwarded to ``text_to_embedding``.
        threshold: Minimum mean similarity for a ``'True'`` classification.

    Returns:
        tuple: ``(classification, avg_similarity)`` where ``classification`` is
        the string ``'True'`` or ``'False'`` (not a bool) and
        ``avg_similarity`` is the mean of the per-reference similarities.
    """
    candidate = text_to_embedding(text, tokenizer, model)
    scores = []
    for reference in reference_embeddings:
        scores.append(calculate_similarity(candidate, reference))
    avg_similarity = np.mean(scores)
    if avg_similarity >= threshold:
        classification = 'True'
    else:
        classification = 'False'
    return classification, avg_similarity

# Example usage: evaluate a placeholder string against the reference embeddings
# using the default threshold of evaluate_text, then report the result.
generated_text = "generated text here."  # Placeholder; replace with actual generated text
# classification is the string 'True' or 'False'; avg_similarity is the mean similarity
classification, avg_similarity = evaluate_text(generated_text, reference_embeddings, tokenizer, model)
print(f"Generated Text: Average Similarity = {avg_similarity:.4f} -> Classified as {classification}")


Main Training code

In [None]:
import gym
from gym import spaces
import numpy as np
import torch
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline, BertTokenizer, BertModel
from scipy.spatial.distance import cosine

def read_texts_from_folder(folder_path):
    """Read every regular file directly inside ``folder_path`` as UTF-8 text.

    Entries are ordered by modification time (oldest first, path as the
    tie-breaker), matching the mtime ordering used by the loader in the
    evaluation cell; a bare ``os.listdir`` order is arbitrary. Sub-directories
    such as ``.ipynb_checkpoints`` are skipped because ``open()`` would raise
    on them.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list[str]: File contents in the order described above.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    file_paths = []
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)
        if os.path.isfile(filepath):
            file_paths.append(filepath)
    file_paths.sort(key=lambda path: (os.path.getmtime(path), path))

    texts = []
    for filepath in file_paths:
        with open(filepath, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts

# Text embedding helper used by the BERT-based reward
def text_to_embedding(text, tokenizer, model):
    """Return one vector for ``text``: the mean of ``last_hidden_state`` over dim 1.

    The tokenizer is called with PyTorch tensors, padding and truncation
    enabled and a ``max_length`` of 512; its output is passed to ``model`` as
    keyword arguments. The result is squeezed and converted to a NumPy array.
    """
    tokenizer_kwargs = dict(return_tensors="pt", padding=True, truncation=True, max_length=512)
    batch = tokenizer(text, **tokenizer_kwargs)
    # Gradients are not needed when only computing embeddings.
    with torch.no_grad():
        token_states = model(**batch).last_hidden_state
        mean_state = torch.mean(token_states, dim=1)
    return mean_state.squeeze().numpy()

def calculate_similarity(embedding1, embedding2):
    """Cosine similarity between two embeddings, computed as 1 - cosine distance.

    Inputs are coerced to NumPy arrays and squeezed before the comparison.
    """
    flat_a = np.squeeze(np.asarray(embedding1))
    flat_b = np.squeeze(np.asarray(embedding2))
    cosine_distance = cosine(flat_a, flat_b)
    return 1 - cosine_distance

# Custom Environment for text generation
class TextGenerationEnv(gym.Env):
    """Single-step gym environment in which each action selects a prompt.

    On every ``step`` the chosen prompt is fed to a text generator (GPT-2, or a
    GPT-Neo pipeline for any other ``model_type``). The generated text is
    embedded with BERT; the embedding is returned as the observation and its
    mean cosine similarity to ``reference_embeddings`` is the reward. Every
    episode ends after one step.

    Args:
        reference_embeddings: Iterable of embeddings the generated text is
            compared against.
        model_type: ``'gpt2'`` for GPT-2; any other value selects GPT-Neo 1.3B.
        max_length: ``max_length`` passed to the generator.
        num_prompts: Size of the discrete action space; must not exceed the
            number of built-in prompts.

    Raises:
        ValueError: If ``num_prompts`` exceeds the number of available prompts.
    """

    def __init__(self, reference_embeddings, model_type='gpt2', max_length=300, num_prompts=2):
        super().__init__()
        self.model_type = model_type
        # Initialize tokenizer and model appropriately for GPT-2 and GPT-Neo
        if model_type == 'gpt2':
            self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            self.model = GPT2LMHeadModel.from_pretrained('gpt2')
        else:  # GPT-Neo
            self.tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
            self.model = pipeline('text-generation', model="EleutherAI/gpt-neo-1.3B")

        self.reference_embeddings = reference_embeddings
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.max_length = max_length

        self.prompts = ["A love letter to my beloved", "A letter to my love"]
        # An action space larger than the prompt list would let the agent pick
        # an index that step() cannot resolve, so fail early instead.
        if num_prompts > len(self.prompts):
            raise ValueError(
                f"num_prompts={num_prompts} exceeds the {len(self.prompts)} available prompts"
            )

        self.action_space = spaces.Discrete(num_prompts)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(768,), dtype=np.float32)

        # History of every text produced by step(); written out by save_generated_texts().
        self.generated_texts = []

    def seed(self, seed=None):
        """Seed torch and NumPy; returns the seed in a list."""
        # torch.manual_seed rejects None, whereas np.random.seed(None) reseeds
        # from OS entropy, so only the torch call needs guarding.
        if seed is not None:
            torch.manual_seed(seed)
        np.random.seed(seed)
        return [seed]

    def step(self, action):
        """Generate text for the prompt chosen by ``action`` and score it.

        Returns:
            tuple: ``(observation, reward, done, info)`` where ``observation``
            is the embedding of the generated text, ``reward`` is its mean
            similarity to the reference embeddings, ``done`` is always ``True``
            and ``info`` is an empty dict.
        """
        prompt = self.prompts[int(action)]
        generated_text = self.generate_text(prompt)
        self.generated_texts.append(generated_text)
        # Embed once and reuse the vector for both the observation and the
        # reward instead of running BERT twice on the same text.
        observation = text_to_embedding(generated_text, self.bert_tokenizer, self.bert_model)
        reward = float(self._mean_reference_similarity(observation))
        done = True

        return observation, reward, done, {}

    def evaluate_text(self, text):
        """Return the mean cosine similarity of ``text`` to the reference embeddings."""
        new_embedding = text_to_embedding(text, self.bert_tokenizer, self.bert_model)
        return self._mean_reference_similarity(new_embedding)

    def _mean_reference_similarity(self, embedding):
        """Average the similarity of ``embedding`` to every reference embedding."""
        similarities = [calculate_similarity(embedding, ref_emb) for ref_emb in self.reference_embeddings]
        return np.mean(similarities)

    def generate_text(self, prompt):
        """Generate a continuation of ``prompt`` with the configured generator."""
        if self.model_type == 'gpt2':
            input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
            # GPT-2 defines no pad token; pass EOS explicitly so generate()
            # does not have to guess and warn on every call.
            output = self.model.generate(
                input_ids,
                max_length=self.max_length,
                num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            return self.tokenizer.decode(output[0], skip_special_tokens=True)
        else:  # GPT-Neo
            response = self.model(prompt, max_length=self.max_length)
            return response[0]["generated_text"]

    def reset(self):
        """Return the embedding of a fixed placeholder text as the initial observation."""
        initial_text = "Resetting environment"
        observation = text_to_embedding(initial_text, self.bert_tokenizer, self.bert_model)
        return observation

    def render(self, mode='console'):
        """Print a placeholder message; only ``mode='console'`` is supported."""
        if mode != 'console':
            raise NotImplementedError()
        print("Rendering...")

    def close(self):
        """No resources to release."""
        pass

    def save_generated_texts(self, filepath):
        """Saves the generated texts to a file, separated by blank lines."""
        with open(filepath, 'w', encoding='utf-8') as file:
            for text in self.generated_texts:
                file.write(text + "\n\n")

# Training setup
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Embed the reference letters once; the environment compares generated text to them.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
reference_texts = read_texts_from_folder('./data/external/LetterRefTrue')
reference_embeddings = [text_to_embedding(text, bert_tokenizer, bert_model) for text in reference_texts]

# Keep a direct handle on the raw environment. The factory lambda must not
# close over `env`, because that name is rebound to the vectorized wrapper on
# the very same line.
text_env = TextGenerationEnv(reference_embeddings=reference_embeddings, model_type='gpt2')
env = make_vec_env(lambda: text_env, n_envs=1)

model = PPO("MlpPolicy", env, verbose=1, n_steps=10)
model.learn(total_timesteps=100, progress_bar=True)
# Use the retained handle rather than env.envs[0].env, which depends on exactly
# how many wrapper layers make_vec_env put around the environment.
single_env = text_env

# Save the output texts
single_env.save_generated_texts('generated_texts.txt')
