In [19]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import numpy as np
from collections import deque

In [20]:
# Load GPT-2 pre-trained model and tokenizer from Hugging Face
model_name = "gpt2"  # Using the smaller model (gpt2)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Initialize tokenizer with padding on the left side
tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding_side='left')

# Set pad_token to eos_token if not set already
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Set the model to evaluation mode (this disables dropout and other training behaviors)
model.eval()

Using pad_token, but it is not set yet.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [21]:
# Initialize the cache with a maximum size
cache_size = 5  # Maximum number of cache entries
cache = deque(maxlen=cache_size)

In [22]:
# Function to add content to the cache
def add_to_cache(text, cache):
    cache.append(text)

# Function to retrieve relevant content from the cache
def retrieve_from_cache(query, cache, k=3):
    return list(cache)[-k:]

# Modified generate function with attention mask and padding
def generate_with_cache(query, cache, model, tokenizer, max_length=1024, max_new_tokens=50):
    # Retrieve relevant cached information
    relevant_texts = retrieve_from_cache(query, cache)
    
    # Combine the relevant cache text with the current query to form the complete context
    context = " ".join(relevant_texts) + " " + query
    
    # Tokenize the input
    inputs = tokenizer.encode(context, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
    
    # Ensure the attention mask is set
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)
    
    # Ensure the total input length is within the model's max length
    if inputs.shape[1] > max_length:
        inputs = inputs[:, -max_length:]
        attention_mask = attention_mask[:, -max_length:]
    
    # Check if there are any invalid token ids (out of vocab range)
    if torch.any(inputs >= model.config.vocab_size):
        raise ValueError("One or more tokens are out of vocabulary range.")
    
    # Generate the output using GPT-2
    with torch.no_grad():
        outputs = model.generate(inputs, max_new_tokens=max_new_tokens, attention_mask=attention_mask, pad_token_id=tokenizer.pad_token_id, num_return_sequences=1, no_repeat_ngram_size=2)
    
    # Decode the generated response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return generated_text

In [23]:
# Adding some example text to the cache
add_to_cache("Artificial intelligence (AI) refers to the simulation of human intelligence in machines.", cache)
add_to_cache("The Turing test is used to evaluate a machine's ability to exhibit intelligent behavior.", cache)

In [29]:
# Improved prompt to get a more relevant answer
query = "Tell me about the history and future of artificial intelligence, its applications, and current advancements."

# Generating response with additional parameters to control quality
outputs = model.generate(input_ids, 
                         max_length=1024,  # Set total length including prompt
                         no_repeat_ngram_size=2,  # Avoid repeating n-grams
                         temperature=0.7,  # Control randomness
                         top_p=0.9,  # Use nucleus sampling for better variety
                         top_k=50,  # Limit next token choices
                         attention_mask=attention_mask, 
                         pad_token_id=tokenizer.pad_token_id)

# Decode and display the result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)



Tell me more about artificial intelligence.

I'm a computer scientist at the University of California, Berkeley. I'm also a professor of computer science at Stanford University. And I've been working on artificial-intelligence for a long time. In fact, I was one of the first to write about it in a paper published in the journal Nature. But I didn't know about the topic until I read the paper. It's a very interesting paper, and I think it's important to understand how it works. The paper is called "The Problem of Artificial Intelligence."
...
 (1) The problem of artificial intelligences is that they are not just machines, but also people. They are people who are able to do things that are impossible. (2) They can do anything that is impossible, even if they have no knowledge of it. This is a problem that we have to solve. We have a lot of problems with artificial intelligent systems. One of them is the problem with the human mind. If you have an artificial brain, you can't do any of the