In [None]:
import PyPDF2
import openai
import re 
import os 
import nltk
from nltk.tokenize import sent_tokenize


### Section 1: Prepare the data

In [None]:
# Download the 'punkt' sentence-tokenizer data from NLTK; sent_tokenize() is used further down.
# NOTE(review): newer NLTK releases may look up the 'punkt_tab' resource instead — confirm against the installed version.
nltk.download('punkt')

In [None]:
# The OpenAI key lives in a file one directory above the notebook so that it
# never has to appear in the notebook itself.
with open('../api.key', 'r') as key_file:
    raw_key = key_file.read()

# Drop surrounding whitespace / the trailing newline before handing it to the client
api_key = raw_key.strip()
openai.api_key = api_key

In [None]:
def remove_newlines(serie):
    """Flatten every string in a pandas Series onto a single line.

    Real newline characters and the literal two-character text ``\\n`` are
    each replaced by a space, then every run of consecutive spaces is
    collapsed to a single space.

    Parameters
    ----------
    serie : pandas.Series
        Series of strings (accessed through the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        A new Series with the cleaned strings; the input is not modified.
    """
    # regex is passed explicitly: the default of Series.str.replace changed
    # between pandas versions, which altered what the '\\n' pattern matched
    # (and produced a FutureWarning on older releases).
    serie = serie.str.replace('\n', ' ', regex=False)
    serie = serie.str.replace('\\n', ' ', regex=False)

    # Collapse runs of any length; two fixed '  ' -> ' ' passes left longer
    # runs (5+ spaces) only partially collapsed.
    serie = serie.str.replace(r' {2,}', ' ', regex=True)
    return serie

def preprocess(text):
    """Normalise the whitespace of a block of extracted text.

    Every run of whitespace (spaces, tabs, newlines, ...) becomes a single
    space and leading/trailing whitespace is removed.

    Parameters
    ----------
    text : str
        Raw text, e.g. as returned by a PDF page extraction.

    Returns
    -------
    str
        The text on a single line with single-space separation.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # One pass covers newlines and repeated spaces alike.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Raw string so the Windows backslashes are taken literally; the previous
# literal relied on unrecognised escapes (\D, \M, \o, \e), which Python
# reports as invalid escape sequences. The resulting path is unchanged.
# TODO: this is an absolute, machine-specific location — consider making it
# relative to the notebook or configurable.
path = r"C:\Dropbox\Desktop\MAchineLearning\openai\embedding_littleprince\thelittleprince.pdf"

# 1-based number of the first page to extract
start_page = 1

sentences_list = []

reader = PyPDF2.PdfReader(path)

# Total number of pages in the document
num_pages = len(reader.pages)
print(num_pages)


In [None]:
# Extract every page from start_page onward, normalise its whitespace with
# preprocess(), and store the result one page per line in a plain-text file.
with open("thelittleprince.txt", "w") as out_file:
    for page_index in range(start_page - 1, num_pages):
        cleaned_page = preprocess(reader.pages[page_index].extract_text())
        out_file.write(cleaned_page + "\n")

In [50]:
# Load the extracted book text back from disk
with open("thelittleprince.txt", "r") as book_file:
    text = book_file.read()

# Break the full text into individual sentences with NLTK
sentences = sent_tokenize(text)

In [51]:
# Make sure the output folder exists (no error if it is already there)
os.makedirs("sentences", exist_ok=True)

# Number of sentences stored in each chunk file
num_sentences_per_file = 15

# NOTE(review): chunk files left over from an earlier run are not removed
# here, so a previous run that produced more chunks leaves stale files behind.
for i in range(0, len(sentences), num_sentences_per_file):
    # Files are numbered by chunk index: sentence_0000.txt, sentence_0001.txt, ...
    filename = f"sentence_{i // num_sentences_per_file:04d}.txt"
    filepath = os.path.join("sentences", filename)

    # One sentence per line, with no trailing newline after the last one
    with open(filepath, "w") as f:
        f.write("\n".join(sentences[i:i + num_sentences_per_file]))


In [52]:
import pandas as pd

# Collect (chunk name, chunk text) pairs, one per file in the 'sentences' folder
texts = []

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# dataframe's row order should be the same on every run.
for file in sorted(os.listdir("sentences")):

    # Skip anything that is not a chunk file (e.g. checkpoint folders)
    if not file.endswith(".txt"):
        continue

    # NOTE(review): the chunk files are written with the platform default
    # encoding but read back as ISO-8859-1 — confirm the two agree.
    with open(os.path.join("sentences", file), "r", encoding="ISO-8859-1") as f:
        text = f.read()

    # The chunk name is the file name without its ".txt" extension
    texts.append((file[0:-4], text))

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns = ['fname', 'text'])

# Prefix each text with its chunk name and flatten it onto a single line
df['text'] = df.fname + ". " + remove_newlines(df.text)

# to_csv() does not create missing directories, so make sure the target exists
os.makedirs('processed', exist_ok=True)
df.to_csv('processed/scraped.csv')
#df.head()

  serie = serie.str.replace('\\n', ' ')


Unnamed: 0,fname,text
0,sentence_0000,sentence_0000. 1!!!!! ! ! !The Little Prince w...
1,sentence_0001,"sentence_0001. ""#$%&'()*+,- /.+(*+,#+$:+/.43-#..."
2,sentence_0002,sentence_0002. My Drawing Number One. It looke...
3,sentence_0003,"sentence_0003. If one gets lost in the night, ..."
4,sentence_0004,sentence_0004. It was a question of life or de...


### Section 2: Create embeddings

In [53]:
import tiktoken

# cl100k_base is the encoding designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Reload the processed chunks and give the two columns their working names
df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Store, for every row, how many tokens its text encodes to
df['n_tokens'] = df['text'].apply(lambda row_text: len(tokenizer.encode(row_text)))

# Optional: histogram of the token counts per row
#df.n_tokens.hist()

In [54]:
max_tokens = 1800

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):
    """Split ``text`` into chunks that each stay within ``max_tokens`` tokens.

    The text is cut on the ``'. '`` separator and the resulting sentences are
    packed greedily, in order, into chunks. Token counts come from the
    module-level ``tokenizer`` (each sentence is measured with a leading
    space, plus one extra token of slack per sentence).

    Parameters
    ----------
    text : str
        The text to split.
    max_tokens : int
        Upper bound on the token budget of a single chunk.

    Returns
    -------
    list[str]
        The chunks in reading order. A single sentence that is itself longer
        than ``max_tokens`` is left out of the result.
    """
    # Split the text into sentences
    sentences = text.split('. ')
    last_index = len(sentences) - 1

    chunks = []
    tokens_so_far = 0
    chunk = []

    for index, sentence in enumerate(sentences):
        token = len(tokenizer.encode(" " + sentence))

        # The current chunk cannot take this sentence: close it and start a
        # new one. Nothing is emitted for an empty chunk, which previously
        # produced a bogus "." entry when the first sentence was oversized.
        if tokens_so_far + token > max_tokens:
            if chunk:
                chunks.append(" ".join(chunk))
            chunk = []
            tokens_so_far = 0

        # A sentence that can never fit in any chunk is skipped
        if token > max_tokens:
            continue

        # Every sentence except the last one lost its "." to the split;
        # restore it. The last sentence keeps whatever ending it had, so a
        # trailing period is not doubled.
        chunk.append(sentence + "." if index < last_index else sentence)
        tokens_so_far += token + 1

    # Emit the sentences still pending after the loop; without this the end
    # of the text was silently dropped.
    remainder = " ".join(chunk).rstrip()
    if remainder:
        chunks.append(remainder)

    return chunks

In [55]:
# Texts that all fit within max_tokens, in dataframe order
shortened = []

# Loop through the dataframe
for _, row in df.iterrows():
    row_text = row['text']

    # Skip rows without text. Missing values loaded by pd.read_csv are NaN,
    # which an `is None` check never matches, so use pd.isna (covers both).
    if pd.isna(row_text):
        continue

    # Rows over the token budget are split into several smaller chunks
    if row['n_tokens'] > max_tokens:
        shortened.extend(split_into_many(row_text))

    # Everything else is kept as-is
    else:
        shortened.append(row_text)

In [56]:
# Rebuild the dataframe from the shortened texts and recompute token counts
df = pd.DataFrame(shortened, columns=['text'])
df['n_tokens'] = df['text'].apply(lambda chunk_text: len(tokenizer.encode(chunk_text)))

In [57]:
from openai.embeddings_utils import distances_from_embeddings
import ast
import csv
import numpy as np  # needed for the cached branch below; it was never imported
import pandas as pd

embeddings_path = 'processed/embeddings.csv'

# Reuse embeddings from a previous run when available, so the API is only
# called once.
# NOTE(review): the cache is keyed on the file path only — delete the file
# after changing the source text or the chunking.
if os.path.exists(embeddings_path):
    df = pd.read_csv(embeddings_path, index_col=0)

    # The CSV stores each embedding as the text of a Python list. Parse it with
    # ast.literal_eval rather than eval, so file contents are never executed.
    df['embeddings'] = df['embeddings'].apply(ast.literal_eval).apply(np.array)

else:
    # One embedding request per row of text
    df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])

    # Make sure the target folder exists so the freshly computed embeddings
    # are not lost to a failed write
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    df.to_csv(embeddings_path)
    #df.head()

### Section 3: Ask questions

In [58]:
def create_context(question, df, max_len=1800, size="ada"):
    """
    Build the context for a question from the dataframe rows closest to it.

    The question is embedded, every row of ``df`` is scored by cosine distance
    to that embedding (stored in a ``distances`` column on ``df``), and row
    texts are collected from nearest to farthest until the ``max_len`` token
    budget would be exceeded. ``size`` is accepted but not used here.

    Returns the selected texts joined by a "###" separator line.
    """
    # Embed the question with the same model used for the texts
    embedding_response = openai.Embedding.create(input=question, engine='text-embedding-ada-002')
    question_vector = embedding_response['data'][0]['embedding']

    # Score every row against the question
    df['distances'] = distances_from_embeddings(
        question_vector,
        df['embeddings'].values,
        distance_metric='cosine',
    )

    selected_texts = []
    budget_used = 0

    # Walk the rows from closest to farthest
    ranked_rows = df.sort_values('distances', ascending=True)
    for _, candidate in ranked_rows.iterrows():

        # Each text costs its token count plus 4 tokens of overhead
        budget_used += candidate['n_tokens'] + 4

        # Stop at the first row that would overflow the budget
        if budget_used > max_len:
            break

        selected_texts.append(candidate["text"])

    return "\n\n###\n\n".join(selected_texts)

In [59]:
def answer_question(
    df,
    model="text-davinci-003",
    question="Provide a summary of the content.",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None):

    """
    Answer a question using the dataframe texts most similar to it as context.

    The context is assembled by create_context() and sent together with the
    question to the completion model. Returns the stripped answer text, or an
    empty string (after printing the error) when the completion call fails.
    """
    context = create_context(question, df, max_len=max_len, size=size)

    # In debug mode, show the context that will be sent to the model
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Instruction + retrieved context + question, ending on an open "Answer:"
        prompt = f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:"

        completion = openai.Completion.create(
            prompt=prompt,
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        first_choice = completion["choices"][0]
        return first_choice["text"].strip()
    except Exception as error:
        # Best effort: report the problem and return an empty answer
        print(error)
        return ""

In [60]:
# Demo query: authorship of the book
answer_question(df, question="Who wrote this book?", debug=False)

'The book was written and illustrated by Antoine de Saint Exupéry.'

In [61]:
# Demo query: overall subject of the book
answer_question(df, question="What is this book about?", debug=False)

'This book is about the Little Prince and his adventures as he travels around the universe.'

In [62]:
# Demo query: a specific detail. The question text (including the "Waht" misspelling)
# is left exactly as sent, since the recorded output belongs to this string.
answer_question(df, question="Waht do astronomers do when they discover a planet?", debug=False)

'They give it a number, such as "Asteroid 325".'

In [63]:
# Demo query: a specific event in the story
answer_question(df, question="What did the little prince do in the morning of his departure?", debug=False)

'He said goodbye to the fox and took his leave.'

In [64]:
# Demo query: characters met along the way. The question text (including the
# "animales" misspelling) is left exactly as sent, since the recorded output belongs to this string.
answer_question(df, question="What animales and flowers does the Little Prince meet?", debug=False)

'The Little Prince meets a flower with three petals, a fox, and roses.'

In [65]:
# Demo query: lessons taught by the fox
answer_question(df, question="What lessons he learns from the fox?", debug=False)

'The fox teaches the little prince that "It is only with the heart that one can see rightly; what is essential is invisible to the eye" and that "You become responsible, forever, for what you have tamed. You are responsible for your rose."'

In [66]:
# Demo query: first of three phrasings about the three-petalled flower
answer_question(df, question="What lessons he learns from the flower with three petals?", debug=False)

"I don't know."

In [67]:
# Demo query: second phrasing about the three-petalled flower
answer_question(df, question="What is the story with the flower with three petals?", debug=False)

'The little prince crossed the desert and met with only one flower. It was a flower with three petals, a flower of no account at all. "Good morning," said the little prince. "Good morning," said the flower. "Where are the men?" the little prince asked, politely. The flower had once seen a caravan passing. "Men?" she echoed. "I think there are six or seven of them in existence. I saw them, several years ago. But one never knows where to find them. The wind blows them away.'

In [68]:
# Demo query: third, open-ended phrasing about the three-petalled flower
answer_question(df, question="Anything to say about the flower with three petals?", debug=False)

"The little prince was surprised by the flower's coquettishness and her mysterious adornment. He was also amazed that she asked for a screen to protect her from drafts. She eventually told the little prince that she loved him and asked for his forgiveness."

In [69]:
# Demo query: description of the Little Prince's planet
answer_question(df, question="Please describe the Little Prince's planet", debug=False)

"The Little Prince's planet was very small, scarcely any larger than a house. It had no people, no oceans, no mountains, no towns, no rivers, and no deserts."

In [70]:
# Demo query: an open-ended takeaway from the book
answer_question(df, question="What can we learn from this book? please give me one lesson for me.", debug=False)

'One lesson from this book is that true friendship cannot be bought from a shop, and it must be tamed with patience and understanding.'