In [None]:
# !pip install pandas plotly numpy scipy scikit-learn PyPDF2 termcolor python-dotenv

# Support functions
## Functions:

    prettyPrint(text): Prints the input text in a pretty format with a maximum width of 100 characters per line

    get_response(messages): Sends a message to OpenAI's GPT-3 model with the provided messages and returns the response

    get_answer(user_query, combined_text): Combines the user's question and the provided text and sends it to OpenAI's GPT-3 model to get a short and precise answer in bullet points, then returns the answer

    get_answer_stream(user_query, combined_text): Continuously streams the response from OpenAI's GPT-3 model for the provided user query and text combination in bullet points
    
## Classes:
    
    aiSummarizer: A class that initializes an OpenAI API key and a GPT-3 model, preprocesses text by removing unnecessary characters and stop words, and summarizes input text with the GPT-3 model by splitting the input text into chunks and sending each chunk as a message to the GPT-3 model to get a summarized response 

In [None]:
import textwrap
import openai
import pandas as pd
import numpy as np
from getpass import getpass
import sklearn
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import pandas as pd
from PyPDF2 import PdfReader
import re
from IPython.display import Markdown
from nltk.tokenize import sent_tokenize
import os
from dotenv import load_dotenv

load_dotenv()
openai.api_key  = os.getenv('API_KEY')



def prettyPrint(text):
  # wrap the text to a maximum width of 20 characters
  wrapped_lines = textwrap.wrap(text, width=100)

  # print each wrapped line
  for line in wrapped_lines:
      print(line)
      
      
import re
import textwrap
import openai
from nltk.corpus import stopwords

class aiSummarizer:
    
    def __init__(self):
        self.openai_api_key = os.getenv('API_KEY')
        openai.api_key = self.openai_api_key
        self.openai_model = "gpt-3.5-turbo"
        self.stop_words = set(stopwords.words('english'))
        self.summary = ""
        

    def preprocess_text(self, text):
        # Remove any citations, parentheses, and brackets
        text = re.sub(r"\[[^\]]*\]", "", text)
        text = re.sub(r"\([^\)]*\)", "", text)
        text = re.sub(r"\{[^\}]*\}", "", text)

        # Remove any extra white space
        text = " ".join(text.split())

        # Remove stop words
        words = text.split()
        words = [word for word in words if word.lower() not in self.stop_words]
        text = " ".join(words)
        return text
    
    def summarize_text(self, text, prompt="Summarize, keep it short but capture all the important points of the text below:", max_length=500):
        text_chunks = self.split_text(text, max_length=4000)
        summarized_text = ""

        for i, chunk in enumerate(text_chunks):
            print(f"Summarizing chunk {i+1} of {len(text_chunks)}")
            messages = [
              {"role": "system", "content": "You are a super intelligence AI assistant"},
              {"role": "user", "content": prompt + chunk}
            ]
            response = openai.ChatCompletion.create(
                model=self.openai_model,
                messages=messages,
                temperature=0
            )
            summarized_text += response['choices'][0]['message']['content'] 
        print("Summarizing process completed")
        return summarized_text
    
    def split_text(self, text, max_length):
        if len(text) <= max_length:
            return [text]

        chunks = textwrap.wrap(text, max_length)
        return chunks

# # Initialize the AIAssistant
# aiSummarizer = aiSummarizer()


# # Summarize the input text
# summary = aiSummarizer.summarize_text(text)
# print(len(summary))
# prettyPrint(summary)

#Use the user query and combined text to get the answer:
def get_response(messages):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=2000
    )
    return response

def get_answer(user_query, combined_text):
    messages = [
          {"role": "system", "content": "You are a super intelligence AI assistant"},
          {"role": "user", "content": f"Question: {user_query}, context: {combined_text}. \
              Keep the answer short and precise. DO NOT adjust the original question. Show both question and answer in Markdown format, bullet points."},
        ]
    response = get_response(messages)
    return response['choices'][0]['message']['content']

#Use the user query and combined text to get the answer:
def get_response(messages):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=2000
    )
    return response

def get_answer(user_query, combined_text):
    messages = [
          {"role": "user", "content": f"Question: {user_query}, context: {combined_text}. \
              Keep the answer short and precise. DO NOT adjust the original question. Show both question and answer in Markdown format, bullet points."},
        ]
    response = get_response(messages)
    return response['choices'][0]['message']['content']

from termcolor import colored

def get_answer_stream(user_query, combined_text):
    for chunk in openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages = [
          
          {"role": "user", "content": f""" This is a program let the user ask questions about the file.
                user question: {user_query}?, file: {combined_text}.
                Do not change the question. Keep answer short and precise, cite your source. Show both question and answer in Markdown format, use bullet points in answer
                    """},
        ],
        stream=True,
    ):
        content = chunk["choices"][0].get("delta", {}).get("content")
        if content is not None:
            print(colored(content, 'white'), end='')
        

# PDF to embedding
    The pdf_embed_df() function takes a PDF file name as input and returns a dataframe with text chunks and their embeddings.
        The function reads the PDF file and extracts its text using PyPDF2.
        The extracted text is split into chunks of 500 characters and stored in a dataframe.
        The function saves the processed text and embeddings to separate CSV files.
        The function uses OpenAI's text-embedding-ada-002 model to generate embeddings for the text chunks.

In [93]:


def pdf_embed_df():
    # Ask the user to input the PDF file name
    pdf_file = input("Enter the PDF file name: ")

    print(f"Reading PDF file: {pdf_file}")
    # Read the PDF file using PyPDF2
    inputpdf = PdfReader(open(pdf_file, "rb"))

    pdf_text = ''  # Initialize an empty string to store the PDF text
    # Loop over each page of the PDF file
    for i in range(len(inputpdf.pages)):
        text = inputpdf.pages[i].extract_text()  # Extract the text from the current page
        pdf_text += text  # Append the cleaned text to the `pdf_text` string
    # print(f"PDF text extracted successfully: {pdf_text}")
    
    # Split the text into chunks of 500 characters
    chunks = [pdf_text[i:i+500] for i in range(0, len(pdf_text), 500)]

    # Create a dataframe with the chunks
    df = pd.DataFrame(chunks, columns=['text'])

    # Remove the ".pdf" extension from the original file name
    file_name = pdf_file.split(".")[0]
    
    # Add a new column 'length' to the DataFrame
    df['length'] = df['text'].str.len()

    # Save the DataFrame to a CSV file with "processed_" prepended to the original file name
    processed_filename = f"processed_{file_name}.csv"
    print(f"Saving processed text as CSV file: {processed_filename}")
    df.to_csv(processed_filename, index=False)

    # Embed the text using OpenAI's text-embedding-ada-002 model
    print("Embedding text using OpenAI's text-embedding-ada-002 model...")
    df['embedding'] = df['text'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))

    # Save the DataFrame to a CSV file with "processed_" prepended to the original file name
    embeddings_filename = f"processed_{file_name}_embeddings.csv"
    print(f"Saving text embeddings as CSV file: {embeddings_filename}")
    df.to_csv(embeddings_filename, index=False)

    return df

df = pdf_embed_df()
display(df)

Reading PDF file: tesla.pdf
Saving processed text as CSV file: processed_tesla.csv
Embedding text using OpenAI's text-embedding-ada-002 model...
Saving text embeddings as CSV file: processed_tesla_embeddings.csv


Unnamed: 0,text,length,embedding
0,Q4 and FY 2022 Update\n1Highlights 03\nFinanci...,500,"[-0.003069573547691107, -0.016441890969872475,..."
1,6.0% in Q4\n$13.7B GAAP operating income in 20...,500,"[-0.022131500765681267, -0.023385969921946526,..."
2,that there are questions about the near -\nte...,500,"[-0.03230626508593559, -0.02757885493338108, 0..."
3,many years. \nImproving affordability is neces...,500,"[-0.008768563158810139, 0.0005116421962156892,..."
4,g \nfocused on the long -term potential of aut...,500,"[-0.003139789216220379, -0.019092140719294548,..."
...,...,...,...
76,ing our products and features cost-effectively...,500,"[0.00832277536392212, -0.0011817450867965817, ..."
77,at Gigafactory Nevada and Gigafactory Shanghai...,500,"[0.012574231252074242, -0.026666156947612762, ..."
78,to maintain public credibility and confidence ...,500,"[0.009611119516193867, -0.02292659506201744, 0..."
79,s \nand laws applicable to our operations and ...,500,"[-0.006075656041502953, -0.018671853467822075,..."


# Run program
    Prompt user for a search query
    Embed the query using OpenAI's text-embedding-ada-002 model
    Calculate cosine similarity between the query vector and each text block in the preprocessed PDF
    Sort the blocks by similarity and combine the top 5 most similar blocks
    If the length of the combined text is greater than 3500, summarize it using OpenAI's GPT-3.5 model
    Print the answer to the user in a stream format using Markdown format

In [107]:
# # Replace 'file_name.csv' with the path of your actual CSV file
# csv_file = 'processed_tesla_embeddings.csv'

# # Read the CSV file into a DataFrame
# df = pd.read_csv(csv_file)
display(df)


# Initialize the AIAssistant
ai_summarizer = aiSummarizer()

        
def get_answer_stream(user_query, combined_text):
    for chunk in openai.ChatCompletion.create(
        model="gpt-4", #gpt-3.5-turbo
        messages = [
          
          {"role": "user", "content": f"""Follow the instruction carefully.
            This is a program let the user ask questions about their file. File: {combined_text}\n User question: {user_query}? \
              Only answer what asked, do not change the question. 
              Keep answer short and precise, cite the source. 
              Show both question and answer in Markdown format.
              Use bullet points for answer: \
                
                """
                },
        ],
        stream=True,
    ):
        content = chunk["choices"][0].get("delta", {}).get("content")
        if content is not None:
            print(colored(content, 'green'), end='')
        
        
while True:
    # Prompt user for search query
    user_query = input("Search earnings for a sentence (Press 'esc' to quit): ")
    
    # If user leaves input empty or hits "esc" key, break out of the loop
    if not user_query or user_query == '\x1b':
        break
    
    user_query_vector = get_embedding(user_query, engine="text-embedding-ada-002")

    # Do similarity search
    df["similarities"] = df['embedding'].apply(lambda x: cosine_similarity(x, user_query_vector))
    df = df.sort_values("similarities", ascending=False)

    # Display search results
    # display(df.head())

    # Combine the text from the 3 best matches
    top_blocks = df.head(4)['text'].tolist()
    combined_text = ' '.join(top_blocks)
    length = len(combined_text)  
    # print(f"Length of original text: {length}")
    # prettyPrint(combined_text)

    # If length of the combined text is greater than 3500, then summarize it
    if length > 3500:
        print("Summarizing combined text...")

        # Summarize the input text
        summary = ai_summarizer.summarize_text(combined_text)
    else:
        summary = combined_text
    # #Get answer streaming:
    # get_answer_stream(user_query, combined_text)
    
    # Get the answer markdown 
    # answer = get_answer(user_query, summary)

    # Display the answer in Markdown format
    # display(Markdown(answer))
    get_answer_stream(user_query, summary)
    print("\n")


Unnamed: 0,text,length,embedding,similarities
41,"8 621 599 \nTotal automotive revenue 15,967 16...",500,"[0.004110813606530428, -0.022544672712683678, ...",0.807902
42,"1,579 1,605 \nTotal cost of revenues 12,872 13...",500,"[-0.018536904826760292, -0.009518403559923172,...",0.796155
59,"tion and impairment 1,901 2,154 2,322 2,911 3,...",500,"[-0.016233323141932487, -0.003794889198616147,...",0.767976
11,"2,101) (1,327) (3,157) (6,482) (7,158) 10%\nFr...",500,"[-0.007092054933309555, -0.02074492536485195, ...",0.761009
1,6.0% in Q4\n$13.7B GAAP operating income in 20...,500,"[-0.022131500765681267, -0.023385969921946526,...",0.760369
...,...,...,...,...
77,at Gigafactory Nevada and Gigafactory Shanghai...,500,"[0.012574231252074242, -0.026666156947612762, ...",0.681176
23,house 4680 cells in a single \nweek to make ov...,500,"[0.007835561409592628, -0.026911381632089615, ...",0.679665
25,and Full Self -Driving (FSD)\nWe have now rel...,500,"[0.015706012025475502, -0.017201192677021027, ...",0.672717
75,es. These forward -looking statements are base...,500,"[-0.0066046989522874355, -0.02313307486474514,...",0.671296


[32m**[0m[32mUser[0m[32m Question[0m[32m:**[0m[32m What[0m[32m is[0m[32m Tesla[0m[32m's[0m[32m current[0m[32m capacity[0m[32m?

[0m[32m**[0m[32mAnswer[0m[32m:[0m[32m**
[0m[32m-[0m[32m California[0m[32m Model[0m[32m S[0m[32m /[0m[32m Model[0m[32m X[0m[32m:[0m[32m [0m[32m100[0m[32m,[0m[32m000[0m[32m
[0m[32m-[0m[32m California[0m[32m Model[0m[32m [0m[32m3[0m[32m /[0m[32m Model[0m[32m Y[0m[32m:[0m[32m [0m[32m550[0m[32m,[0m[32m000[0m[32m
[0m[32m-[0m[32m Shanghai[0m[32m Model[0m[32m [0m[32m3[0m[32m /[0m[32m Model[0m[32m Y[0m[32m:[0m[32m >[0m[32m750[0m[32m,[0m[32m000[0m[32m
[0m[32m-[0m[32m Berlin[0m[32m Model[0m[32m Y[0m[32m:[0m[32m >[0m[32m250[0m[32m,[0m[32m000[0m[32m
[0m[32m-[0m[32m Texas[0m[32m Model[0m[32m Y[0m[32m:[0m[32m >[0m[32m250[0m[32m,[0m[32m000[0m[32m
[0m[32m-[0m[32m Cyber[0m[32mtruck[0m[32m,[0m[32m Tesla[0m[32m Semi[0m