<a href="https://colab.research.google.com/github/ktynski/Marketing_Automations_Notebooks_With_GPT/blob/main/Copy_of_The_Ultimate_AI_Researcher_(Public).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/ading2210/poe-api.git

In [None]:
!pip3 install poe-api
!pip install pdfx
!pip install PyPDF2
!pip install arxiv
!pip install transformers
!pip install openai

In [None]:
import os
import urllib.request
import poe
import json
import PyPDF2
import pandas as pd
import openai
import arxiv
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
from transformers import GPT2Tokenizer
import time

# Define your OpenAI API key. The OPENAI_API_KEY environment variable is used
# when it is set, so the secret does not have to be pasted into the notebook;
# otherwise the placeholder below is used and must be replaced with a real key.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your API key")

# Function to convert PDF to text
def convert_pdf_to_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The file is opened in binary mode and parsed with ``PyPDF2.PdfReader``;
    the per-page ``extract_text()`` results are concatenated in page order
    with no separator between pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file handle is still open.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
        return ''.join(page_texts)


# Cached GPT-2 tokenizer; populated on first use by _get_gpt2_tokenizer().
_GPT2_TOKENIZER = None


def _get_gpt2_tokenizer():
    """Return a shared GPT-2 tokenizer, loading it only on the first call."""
    global _GPT2_TOKENIZER
    if _GPT2_TOKENIZER is None:
        _GPT2_TOKENIZER = GPT2Tokenizer.from_pretrained("gpt2")
    return _GPT2_TOKENIZER


def truncate_string_to_tokens(content, max_tokens):
    """Return ``content`` cut down to at most ``max_tokens`` GPT-2 tokens.

    Args:
        content: Text to truncate.
        max_tokens: Maximum number of GPT-2 tokens to keep.

    Returns:
        The text rebuilt from the first ``max_tokens`` tokens (the whole text
        when it is already short enough). The result is also printed.
    """
    # Loading the tokenizer is expensive, so reuse one instance across calls
    # instead of calling from_pretrained() for every document.
    tokenizer = _get_gpt2_tokenizer()
    content_tokens = tokenizer.tokenize(content)

    if len(content_tokens) > max_tokens:
        content_tokens = content_tokens[:max_tokens]

    truncated_content = tokenizer.convert_tokens_to_string(content_tokens)
    print("Truncated text")
    print(truncated_content)
    return truncated_content


# Function to call the OpenAI chat completion API and get a response
def gpt4_response(prompt):
    """Send ``prompt`` to the OpenAI chat API and return the reply text.

    Despite the function name, the request is made against the
    "gpt-3.5-turbo" model. A fixed system message asks the model for a
    0-100 relevance rating, and the user message is ``prompt`` followed by
    a "Score:" cue.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    system_message = {
        "role": "system",
        "content": "Please rate the relevance of the abstract to the query on a scale of 0 to 100 where 100 is extremely relevant for the query in relation to the paper title."
    }
    user_message = {
        "role": "user",
        "content": f"{prompt} \n Score:"
    }

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )

    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()


# Function to get relevance score using GPT model
def get_relevance_score(query, title):
    """Ask the GPT model how relevant a paper is to ``query`` and parse the score.

    Args:
        query: The search query the paper is rated against.
        title: Text inserted into the prompt as the paper title.

    Returns:
        The relevance score from the model's reply, as a float.

    Raises:
        ValueError: If the reply does not start with a number (optionally
            preceded by a "Score:" label).
    """
    # Local import so this block does not depend on the file's import cell.
    import re

    # Create the prompt for GPT
    prompt = f"How relevant is the paper titled '{title}' to the query '{query}'? Please rate on a scale from 0 to 100, with 100 being extremely relevant."

    # Get the response from GPT
    response = gpt4_response(prompt)

    # Take the number that opens the reply. Anchoring at the start avoids
    # picking up an unrelated number later in the text, while still accepting
    # replies such as "85", "85.", "85.5", "85/100" or "Score: 85".
    match = re.match(r"\s*(?:score\s*[:=]?\s*)?(\d+(?:\.\d+)?)", response, re.IGNORECASE)
    if match is None:
        raise ValueError(f"Could not parse a relevance score from response: {response!r}")

    relevance_score = float(match.group(1))
    print(f"Relevance Score:{relevance_score}")
    return relevance_score


# Function to download ArXiv papers
def download_arxiv_pdfs(df, download_dir):
    """Download the PDF of every row in ``df`` into ``download_dir``.

    Args:
        df: DataFrame with a 'pdf_url' column (URL to fetch) and a 'title'
            column (used to build the file name).
        download_dir: Directory the PDFs are written to; it is created if it
            does not exist yet.

    Each file is named after the title with every non-alphanumeric character
    replaced by '_'. A failed download is reported on stdout and does not
    stop the remaining downloads.
    """
    # Without the target directory every urlretrieve call would fail.
    # An empty download_dir means "current directory", so nothing to create.
    if download_dir:
        os.makedirs(download_dir, exist_ok=True)

    for _, row in df.iterrows():
        pdf_url = row['pdf_url']
        pdf_title = row['title']

        # Generate a valid filename by removing invalid characters
        pdf_filename = ''.join(c if c.isalnum() else '_' for c in pdf_title)
        pdf_path = os.path.join(download_dir, f"{pdf_filename}.pdf")

        try:
            urllib.request.urlretrieve(pdf_url, pdf_path)
            print(f"Downloaded {pdf_title} to {pdf_path}")
        except Exception as e:
            # Deliberately best-effort: report the failure and keep going.
            print(f"Failed to download {pdf_title}: {e}")


# Function to get summaries using AI model
def get_summaries(client, text):
    """Send ``text`` to the "a2_100k" Poe bot and return its full reply.

    Args:
        client: Poe client whose ``send_message`` streams response chunks.
        text: Document text; truncated to 25000 GPT-2 tokens before sending.

    Returns:
        The bot's reply, reassembled from the streamed chunks.
    """
    text = truncate_string_to_tokens(text, 25000)
    # Each chunk's "text_new" is the next fragment of one streamed message,
    # so the fragments are concatenated directly; joining them with spaces
    # would insert spurious blanks in the middle of words.
    # NOTE(review): the message is the raw paper text with no explicit
    # "summarize" instruction - confirm the bot is expected to summarize.
    fragments = [chunk["text_new"] for chunk in client.send_message("a2_100k", text)]
    return ''.join(fragments)

def get_summaries_with_backoff_and_timeout(client, text, retries=5, backoff_factor=0.1, timeout=30):
    """Call ``get_summaries`` with a per-attempt timeout and exponential backoff.

    Args:
        client: Poe client passed through to ``get_summaries``.
        text: Document text passed through to ``get_summaries``.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds; attempt ``i`` (0-based) is
            followed by a sleep of ``backoff_factor * 2 ** i`` before retrying.
        timeout: Seconds to wait for a single attempt.

    Returns:
        The summary from the first successful attempt. Returns ``None``
        without calling anything when ``retries`` is less than 1.

    Raises:
        TimeoutError: If the last attempt does not finish within ``timeout``.
        Exception: Whatever the last attempt raised, if it failed.
    """
    for attempt in range(retries):
        # The executor is managed by hand rather than with a ``with`` block:
        # leaving a ``with`` block calls shutdown(wait=True), which would block
        # until the hung call returns and make the timeout ineffective.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(get_summaries, client, text)
            done, _ = wait([future], timeout=timeout)

            if future not in done:
                raise TimeoutError("The get_summaries function timed out")
            return future.result()

        except Exception:
            if attempt >= retries - 1:  # last attempt: propagate the error
                raise
            time.sleep(backoff_factor * (2 ** attempt))  # exponential backoff
        finally:
            # Threads cannot be killed; a timed-out call keeps running in the
            # background, but the caller is no longer blocked on it.
            executor.shutdown(wait=False)



def get_relevance_score_with_backoff_and_timeout(query, title, retries=5, backoff_factor=0.1, timeout=30):
    """Call ``get_relevance_score`` with a per-attempt timeout and exponential backoff.

    Args:
        query: Search query passed through to ``get_relevance_score``.
        title: Paper text passed through to ``get_relevance_score``.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds; attempt ``i`` (0-based) is
            followed by a sleep of ``backoff_factor * 2 ** i`` before retrying.
        timeout: Seconds to wait for a single attempt.

    Returns:
        The relevance score from the first successful attempt. Returns
        ``None`` without calling anything when ``retries`` is less than 1.

    Raises:
        TimeoutError: If the last attempt does not finish within ``timeout``.
        Exception: Whatever the last attempt raised, if it failed.
    """
    for attempt in range(retries):
        # The executor is managed by hand rather than with a ``with`` block:
        # leaving a ``with`` block calls shutdown(wait=True), which would block
        # until the hung call returns and make the timeout ineffective.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(get_relevance_score, query, title)
            done, _ = wait([future], timeout=timeout)

            if future not in done:
                raise TimeoutError("The get_relevance_score function timed out")
            return future.result()

        except Exception:
            if attempt >= retries - 1:  # last attempt: propagate the error
                raise
            time.sleep(backoff_factor * (2 ** attempt))  # exponential backoff
        finally:
            # Threads cannot be killed; a timed-out call keeps running in the
            # background, but the caller is no longer blocked on it.
            executor.shutdown(wait=False)



def main(query="transgender etiology", max_results=10, threshold=70, download_dir='/content', sort_by=arxiv.SortCriterion.SubmittedDate):
    """Search arXiv, score each paper's relevance, then download and summarize the best ones.

    Args:
        query: Search terms; matched against abstracts via an "abs:" prefix.
        max_results: Maximum number of arXiv results to fetch.
        threshold: Papers must score strictly above this value to be kept.
        download_dir: Directory the PDFs are saved to (created if missing).
        sort_by: arXiv sort criterion for the search.

    Returns:
        DataFrame of the papers that passed the threshold, with a
        'relevance_score' column and, where summarization succeeded, a
        'poe summary' column. The same frame is written to
        'papers_with_summaries.csv'.
    """
    # Get ArXiv papers and create DataFrame
    search = arxiv.Search(
        query=str("abs:" + query),
        max_results=max_results,
        sort_by=sort_by
    )

    papers = list(search.results())
    print(papers)
    papers_df = pd.DataFrame([vars(paper) for paper in papers])

    # Set up POE client. You can get this by logging in to your account in the web browser, right click, go to inspect, then applications, then get the token from the cookie.
    client = poe.Client("your poe token")

    # Create the score column up front so the filter below still works when
    # the search returns nothing or every scoring call fails. Rows left as NaN
    # never compare greater than the threshold and are dropped.
    papers_df['relevance_score'] = float('nan')

    # Add relevance scores to DataFrame
    # NOTE(review): the abstract (row['summary']) is passed as the `title`
    # argument, so the prompt quotes the abstract as the paper title - confirm
    # this is intended.
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_index = {
            executor.submit(get_relevance_score_with_backoff_and_timeout, query, row['summary']): index
            for index, row in papers_df.iterrows()
        }
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                relevance_score = future.result()
            except Exception as e:
                print(f"Failed to get relevance score: {e}")
            else:
                papers_df.at[index, 'relevance_score'] = relevance_score

    # Filter DataFrame to only include papers with a relevance score above the threshold.
    # copy() makes the result an independent frame, so the summary column
    # added below is not written into a slice of the original.
    papers_df = papers_df[papers_df['relevance_score'] > threshold].copy()

    # Make sure the download target exists before fetching any PDF.
    if download_dir:
        os.makedirs(download_dir, exist_ok=True)

    # Download PDFs and get summaries for each paper
    for index, row in papers_df.iterrows():
        pdf_url = row['pdf_url']
        pdf_title = row['title']

        # Generate a valid filename by removing invalid characters
        pdf_filename = ''.join(c if c.isalnum() else '_' for c in pdf_title)
        pdf_path = os.path.join(download_dir, f"{pdf_filename}.pdf")

        try:
            urllib.request.urlretrieve(pdf_url, pdf_path)
            print(f"Downloaded {pdf_title} to {pdf_path}")

            text = convert_pdf_to_text(pdf_path)
            summary = get_summaries_with_backoff_and_timeout(client, text)

            papers_df.at[index, 'poe summary'] = summary

            print(f"Generated summary for {pdf_title}")
        except Exception as e:
            # Best-effort per paper: report the failure and move on.
            print(f"Failed to process {pdf_title}: {e}")

    # Save the final DataFrame to a CSV file
    papers_df.to_csv('papers_with_summaries.csv')
    return papers_df

# Run the main function
if __name__ == "__main__":
    # Keep the returned DataFrame at notebook scope: the following cells
    # display `papers_df` and export it, and main() only returns it.
    papers_df = main(query="F5 Tornado", max_results=50, threshold=50, download_dir='/content')



In [None]:
# Show papers_df as this cell's output; expects a notebook-level variable
# named papers_df to exist.
papers_df

In [None]:
# Write the notebook-level papers_df to summaries.csv.
papers_df.to_csv("summaries.csv")