In [4]:
## Use cases

#1 Competitor publications
#2 Financial news for traders

In [5]:
## Import libraries

from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import streamlit as st
import os
from dotenv import load_dotenv
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import json
from dotenv import dotenv_values
from googlesearch import search

In [6]:
## Setup env vars 

# take environment variables from .env.
load_dotenv()

# config = {"USER": "foo", "EMAIL": "foo@example.org"}
env_vars = dotenv_values(".env") 


In [7]:
## Define system relevant input data for application
HARD_LIMIT_CHAR = 10000 

In [8]:
## Functions

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)


def extract_json_values(input_str):
    results = []
    while input_str:
        try:
            value = json.loads(input_str)
            input_str = ""
        except json.decoder.JSONDecodeError as exc:
            if str(exc).startswith("Expecting value"):   
                input_str = input_str[exc.pos+1:]
                continue
            elif str(exc).startswith("Extra data"):
                value = json.loads(input_str[:exc.pos])
                input_str = input_str[exc.pos:]
        results.append(value)
    return results


In [14]:
## User input data

#TODO : DO URL Check and show message when not valid

#Web Scrapping and UI
#url_to_watch = st.text_input("Input your url here","https://www.nytimes.com/international/section/politics")
url_to_watch = st.text_input("Input your url here","https://laion.ai/blog/")

In [15]:
## Process website and save content to file

html = urllib.request.urlopen(url_to_watch).read()
text_from_webpage = text_from_html(html)
#TODO : Fixe this limit, in a smarter way
text_from_webpage = text_from_webpage[:HARD_LIMIT_CHAR]

# Logging
file_path = "output.txt"
with open(file_path, "w") as file:
    file.write(text_from_webpage)
print("Variable content saved to the file:", file_path)



Variable content saved to the file: output.txt


  texts = soup.findAll(text=True)


In [26]:
# LLM part
# if st.button('Analyze'):
prompt = PromptTemplate(
    input_variables=["webpage"],
    template="In this web page, can you find a pattern, list all the articles and their publication dates. Do not mix the date with the reading time. Limit yourself to the first 3. In Json format, using these keys \"title\", \"date\". No Other text. \
        webpage :  \"{webpage}\"",
    )
prompt_to_send = prompt.format(webpage=text_from_webpage)


# Count tokens in request
tokens_total = (len(text_from_webpage) + len(prompt_to_send))/4
print("This request contained the following number of tokens: ~" + str(round(tokens_total)))

This request contained the following number of tokens: ~3456


In [None]:
llm = OpenAI(openai_api_key=env_vars['OPENAI_API_KEY'],temperature=0.9)
result_from_chatgpt = llm(prompt_to_send).replace("\n", "")
print(result_from_chatgpt)
file_path = "gpt_out.txt"

parsed_articles = json.loads(result_from_chatgpt)
#Logging
file_path = "output_gpt.txt"
with open(file_path, "w") as file:
    file.write(result_from_chatgpt)
print("Variable content saved to the file:", file_path)
