In [138]:
## Use cases

#1 Competitor publications
#2 Financial news for traders

In [148]:
## Import libraries

from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import streamlit as st
import os
from dotenv import load_dotenv
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import json
from dotenv import dotenv_values
from googlesearch import search
import requests

In [140]:
## Setup env vars 

# take environment variables from .env.
load_dotenv()

# config = {"USER": "foo", "EMAIL": "foo@example.org"}
env_vars = dotenv_values(".env") 


In [141]:
## Define system relevant input data for application
HARD_LIMIT_CHAR = 10000

In [142]:
## Functions

def tag_visible(element):
    if element.parent.name in ['a','style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)


def extract_json_values(input_str):
    results = []
    while input_str:
        try:
            value = json.loads(input_str)
            input_str = ""
        except json.decoder.JSONDecodeError as exc:
            if str(exc).startswith("Expecting value"):   
                input_str = input_str[exc.pos+1:]
                continue
            elif str(exc).startswith("Extra data"):
                value = json.loads(input_str[:exc.pos])
                input_str = input_str[exc.pos:]
        results.append(value)
    return results


In [143]:
## User input data

#TODO : DO URL Check and show message when not valid

#Web Scrapping and UI
url_to_watch = st.text_input("Input your url here","https://www.nytimes.com/international/section/politics")
#url_to_watch = st.text_input("Input your url here","https://laion.ai/blog/")

In [144]:
## Process website and save content to file

html = urllib.request.urlopen(url_to_watch).read()
text_from_webpage = text_from_html(html)
#TODO : Fixe this limit, in a smarter way
text_from_webpage = text_from_webpage[:HARD_LIMIT_CHAR]

print(html)
print("BREAK#################")
print(text_from_webpage)

# Logging
file_path = "output.txt"
with open(file_path, "w") as file:
    file.write(text_from_webpage)
print("Variable content saved to the file:", file_path)



BREAK#################
   Sections SEARCH  Advertisement Supported by U.S. Politics Highlights Photo Credit Sophie Park for The New York Times The notice from the office of the special counsel Jack Smith suggested that an indictment was on the horizon in the investigation into the former president’s handling of classified documents.  By  Alan Feuer , Maggie Haberman , William K. Rashbaum and Glenn Thrush Photo Credit Kenny Holston/The New York Times Members of the ultraconservative House Freedom Caucus refused to surrender control of the floor, forcing G.O.P. leaders to scrap votes for the week and leaving the speaker facing what he conceded was “chaos.”  By  Annie Karni Photo Credit The former vice president — and now rival — to Donald Trump gave his most aggressive criticism of his former boss, portraying him as unfit to be president.  By  Jonathan Swan Photo Credit David Degner for The New York Times political memo The flights to California illustrate the broader bet Gov. Ron DeSant

  texts = soup.findAll(text=True)


In [145]:
# LLM part
# if st.button('Analyze'):
prompt = PromptTemplate(
    input_variables=["webpage"],
    template="In this web page, can you find a pattern, list all the articles, their hyperlinks and their publication dates. Do not mix the date with the reading time. Limit yourself to the first 3. In Json format, using these keys \"title\", \"date\", \"hyperlink\". No Other text. \
        webpage :  \"{webpage}\"",
    )
prompt_to_send = prompt.format(webpage=text_from_webpage)


# Count tokens in request
tokens_total = (len(text_from_webpage) + len(prompt_to_send))/4
print("Number of tokens in request: ~" + str(round(tokens_total)))
print("Request costs: ~ $" + str(round(((tokens_total/1000)*0.002),2)))

Number of tokens in request: ~1952
Request costs: ~ $0.0


In [146]:
llm = OpenAI(openai_api_key=env_vars['OPENAI_API_KEY'],temperature=0.9)
result_from_chatgpt = llm(prompt_to_send).replace("\n", "")
#print(result_from_chatgpt)
print(json.dumps(json.loads(result_from_chatgpt), indent=4))
file_path = "gpt_out.txt"

parsed_articles = json.loads(result_from_chatgpt)
#Logging
file_path = "output_gpt.txt"
with open(file_path, "w") as file:
    file.write(result_from_chatgpt)
print("Variable content saved to the file:", file_path)


[
    {
        "title": "Jack Smith Indictment Imminent, Special Counsel Office Notifies",
        "date": "2023",
        "hyperlink": ""
    },
    {
        "title": "Ultraconservative House Freedom Caucus Causes Chaos in Congress",
        "date": "2023",
        "hyperlink": ""
    },
    {
        "title": "Joe Biden Critiques Donald Trump's Fitness to be President",
        "date": "2023",
        "hyperlink": ""
    }
]
Variable content saved to the file: output_gpt.txt


In [149]:
search = 'Ultraconservative House Freedom Caucus Causes Chaos in Congress'
url = 'https://www.google.com/search'

headers = {
	'Accept' : '*/*',
	'Accept-Language': 'en-US,en;q=0.5',
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82',
}
parameters = {'q': search}

content = requests.get(url, headers = headers, params = parameters).text
soup = BeautifulSoup(content, 'html.parser')

search = soup.find(id = 'search')
first_link = search.find('a')

print(first_link['href'])

https://www.nytimes.com/2023/06/07/us/politics/mccarthy-house-republicans-mutiny.html


TODOS
- get summarized article via google search
- allow input questions about article
- answer question
- filter based on user interests
- use langchain json parser