In [9]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
from langchain_openai import OpenAIEmbeddings
from newspaper import Article
import threading
from urllib.parse import urlparse

In [4]:
# Load settings from a local .env file into the process environment.
load_dotenv()

# MongoDB client built from the MONGODB_URI environment variable.
mongo_client = MongoClient(os.environ.get("MONGODB_URI"))


In [5]:
# --- Knowledge DB --- #
# OpenAI key read from the environment (used when building the embedding model).
api_key = os.environ.get("OPENAI_API_KEY")

# Database and collection handles used by the duplicate search.
db = mongo_client.get_database("knowledge_db")
newsArticleCollection = db["tech_articles"]

In [14]:
def urlScrapeAndStore(url):
    """Download the article at ``url`` and return its extracted body text.

    Despite the name, nothing is stored: the page is downloaded and parsed
    with ``Article`` and only the body text is handed back. Source, author
    and title are not extracted because no caller receives them.

    Args:
        url: Address of the article to scrape.

    Returns:
        The parsed article's ``text`` attribute.
    """
    article = Article(url)
    article.download()
    article.parse()

    return article.text


In [15]:
def check_duplicate(article_embedding, collection):
    """Print the first vector-search hit for an article embedding.

    Runs a ``$vectorSearch`` aggregation on ``collection`` and prints the
    first document the search returns (projected fields plus a ``score``
    taken from ``vectorSearchScore``). Nothing is printed when the search
    yields no documents.

    Args:
        article_embedding: Query vector passed as ``queryVector``.
        collection: Object exposing ``aggregate(pipeline)``; the notebook
            passes a MongoDB collection.

    Returns:
        Always the empty string; the match is only reported via ``print``.
    """
    vector_search_stage = {
        "$vectorSearch": {
            "index": "vector_index",           # search index created on MongoDB
            "queryVector": article_embedding,  # embedding to search with
            "path": "embeddedContent",         # document field holding stored vectors
            "limit": 15,                       # number of results returned
            "numCandidates": 100,              # number of documents considered
        }
    }
    projection_stage = {
        "$project": {
            "_id": 0,
            # NOTE(review): 'plot' is requested, but the recorded outputs only
            # show 'title' and 'score' -- confirm the field exists.
            "plot": 1,
            "title": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    }

    results = collection.aggregate([vector_search_stage, projection_stage])

    # Only the first returned document is reported; the rest are ignored.
    try:
        first_hit = next(iter(results))
    except StopIteration:
        return ""

    print(first_hit)
    return ""

In [13]:
# Model used to embed article text before the duplicate search.
article_embeddings = OpenAIEmbeddings(
    api_key=api_key,
    model="text-embedding-3-large",
    dimensions=1536,
)


In [18]:
# Scrape the articles to test against the stored collection.
# NOTE(review): article and article2 use the same URL -- presumably to
# exercise the exact-duplicate case; confirm this is intentional.
article = urlScrapeAndStore("https://www.pbs.org/newshour/economy/openai-reveals-voice-engine-but-wont-yet-release-it-publicly-due-to-safety-concerns")
article2 = urlScrapeAndStore("https://www.pbs.org/newshour/economy/openai-reveals-voice-engine-but-wont-yet-release-it-publicly-due-to-safety-concerns")
article3 = urlScrapeAndStore("https://apnews.com/article/openai-voice-engine-aigenerated-clone-chatgpt-87da88d979ea5c75e98c75914740bd85")
article4 = urlScrapeAndStore("https://www.taipeitimes.com/News/biz/archives/2024/04/01/2003815756")
article5 = urlScrapeAndStore("https://www.oregonlive.com/business/2024/03/openai-reveals-voice-cloning-technology-but-wont-release-it-yet-over-impersonation-concerns.html")
article6 = urlScrapeAndStore("https://www.washingtontimes.com/news/2024/mar/29/openai-reveals-voice-engine-wont-yet-publicly-rele/")

# Embed each scraped text and run the duplicate search with that article's
# own embedding.
for scraped_text in (article, article2, article3, article4, article5, article6):
    try:
        article_embedding = article_embeddings.embed_query(scraped_text)
    except Exception as exc:
        # Skip the search: without a fresh embedding it would reuse the
        # previous article's vector (or fail on an undefined name).
        print(f"Error in content: {exc!r}")
        continue

    output = check_duplicate(article_embedding, newsArticleCollection)
    print(output)

{'title': 'OpenAI reveals Voice Engine, but won’t yet publicly release the risky AI voice-cloning technology', 'score': 0.9916411638259888}

{'title': 'OpenAI reveals Voice Engine, but won’t yet publicly release the risky AI voice-cloning technology', 'score': 0.9916411638259888}

{'title': 'OpenAI reveals Voice Engine, but won’t yet publicly release the risky AI voice-cloning technology', 'score': 0.9999868869781494}

{'title': 'OpenAI reveals Voice Engine, but won’t yet publicly release the risky AI voice-cloning technology', 'score': 0.9628946781158447}

{'title': 'OpenAI reveals Voice Engine, but won’t yet publicly release the risky AI voice-cloning technology', 'score': 0.9881179332733154}

{'title': 'OpenAI reveals Voice Engine, but won’t yet publicly release the risky AI voice-cloning technology', 'score': 0.9958173036575317}

