# Optional setup: Download the spaCy model

In [17]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Using cached spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Using cached curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Using cached curated_tokenizers-0.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting torch>=1.12.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-c

# Source

In [2]:

from googlesearch import search
from typing import NamedTuple, Iterable

class QueryCore(NamedTuple):
    """The subject of a news search: a company or product plus a domain.

    Attributes:
        name: Display name, e.g. "Singularity 6".
        url: Bare domain associated with the subject, e.g. "singularity6.com".
    """

    name: str
    url: str

# Example companies and products for testing
s6 = QueryCore("Singularity 6", "singularity6.com")
palia = QueryCore("Palia", "palia.com")
radai = QueryCore("Rad AI", "radai.com")
pomelo = QueryCore("Pomelo Care", "pomelocare.com")
muddyrobot = QueryCore("Muddy Robot Games", "muddyrobot.com")
# Domain corrected from the misspelled "98poin6.com".
rafa = QueryCore("98point6", "98point6.com")

# Domains removed from search results: news_search turns each entry into a
# `-site:<domain>` term in the query.
excluded_domains = {
    "linkedin.com",
    "twitter.com",
    "x.com",
    "youtube.com",
    "reddit.com",
    "facebook.com",
}

def news_search(core_query: QueryCore, num=10, stop=10, pause=2, last_unit='m') -> Iterable[str]:
    """Yield result URLs for a news search on ``core_query.name``.

    The name is searched as an exact, double-quoted phrase, and every domain in
    ``excluded_domains`` is filtered out with a ``-site:`` term.

    Args:
        core_query: The company or product to search for; only ``name`` is used.
        num: Forwarded unchanged to ``search`` as ``num``.
        stop: Forwarded unchanged to ``search`` as ``stop``.
        pause: Forwarded unchanged to ``search`` as ``pause``.
        last_unit: Suffix of the ``qdr:`` time filter sent as ``tbs``
            (this notebook uses ``'m'`` and ``'y'``).

    Yields:
        Each URL produced by ``search`` for the assembled query.
    """
    # Balanced quotes around the phrase; a stray trailing '"' used to be emitted here.
    query_terms = [f'"{core_query.name}"']
    # Sorted so the query text is identical on every run (set order is arbitrary).
    query_terms.extend(f"-site:{domain}" for domain in sorted(excluded_domains))
    query = " ".join(query_terms)

    # extra_params tbm=nws is passed through to the search request (news results).
    yield from search(query, num=num, stop=stop, pause=pause, tbs=f'qdr:{last_unit}', extra_params={'tbm': 'nws'})

def test_news_search():
    """Manual smoke test: print every URL yielded for the ``pomelo`` example."""
    results = news_search(pomelo, stop=20, pause=2)
    for result_url in results:
        print(result_url)

# test_news_search()

In [10]:
import requests
from bs4 import BeautifulSoup
from functools import lru_cache
from typing import Optional

@lru_cache(maxsize=1000)
def get_article_text(url: str) -> Optional[str]:
    """Download ``url`` and return the text of its first ``<article>`` element.

    Results are memoized per URL by ``lru_cache``; this includes ``None``
    results from failed attempts.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text content of the first ``<article>`` tag, or ``None`` when the
        request fails, the response status is not OK, or the page has no
        ``<article>`` tag.
    """
    headers = {
        'Accept': 'text/html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    }

    try:
        response = requests.get(url, timeout=5, headers=headers)
    except requests.exceptions.Timeout:
        # Covers both connect and read timeouts.
        print(f"Timeout on {url}")
        return None
    except requests.exceptions.RequestException as e:
        # Connection errors, invalid URLs, redirect loops, etc.: one bad
        # search result should not abort the caller's whole crawl loop.
        print(f"Failed to get article from {url}: {e}")
        return None

    if not response.ok:
        print(f"Failed to get article from {url}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('article')
    return article.get_text() if article else None
    
def test_get_article_text():
    """Manual smoke test: fetch one known article and print the extracted text."""
    sample_url = "https://www.pcgamesinsider.biz/news/74555/daybreak-acquires-palia-maker-singularity-6/"
    article_text = get_article_text(sample_url)
    print(article_text)


In [4]:
from pprint import pprint
from collections import Counter
import spacy

# Entity labels that are folded into a single combined label.
LABEL_MAPPINGS = dict.fromkeys(("ORG", "PRODUCT"), "ORG/PRODUCT")

def munge_label(label: str) -> str:
    """Return the combined label for ``label``, or ``label`` itself if unmapped."""
    if label in LABEL_MAPPINGS:
        return LABEL_MAPPINGS[label]
    return label

# Load the `en_core_web_trf` spaCy pipeline once at module level so that
# extract_entities can reuse it. The model must already be installed; the
# optional setup cell runs `python -m spacy download en_core_web_trf`.
nlp = spacy.load("en_core_web_trf")

def extract_entities(text: Optional[str]) -> Counter:
    """Count the named entities that the ``nlp`` pipeline finds in ``text``.

    Args:
        text: Document text; ``None`` or an empty string yields an empty result.

    Returns:
        A Counter keyed by ``(label, entity_text)``, where ``label`` has been
        passed through ``munge_label``.
    """
    tally = Counter()
    if text:
        for ent in nlp(text).ents:
            tally[(munge_label(ent.label_), ent.text)] += 1
    return tally

def test_extract_entities():
    """Manual smoke test: pretty-print the entity counts of two known articles."""
    sample_urls = (
        "https://www.pcgamesinsider.biz/news/74555/daybreak-acquires-palia-maker-singularity-6/",
        "https://massivelyop.com/2024/07/01/daybreak-bought-palia-studio-singularity-6-and-aims-to-bring-the-game-to-launch/",
    )
    for sample_url in sample_urls:
        pprint(extract_entities(get_article_text(sample_url)))

# Note: the model's PRODUCT vs. ORG labelling on these articles isn't very reliable.

  model.load_state_dict(torch.load(filelike, map_location=device))


In [7]:
# Run the actual search, process each page, then aggregate both term
# frequency (total mentions) and document frequency per entity.
from collections import defaultdict
from urllib.parse import urlparse

entity_counts = Counter()            # total mentions of each entity
entity_document_counts = Counter()   # number of documents mentioning each entity
entity_domains = defaultdict(set)    # distinct domains mentioning each entity

documents = {}  # url -> extracted text (None when extraction failed)

for url in news_search(rafa, stop=30, last_unit='y'):
    domain = urlparse(url).netloc
    text = get_article_text(url)
    documents[url] = text  # recorded even when nothing could be extracted

    if text:
        counts = extract_entities(text)
        entity_counts.update(counts)
        for entity in counts:
            entity_document_counts[entity] += 1
            entity_domains[entity].add(domain)
    else:
        print(f"Failed to extract text from {url}, skipping")


Failed to get article from https://www.axios.com/pro/health-tech-deals/2024/01/16/98point6-acquires-brightmd-assets-telehealth-care-enablement: 403
Failed to extract text from https://www.axios.com/pro/health-tech-deals/2024/01/16/98point6-acquires-brightmd-assets-telehealth-care-enablement, skipping


  with torch.cuda.amp.autocast(self._mixed_precision):


Failed to extract text from https://www.98point6.com/pxs2023-meetwithus/, skipping
Failed to extract text from https://www.bizjournals.com/seattle/news/2024/04/24/98point6-layoffs-seattle-telehealth-transcarent.html, skipping
Failed to extract text from https://www.98point6.com/press_release/98point6-steadymd/, skipping
Failed to get article from https://www.geekwire.com/2024/98point6-hit-by-new-layoffs-in-latest-change-at-health-tech-startup/: 403
Failed to extract text from https://www.geekwire.com/2024/98point6-hit-by-new-layoffs-in-latest-change-at-health-tech-startup/, skipping
Failed to extract text from https://www.98point6.com/platform/, skipping
Failed to extract text from https://apps.apple.com/us/app/98point6/id1157653928, skipping
Failed to extract text from https://www.98point6.com/campaign/schedule-demo/, skipping
Failed to extract text from https://www.98point6.com/blog-test/, skipping
Failed to extract text from https://reportcards.ncqa.org/other-health-care-organizatio

In [8]:
# Rank entities by the number of distinct domains that mention them,
# most widespread first, and report the top 20.
top_entities = sorted(
    entity_document_counts,
    key=lambda entity: len(entity_domains[entity]),
    reverse=True,
)

for entity in top_entities[:20]:
    num_domains = len(entity_domains[entity])
    num_documents = entity_document_counts[entity]
    num_mentions = entity_counts[entity]

    print(f"{entity}: on {num_domains} domains in {num_documents} documents ({num_mentions} mentions total)")

('ORG/PRODUCT', '98point6'): on 4 domains in 5 documents (26 mentions total)
('ORG/PRODUCT', 'Bright.md'): on 3 domains in 4 documents (23 mentions total)
('ORG/PRODUCT', 'Evernorth'): on 3 domains in 4 documents (13 mentions total)
('ORG/PRODUCT', '98point6 Technologies'): on 3 domains in 3 documents (11 mentions total)
('CARDINAL', '16'): on 3 domains in 3 documents (3 mentions total)
('DATE', 'last year'): on 2 domains in 3 documents (3 mentions total)
('DATE', '2023'): on 2 domains in 2 documents (2 mentions total)
('DATE', '2015'): on 2 domains in 2 documents (2 mentions total)
('GPE', 'Seattle'): on 2 domains in 2 documents (2 mentions total)
('ORG/PRODUCT', 'Transcarent'): on 2 domains in 2 documents (3 mentions total)
('ORG/PRODUCT', 'Baptist Health'): on 2 domains in 2 documents (4 mentions total)
('ORG/PRODUCT', 'UAB Medicine'): on 2 domains in 2 documents (2 mentions total)
('CARDINAL', '14'): on 2 domains in 2 documents (2 mentions total)
('PERCENT', '64%'): on 2 domains in

In [9]:
documents

{'https://www.axios.com/pro/health-tech-deals/2024/01/16/98point6-acquires-brightmd-assets-telehealth-care-enablement': None,
 'https://www.healthcaredive.com/news/98point6-buys-bright-md-assets/704661/': '\n\n\n\nAn article from\n\n\n\n\n\n            \n                Dive Brief\n            \n        \n\n\n                98point6 buys asynchronous telehealth company Bright.md’s remaining assets\n            \nThe acquisition comes months after Bright.md’s technology was purchased by Cigna’s Evernorth segment.\n\n\n\n\n\n        Published Jan. 17, 2024\n    \n\n\n\n\n\n\n\n\nEmily Olsen\nReporter\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        gorodenkoff via Getty Images\n    \n\n\n\n\n\n\n\n\n\n\n            Listen to the article\n            2 min\n\n\n\n\n\n\n            This audio is auto-generated. Please let us know if you have feedback.\n        \n\n\nDive Brief:\n\n98point6 Technologies pur

In [25]:
# NOTE(review): `documents` is filled by the search cell, which as written
# queries `rafa` (98point6). This Polygon/Palia URL was presumably captured in
# an earlier run with a different query, so on a fresh Restart & Run All this
# lookup would likely raise KeyError — confirm.
text = documents["https://www.polygon.com/2024/7/17/24199912/palia-favorite-cozy-game-mmo"]
print(text.strip())

Filed under:



News





Early access life sim Palia has become my Stardew-style game of choice

The cozy game finally feels more like a MMO


    
      By
      
Cass Marshall



  Jul 17, 2024,  1:00pm EDT







      / 
new











Share this story




Share this on Facebook










Share this on Reddit








Share
All sharing options






Share
All sharing options for:
Early access life sim Palia has become my Stardew-style game of choice












Reddit







Pocket









Flipboard





Email




















Image: Singularity 6






Cass Marshall
        is a news writer focusing on gaming and culture coverage, taking a particular interest in the human stories of the wild world of online games.
      




I’ve been spending a good amount of my free time lately in early-access life sim Palia. Since its early access launch, I’ve found myself returning to my human’s comfortable home to garden, hunt, and hang out with friends. Palia has become my Stardew Valley

# Information extraction with LangChain

In [26]:
from dotenv import load_dotenv
load_dotenv()


True

In [27]:
# Define the schema for extraction
from typing import Optional

from langchain_core.pydantic_v1 import BaseModel, Field
from enum import Enum

class HairColor(Enum):
    # Each member's value is the same string as its name.
    BROWN = 'BROWN'
    BLONDE = 'BLONDE'
    BLACK = 'BLACK'
    GREY = 'GREY'
    WHITE = 'WHITE'
    RED = 'RED'
    OTHER = 'OTHER'

    def __str__(self) -> str:
        # Render as the bare value (e.g. "BLONDE") instead of "HairColor.BLONDE".
        return self._value_

# Structured-output schema for the extraction demo: the chain built with
# `llm.with_structured_output(schema=Person)` returns instances of this model.
# Every field is optional and defaults to None.
# NOTE(review): the docstring and Field descriptions are presumably surfaced to
# the model as part of the schema — treat them as prompt text; confirm.
class Person(BaseModel):
    """Information about a person."""

    name: Optional[str] = Field(default=None, description="The name of the person")

    hair_color: Optional[HairColor] = Field(
        default=None, description="The color of the person's hair if known"
    )
    height_in_meters: Optional[float] = Field(
        default=None, description="Height measured in meters"
    )



In [28]:
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field

# Define a custom prompt to provide instructions and any additional context.
# Ideas for improving extraction quality:
# 1) Add reference examples to the prompt template.
# 2) Introduce additional parameters to take context into account (e.g., include
#    metadata about the document from which the text was extracted).
# The template takes a single input variable, `text`, sent as the human message.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)



In [10]:
from langchain_openai import ChatOpenAI

# Chat model used for extraction.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

# Chain: fill the prompt template, then call the model with its output
# structured as a `Person`.
runnable = prompt | llm.with_structured_output(schema=Person)

In [11]:
text = "Alan Smith is 6 feet tall and has blond hair."
runnable.invoke({"text": text})

Person(name='Alan Smith', hair_color=<HairColor.BLONDE: 'BLONDE'>, height_in_meters=1.83)

# Information extraction for the actual articles

In [46]:
# Define the schema for extraction
from typing import Optional, List

from langchain_core.pydantic_v1 import BaseModel, Field
from enum import Enum
from datetime import date

class Sentiment(Enum):
    # Each member's value is the same string as its name.
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"

    def __str__(self) -> str:
        # Render as the bare value (e.g. "NEUTRAL") instead of "Sentiment.NEUTRAL".
        return self._value_

class CustomDate(BaseModel):
    """A structured date"""
    # Year, month and day are kept as separate integers. Month and day are
    # 1-based, as their descriptions state.
    # NOTE(review): the fields are annotated `int` but default to None —
    # confirm whether they should be declared Optional[int].
    year: int = Field(default=None, description="The year")
    month: int = Field(default=None, description="The month starting from 1")
    day: int = Field(default=None, description="The day of the month starting from 1")

    @classmethod
    def from_date(cls, d: date) -> 'CustomDate':
        """Build a CustomDate from a ``datetime.date``.

        ``date.month`` and ``date.day`` are already 1-based, matching the field
        descriptions, so the values are copied without any offset.
        """
        return cls(year=d.year, month=d.month, day=d.day)

    def to_date(self) -> date:
        """Convert back to a ``datetime.date`` (the inverse of ``from_date``).

        Raises whatever ``date(...)`` raises when a component is missing or
        out of range.
        """
        return date(year=self.year, month=self.month, day=self.day)

# Extraction schema for opinion-style articles; every field defaults to None.
# NOTE(review): `sentiment` is annotated as plain `Sentiment` but defaults to
# None, unlike the other fields which are declared Optional — confirm intent.
class OpinionArticle(BaseModel):
    """An opinion piece such as a product review, game review, or speculation about a business."""

    author_name: Optional[str] = Field(default=None, description="The name of the author")
    publication_date: Optional[CustomDate] = Field(default=None, description="The publication date")
    title: Optional[str] = Field(default=None, description="The title of the article")
    sentiment: Sentiment = Field(default=None, description="The sentiment of the article")

# Acquisition event; used as the type of BusinessNewsArticle.acquisition_event.
# All fields are optional and default to None.
class Acquisition(BaseModel):
    """One company acquires another company."""

    acquiring_company: Optional[str] = Field(default=None, description="The name of the acquiring company")
    acquired_company: Optional[str] = Field(default=None, description="The name of the acquired company")
    acquisition_date: Optional[CustomDate] = Field(default=None, description="The date of the acquisition")

    # TODO: Structured — the amount is currently a free-form string.
    acquisition_amount: Optional[str] = Field(default=None, description="The amount of the acquisition")

# Partnership event; used as the type of BusinessNewsArticle.partnership_event.
# All fields are optional and default to None.
class Partnership(BaseModel):
    """A business partnership."""

    company_1: Optional[str] = Field(default=None, description="The name of the first company in the partnership")
    company_2: Optional[str] = Field(default=None, description="The name of the second company in the partnership")
    partnership_date: Optional[CustomDate] = Field(default=None, description="The date of the partnership")

    summary: Optional[str] = Field(default=None, description="A one-sentence extracted summary of the partnership")

# Fundraising event; used as the type of BusinessNewsArticle.fundraising_event.
# All fields are optional and default to None.
class Fundraising(BaseModel):
    """A business that raises investment."""

    company: Optional[str] = Field(default=None, description="The name of the company that raised funds")
    fundraising_date: Optional[CustomDate] = Field(default=None, description="The date of the fundraising event")

    # TODO: Structured — the amount is currently a free-form string.
    fundraising_amount: Optional[str] = Field(default=None, description="The amount of funds raised")

# A single quotation; BusinessNewsArticle.quotes holds a list of these.
class Quote(BaseModel):
    """A person being quoted."""

    speaker: Optional[str] = Field(default=None, description="The person who made the quote")
    text: Optional[str] = Field(default=None, description="The text of the quote")

# Extraction schema for business news: article metadata, quotes, and at most
# one event of each kind (acquisition, partnership, fundraising).
class BusinessNewsArticle(BaseModel):
    """News about a business."""

    author_name: Optional[str] = Field(default=None, description="The name of the author")
    publication_date: Optional[CustomDate] = Field(default=None, description="The publication date")
    title: Optional[str] = Field(default=None, description="The title of the article")
    # NOTE(review): mutable list literal as a default — confirm the model copies
    # it per instance, or switch to a default factory.
    quotes: List[Quote] = Field(default=[], description="Quotes included in the article (possibly empty)")

    # Each event field is None when the article does not describe that kind of event.
    acquisition_event: Optional[Acquisition] = Field(default=None, description="The acquisition event, if applicable")
    partnership_event: Optional[Partnership] = Field(default=None, description="The partnership event, if applicable")
    fundraising_event: Optional[Fundraising] = Field(default=None, description="The fundraising event, if applicable")



In [48]:
# Classification of the page.
# This rebinds `prompt` (previously the extraction prompt) to a template with
# two input variables: `company_name` and `text`.
# NOTE: the triple-quoted messages keep their leading indentation and newlines,
# so that whitespace is part of the text sent to the model.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            You're an expert algorithm to classify passages about a particular company or product.
            Only extract the properties mentioned in the 'Classification' function.
            """,
        ),
        (
            "human", 
            """
            Company: {company_name}
            Passage: 
            {text}
            """
            ),
    ]
)

# Categories an article can be assigned relative to the company named in the
# prompt; used as the type of Classification.category.
class ArticleCategory(Enum):
    PRODUCT_UPDATE = "PRODUCT_UPDATE"
    BUSINESS_NEWS = "BUSINESS_NEWS"
    OPINION = "OPINION"
    OTHER = "OTHER"
    UNRELATED_TO_COMPANY = "UNRELATED_TO_COMPANY"
    
# Structured output of the classification chain. All three fields are declared
# without defaults. The system prompt refers to this schema by name as the
# 'Classification' function.
class Classification(BaseModel):
    sentiment: Sentiment = Field(description="The sentiment of the text")
    language: str = Field(description="The language the text is written in")
    category: ArticleCategory = Field(description="The category of the article with respect to the company")



In [49]:
from langchain_openai import ChatOpenAI

# Chat model used for classification (same settings as the extraction demo).
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

# Rebuild the chain with the classification prompt; the model's output is now
# structured as a `Classification`.
runnable = prompt | llm.with_structured_output(schema=Classification)

In [50]:
runnable.invoke({"text": get_article_text("https://massivelyop.com/2024/07/01/daybreak-bought-palia-studio-singularity-6-and-aims-to-bring-the-game-to-launch/"), "company_name": "Singularity 6"})

Classification(sentiment=<Sentiment.NEUTRAL: 'NEUTRAL'>, language='English', category=<ArticleCategory.PRODUCT_UPDATE: 'PRODUCT_UPDATE'>)