In [1]:
def get_summary(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        div_class = 'caas-body'  
        div_element = soup.find('div', class_=div_class)
        return div_element.text if div_element else None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [9]:
import polars as pl
import matplotlib.pyplot as plt
import sys
from typing import Any


def read_csv(file_path: str, separator: str = '\t') -> pl.DataFrame:
    """Read a CSV file into a Polars DataFrame.
    
    Args:
        file_path (str): The path to the CSV file.
        separator (str): The separator used in the CSV file.
    
    Returns:
        pl.DataFrame: The DataFrame containing the CSV data.
    """
    return pl.read_csv(file_path, separator=separator)

def filter_health_data(df: pl.DataFrame) -> pl.DataFrame:
    """Filter the DataFrame for rows containing 'HEALTH' in the 'THEMES' column.
    
    Args:
        df (pl.DataFrame): The original DataFrame.
    
    Returns:
        pl.DataFrame: The filtered DataFrame.
    """
    return df.filter(pl.col("THEMES").str.contains("HEALTH"))

def get_source_shape_counts(df: pl.DataFrame) -> pl.DataFrame:
    """Group by 'SOURCES' and count the occurrences.
    
    Args:
        df (pl.DataFrame): The filtered DataFrame.
    
    Returns:
        pl.DataFrame: The DataFrame with source shape counts.
    """
    return (
        df.group_by("SOURCES")
        .agg(pl.count("SOURCES").alias("count"))
        .sort("count", descending=True)
    )

def save_to_csv(df: pl.DataFrame, file_path: str, separator: str = '\t') -> None:
    """Save the DataFrame to a CSV file.
    
    Args:
        df (pl.DataFrame): The DataFrame to save.
        file_path (str): The path to the CSV file.
        separator (str): The separator to use in the CSV file.
    """
    df.write_csv(file_path, separator=separator)


In [11]:
df = read_csv("20240816.gkg.csv")

In [15]:
df["SOURCEURLS"][2]

'https://www.yahoo.com/news/two-florida-congressional-dems-warn-171323616.html'

In [29]:
yahoo_df = df.filter(pl.col("SOURCEURLS").str.contains("yahoo"))
yahoo_df

DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS
i64,i64,str,str,str,str,str,str,str,str,str
20240816,1,,"""EPU_CATS_HEALTHCARE;EPU_CATS_E…","""2#North Carolina, United State…","""kamala harris;wasserman schult…","""white house;heritage foundatio…","""2.33100233100233,3.49650349650…",,"""yahoo.com""","""https://www.yahoo.com/news/two…"
20240816,1,,"""LEADER;TAX_FNCACT;TAX_FNCACT_P…","""2#North Carolina, United State…","""tim miller;jeb bush""","""university of arkansas victori…","""-1.53846153846154,0,1.53846153…","""1192930971""","""yahoo.com""","""https://www.yahoo.com/news/tru…"
20240816,1,,"""TAX_MILITARY_TITLE;TAX_MILITAR…","""3#-Atoka, Oklahoma, United Sta…","""charles mccall;ryan walters;cy…","""oklahoma state department of e…","""-4.05405405405405,1.6891891891…","""1193095036,1193095037,11930846…","""yahoo.com""","""https://www.yahoo.com/news/rya…"
20240816,1,"""SEIZE#12#power#1#Thailand#TH#T…","""TAX_RELIGION;TAX_RELIGION_BUDD…","""4#Funan, Liaoning, China#CH#CH…","""maha vajiralongkorn;paetongtar…","""pheu thai party;southeast asia…","""-1.77664974619289,2.7072758037…","""1193033509,1193033510,11930335…","""yahoo.com""","""https://nz.news.yahoo.com/thai…"
20240816,1,,"""USPEC_POLICY1;EPU_POLICY;EPU_P…","""2#Michigan, United States#US#U…","""joe biden;donald trump;kamala …","""university of michigan;white h…","""-0.298507462686567,3.432835820…",,"""yahoo.com""","""https://www.yahoo.com/news/har…"
…,…,…,…,…,…,…,…,…,…,…
20240816,1,,"""TAX_ECON_PRICE;EPU_ECONOMY_HIS…","""1#Qatar#QA#QA#25.5#51.25#QA;4#…","""christian vernet""","""airbus;boeing;delta air lines;…","""0.955223880597015,3.1044776119…",,"""yahoo.com""","""https://news.yahoo.com/news/iv…"
20240816,1,,"""ELECTION;AGRICULTURE;UNGP_FORE…","""1#Switzerland#SZ#SZ#47#8#SZ""","""bloomberg businessweek""","""bloomberg""","""0.99009900990099,3.96039603960…","""1192969547,1192969549,11929697…","""yahoo.com""","""https://www.yahoo.com/news/swi…"
20240816,1,,,"""3#Chicago, Illinois, United St…","""matt eberflus;charles cui;rich…","""kaplan law firm;democratic nat…","""-3.08056872037915,1.4218009478…",,"""yahoo.com""","""https://www.yahoo.com/news/aft…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_FATHER;T…","""3#Placer County, California, U…","""kimberly blakley;barry zimmerm…","""child protective services;uc d…","""-7.00296735905045,1.0089020771…","""1193017381,1193014187,11930173…","""yahoo.com""","""https://www.yahoo.com/news/ros…"


In [30]:
yahoo_df["SOURCEURLS"].head().map_elements(get_summary)

  yahoo_df["SOURCEURLS"].head().map_elements(get_summary)


SOURCEURLS
str
"""Medicare recipients who take i…"
"""Vice President Harris will unv…"
"""State Superintendent Ryan Walt…"
"""Thailand is the only country i…"
"""Kamala Harris unveiled a new s…"
"""A North Texas woman was given …"
"""SIOUX FALLS, S.D. (KELO) — A n…"
"""The Ukrainian Foreign Ministry…"
"""The Justice Department on Frid…"
"""US Secretary of State Antony B…"


In [32]:
indiatimes_filtered = df.filter(pl.col("SOURCEURLS").str.contains("indiatimes"))
indiatimes_filtered

DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS
i64,i64,str,str,str,str,str,str,str,str,str
20240816,1,,"""ELECTION;TAX_FNCACT;TAX_FNCACT…","""4#Jammu, Jammu And Kashmir, In…","""lok sabha;narendra modi;rajiv …","""kashmir police;supreme court;u…","""0.449775112443778,2.2488755622…","""1192999619,1192999620,11929996…","""indiatimes.com""","""https://economictimes.indiatim…"
20240816,1,,"""TAX_POLITICAL_PARTY;TAX_POLITI…","""1#Israel#IS#IS#31.5#34.75#IS;1…","""nate silver;joe biden;kamala h…","""international breaking news ev…","""-1.29659643435981,3.4035656401…","""1192929450,1193076984,11930769…","""indiatimes.com""","""https://economictimes.indiatim…"
20240816,1,,"""RURAL;ELECTION;TAX_ETHNICITY;T…","""1#Thailand#TH#TH#15#100#TH;4#B…","""srettha thavisin;thaksin shina…","""reuters;chulalongkorn universi…","""-1.0752688172043,2.82258064516…",,"""indiatimes.com""","""https://economictimes.indiatim…"
20240816,1,,"""LEADER;TAX_FNCACT;TAX_FNCACT_P…","""1#United States#US#US#39.82817…","""ajay bhutoria;harris indian;ka…","""democratic national convention""","""5.31400966183575,5.31400966183…","""1192952551,1192952555,11929526…","""indiatimes.com""","""https://timesofindia.indiatime…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_REPRESEN…","""1#Vietnam, Republic Of#VM#VM#1…",,,"""0.390625,0.78125,0.390625,1.17…","""1193045697,1193045698""","""indiatimes.com""","""https://realty.economictimes.i…"
…,…,…,…,…,…,…,…,…,…,…
20240816,1,,"""WB_1150_VOLATILITY;WB_1104_MAC…","""4#Hindustan, India (General), …","""george thomas;ajay thakur;sona…","""dabur;procter gamble""","""0.306748466257669,4.2944785276…","""1192919417""","""indiatimes.com""","""https://economictimes.indiatim…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_ASSISTAN…","""3#North Hollywood, California,…","""kenneth iwamasa;jasveen sangha…","""us postal inspection service""","""-8.95140664961637,0.5115089514…","""1192928152,1192944978,11929578…","""indiatimes.com""","""https://timesofindia.indiatime…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_NOMINEE;…","""2#New Jersey, United States#US…","""benjamin netanyahu;kamala harr…","""white house;university of mich…","""-1.09140518417463,3.9563437926…",,"""indiatimes.com""","""https://economictimes.indiatim…"
20240816,1,,"""LEADER;TAX_FNCACT;TAX_FNCACT_P…","""1#United States#US#US#39.82817…","""kamala harris""","""united states""","""7.14285714285714,7.14285714285…",,"""indiatimes.com""","""https://timesofindia.indiatime…"


In [59]:
def get_summary_it(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        div_class = 'summary'  
        div_element = soup.find('h2', class_=div_class)
        return div_element.text if div_element else None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [63]:
text_it = indiatimes_filtered["SOURCEURLS"].head(40).map_elements(get_summary_it)

  text_it = indiatimes_filtered["SOURCEURLS"].head(40).map_elements(get_summary_it)


In [65]:
print(text_it.drop_nulls().len())

14


In [62]:
indiatimes_filtered["SOURCEURLS"][0]

'https://economictimes.indiatimes.com/news/elections/assembly-elections/jammu-kashmir/jammu-kashmir-assembly-election-dates-when-it-will-be-held-all-you-need-to-know-ec-rajiv-kumar-article-370-delimitation/articleshow/112565368.cms'

In [66]:
eleader_df = df.filter(pl.col("SOURCEURLS").str.contains("enterpriseleader"))
eleader_df

DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS
i64,i64,str,str,str,str,str,str,str,str,str
20240816,1,,"""ECON_STOCKMARKET;WB_698_TRADE;…","""1#United States#US#US#39.82817…",,"""holdings channel;pineridge adv…","""0.681818181818182,0.9090909090…",,"""theenterpriseleader.com""","""https://theenterpriseleader.co…"
20240816,1,,"""EPU_CATS_REGULATION;EPU_CATS_F…","""2#Arizona, United States#US#US…",,"""unitil co;mendota financial gr…","""2.4024024024024,2.552552552552…",,"""theenterpriseleader.com""","""https://theenterpriseleader.co…"
20240816,1,,"""ECON_STOCKMARKET;TAX_FNCACT;TA…","""1#United States#US#US#39.82817…",,"""commerce bank;royal bank;frees…","""2.04301075268817,2.36559139784…",,"""theenterpriseleader.com""","""https://theenterpriseleader.co…"
20240816,1,,"""TAX_ECON_PRICE;ECON_STOCKMARKE…","""1#Canada#CA#CA#60#-96#CA;1#Uni…",,"""principal securities inc;wells…","""1.11386138613861,1.60891089108…",,"""theenterpriseleader.com""","""https://theenterpriseleader.co…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_COUNSELO…","""1#Canada#CA#CA#60#-96#CA""",,"""associates corp;clearbridge in…","""1.64609053497942,2.26337448559…",,"""theenterpriseleader.com""","""https://theenterpriseleader.co…"
…,…,…,…,…,…,…,…,…,…,…
20240816,1,,"""EPU_CATS_REGULATION;EPU_CATS_F…","""1#Canada#CA#CA#60#-96#CA""","""sheryl sandberg;robert w baird…","""facebook;deutsche bank aktieng…","""1.72413793103448,1.80250783699…",,"""theenterpriseleader.com""","""https://theenterpriseleader.co…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_INVESTOR…","""1#United States#US#US#39.82817…","""gerald a ducey jr;james redgie…","""valeo financial advisors;excha…","""1.6246953696182,1.705930138099…",,"""theenterpriseleader.com""","""https://theenterpriseleader.co…"
20240816,1,"""AFFECT#1#analyst consensus est…","""TAX_FNCACT;TAX_FNCACT_HUNTER;T…","""1#United States#US#US#39.82817…","""spiceworks ziff davis""","""united kingdom ltd;ziff davis;…","""0.969827586206896,1.5086206896…",,"""theenterpriseleader.com""","""https://theenterpriseleader.co…"
20240816,1,,"""TAX_ECON_PRICE;TAX_FNCACT;TAX_…","""2#New Mexico, United States#US…","""franklin myers;piper sandler""","""wolfe research;pingora partner…","""1.11731843575419,1.78770949720…",,"""theenterpriseleader.com""","""https://theenterpriseleader.co…"


In [71]:
eleader_df[45]["SOURCEURLS"][0]

'https://theenterpriseleader.com/2024/08/16/skyline-champion-co-nysesky-director-sells-287860-64-in-stock.html'

In [74]:
import re
def multi_reader(url,skip_start=None,skip_end=None):
    page = requests.get(url)
    soup = BeautifulSoup(page.text,'html')
    p_str = ''
    for text in soup.find_all('p'):
        p_str += str(text)
    pattern = '<p>(<strong>)?[^<](.*?)(<\/strong>)?<\/p>'
    matches = list(re.finditer(pattern,p_str))
    ptext = ''
    for match in matches[skip_start:skip_end]:
        ptext += match.group().replace('<p>','').replace('</p>','\n').replace('<strong>','').replace('</strong>','')
    
    return(ptext)

In [76]:
text_eleader = eleader_df["SOURCEURLS"].head(20).map_elements(multi_reader)

  text_eleader = eleader_df["SOURCEURLS"].head(20).map_elements(multi_reader)


In [79]:
text_eleader[11]

'Several equities research analysts have issued reports on AVGO shares. Mizuho raised their price objective on shares of Broadcom from $162.50 to $190.00 and gave the stock a “buy” rating in a research report on Thursday, June 13th. Evercore ISI boosted their price target on Broadcom from $162.00 to $201.00 and gave the company an “outperform” rating in a research note on Thursday, June 13th. Cantor Fitzgerald increased their price objective on Broadcom from $1,875.00 to $2,000.00 and gave the stock an “overweight” rating in a research report on Tuesday, July 16th. <a href="https://report.stocknews.com/sign-up/ticker/?ticker=AVGO&amp;lead_source=marketbeat_snra&amp;utm_source=marketbeat_snra&amp;utm_medium=article&amp;utm_campaign=ticker&amp;utm_term=AVGO&amp;utm_content=text_link" style="font-weight:bold">StockNews.com</a> cut shares of Broadcom from a “buy” rating to a “hold” rating in a research report on Tuesday, July 23rd. Finally, The Goldman Sachs Group upped their price target 

In [80]:
daily_mail_df= df.filter(pl.col("SOURCEURLS").str.contains("dailymail"))
daily_mail_df

DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS
i64,i64,str,str,str,str,str,str,str,str,str
20240816,1,"""KILL#2014##1#Greece#GR#GR#39#2…","""TAX_FNCACT;TAX_FNCACT_WOMAN;TA…","""1#Greece#GR#GR#39#22#GR;3#Holl…","""katy perry;kamala harris;rose …","""national geographic;deutsche b…","""-1.08043217286915,4.0216086434…",,"""dailymail.co.uk""","""https://www.dailymail.co.uk/fe…"
20240816,1,"""KILL#6##5#Cumbria, United King…","""SOC_POINTSOFINTEREST;SOC_POINT…","""4#Wandsworth, Wandsworth, Unit…","""patrick downey;robert leslie s…","""yougov;national archives""","""-7.12309820193638,1.4522821576…","""1192943105,1192943106,11929431…","""dailymail.co.uk""","""https://www.dailymail.co.uk/ne…"
20240816,1,"""AFFECT#200##1#United States#US…","""TAX_DISEASE;TAX_DISEASE_CANCER…","""2#Florida, United States#US#US…","""karen everstine""","""el chilar""","""-4.55341506129597,0.8756567425…","""1192970290""","""dailymail.co.uk""","""https://www.dailymail.co.uk/he…"
20240816,1,,,"""4#Victorian Island, Victoria, …",,"""blue team jesse;disney;yellow …","""3.93258426966292,7.72471910112…",,"""dailymail.co.uk""","""https://www.dailymail.co.uk/tv…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_WOMAN;WB…","""4#Canberra, Australian Capital…","""philip lowe;warwick mckibbin""","""european union;australian nati…","""-2.19512195121951,0.7317073170…",,"""dailymail.co.uk""","""https://www.dailymail.co.uk/ne…"
…,…,…,…,…,…,…,…,…,…,…
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_ADMIRALT…","""2#New York, United States#US#U…","""pat mcfadden;david neal;rachel…","""foreign office;british civil s…","""-1.61127895266868,2.1148036253…",,"""dailymail.co.uk""","""https://www.dailymail.co.uk/ne…"
20240816,1,"""KILL#22#inside#1#Indonesia#ID#…","""WB_1979_NATURAL_RESOURCE_MANAG…","""4#Jambi, Jambi, Indonesia#ID#I…","""terjun gajah;pitu riawa""","""telluwanua police;national his…","""-4.31818181818182,0.4545454545…",,"""dailymail.co.uk""","""https://www.dailymail.co.uk/ne…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_SINGER;T…","""2#Hawaii, United States#US#USH…","""tesla cybertruck;justin bieber…","""peninsula hotel in los angeles""","""0.443458980044346,2.8824833702…",,"""dailymail.co.uk""","""https://www.dailymail.co.uk/tv…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_VICTIMS;…","""3#San Francisco, California, U…","""anne-christine massulo;prithik…","""court order;san francisco supe…","""-6.720741599073,3.128621089223…","""1193030888,1193031076,11930310…","""dailymail.co.uk""","""https://www.dailymail.co.uk/ne…"


In [81]:
text_daily_mail = daily_mail_df["SOURCEURLS"].head(20).map_elements(multi_reader)

  text_daily_mail = daily_mail_df["SOURCEURLS"].head(20).map_elements(multi_reader)


In [86]:
text_daily_mail

SOURCEURLS
str
"""Published by Associated Newspa…"
"""Published by Associated Newspa…"
"""Published by Associated Newspa…"
"""Published by Associated Newspa…"
"""Published by Associated Newspa…"
…
"""Published by Associated Newspa…"
"""Published by Associated Newspa…"
"""Published by Associated Newspa…"
"""Sign up to the Hull Live newsl…"


In [89]:
daily_mail_df["SOURCEURLS"][6]

'https://www.dailymail.co.uk/tvshowbiz/article-13749129/Lindsay-Lohan-Jamie-Lee-Curtis-Freaky-Friday-sequel.html'

In [106]:
def get_summary_dm(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        div_class = 'mol-para-with-font'  
        div_element = soup.find_all('p', class_=div_class)
        text = ""
        for z in div_element:
            text += z.text
        return text if div_element else None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [107]:
text_daily_mail = daily_mail_df["SOURCEURLS"].head(20).map_elements(get_summary_dm)

  text_daily_mail = daily_mail_df["SOURCEURLS"].head(20).map_elements(get_summary_dm)


In [110]:
text_daily_mail[19]

'Comedian Reginald D Hunter has compared an\xa0Israeli couple \'hounded\' out of his Edinburgh\xa0Fringe gig to Star Wars villains the \'Siths\'.The terrified Jewish couple revealed they were left \'in fear of violence\' after allegedly being called \'baby killers\' and told to \'f*** off\'.Hunter jested about the couple just a day after he was cleared by police of any\xa0\'hate crime\' being committed during his controversial show.Police Scotland confirmed \'no crime was established\' following claims that the\xa0American comic, 55, was\xa0\'encouraging a baying mob\' to harass the pair.Hunter had been performing his new stand-up show \'Fluffy Fluffy Beavers\' at the Edinburgh Fringe when he made a joke about how a\xa0Channel 5 documentary about domestic abuse had made him think of Israel.But when two Israeli audience members objected, they were reportedly sworn at, called \'genocidal\' and told \'you\'re not welcome\' by fellow spectators shouting \'Free Palestine\'.Hunter earlier re

In [112]:
wkrb13_df= df.filter(pl.col("SOURCEURLS").str.contains("wkrb13"))
wkrb13_df["THEMES"]

THEMES
str
"""EPU_CATS_REGULATION;EPU_CATS_F…"
"""TAX_FNCACT;TAX_FNCACT_INSIDER;…"
"""EPU_CATS_REGULATION;EPU_CATS_F…"
"""WB_439_MACROECONOMIC_AND_STRUC…"
"""WB_439_MACROECONOMIC_AND_STRUC…"
…
"""ECON_STOCKMARKET;TAX_FNCACT;TA…"
"""ECON_STOCKMARKET;TAX_FNCACT;TA…"
"""TAX_FNCACT;TAX_FNCACT_ANALYST;"""
"""ECON_STOCKMARKET;TAX_FNCACT;TA…"


In [120]:
filter_health_data(wkrb13_df)[34]["SOURCEURLS"][0]

'https://www.wkrb13.com/2024/08/16/insmed-nasdaqinsm-stock-rating-reaffirmed-by-hc-wainwright.html'

In [123]:
text_wkrb13= wkrb13_df["SOURCEURLS"].head(20).map_elements(multi_reader)

  text_wkrb13= wkrb13_df["SOURCEURLS"].head(20).map_elements(multi_reader)


In [128]:
text_wkrb13[9]

'Several other hedge funds and other institutional investors also recently made changes to their positions in the company. Envestnet Portfolio Solutions Inc. boosted its position in shares of  Exelon by 20.1% in the 2nd quarter. Envestnet Portfolio Solutions Inc. now owns 29,783 shares of the company’s stock worth $1,031,000 after purchasing an additional 4,991 shares during the last quarter.  ORG Partners LLC bought a new position in  Exelon during the second quarter worth about $99,000.  V Square Quantitative Management LLC lifted its holdings in  Exelon by 6.2% during the second quarter. V Square Quantitative Management LLC now owns 21,346 shares of the company’s stock worth $739,000 after acquiring an additional 1,249 shares in the last quarter.  Great Valley Advisor Group Inc. lifted its holdings in  Exelon by 56.6% during the second quarter. Great Valley Advisor Group Inc. now owns 22,800 shares of the company’s stock worth $789,000 after acquiring an additional 8,238 shares in t

In [132]:
ticker_report_df= df.filter(pl.col("SOURCEURLS").str.contains("tickerreport"))
ticker_report_df

DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS
i64,i64,str,str,str,str,str,str,str,str,str
20240816,1,,"""TAX_WORLDMAMMALS;TAX_WORLDMAMM…","""1#Canada#CA#CA#60#-96#CA""",,"""national bank;envestnet portfo…","""1.55038759689922,1.55038759689…",,"""tickerreport.com""","""https://www.tickerreport.com/b…"
20240816,1,,"""GENERAL_HEALTH;MEDICAL;TAX_FNC…","""1#United States#US#US#39.82817…","""leerink partnrs""","""international assets investmen…","""4.95258166491043,5.37407797681…",,"""tickerreport.com""","""https://www.tickerreport.com/b…"
20240816,1,,"""ECON_STOCKMARKET;TAX_ECON_PRIC…","""1#Japan#JA#JA#36#138#JA;1#Chin…","""kate spade;manesh dadlani;stua…","""wells fargo company;united sta…","""2.22929936305733,2.54777070063…",,"""tickerreport.com""","""https://www.tickerreport.com/b…"
20240816,1,,"""WB_135_TRANSPORT;WB_1174_WAREH…","""1#United States#US#US#39.82817…",,"""united states;data storage cor…","""2.40963855421687,3.21285140562…",,"""tickerreport.com""","""https://www.tickerreport.com/b…"
20240816,1,,"""EDUCATION;RETIREMENT;WB_2690_C…","""1#United States#US#US#39.82817…","""roth mkm""","""international assets investmen…","""1.13065326633166,1.63316582914…",,"""tickerreport.com""","""https://www.tickerreport.com/b…"
…,…,…,…,…,…,…,…,…,…,…
20240816,1,,"""EDUCATION;RETIREMENT;WB_2690_C…","""2#California, United States#US…","""rainer h bosselmann;dekabank d…","""securities exchange commission…","""1.33882595262616,1.54479917610…",,"""tickerreport.com""","""https://www.tickerreport.com/b…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_ANALYST;…","""1#United States#US#US#39.82817…","""elizabeth rozek;sagimet biosci…","""sagimet biosciences company pr…","""0.168067226890756,1.6806722689…",,"""tickerreport.com""","""https://www.tickerreport.com/b…"
20240816,1,,"""GENERAL_HEALTH;MEDICAL;CRISISL…","""1#United States#US#US#39.82817…",,"""aethlon medical inc;united sta…","""0.673400673400673,3.0303030303…",,"""tickerreport.com""","""https://www.tickerreport.com/b…"
20240816,1,,"""EPU_CATS_REGULATION;EPU_CATS_F…","""1#Canada#CA#CA#60#-96#CA""","""livforsakringsbolaget skandia …","""veralto co;first command advis…","""1.53291253381425,1.80342651036…","""1192937938,1192950028""","""tickerreport.com""","""https://www.tickerreport.com/b…"


In [135]:
ticker_report_df[0]["SOURCEURLS"][0]

'https://www.tickerreport.com/banking-finance/12391611/envestnet-portfolio-solutions-inc-has-2-63-million-stake-in-ishares-cohen-steers-reit-etf-batsicf.html'

In [136]:
text_ticker= ticket_report_df["SOURCEURLS"].head(20).map_elements(multi_reader)

  text_ticker= ticket_report_df["SOURCEURLS"].head(20).map_elements(multi_reader)


In [138]:
text_ticker[0]

'A number of other hedge funds have also recently added to or reduced their stakes in ICF. National Bank of Canada FI grew its position in  iShares Cohen &amp; Steers REIT ETF by 17,774.7% in the fourth quarter. National Bank of Canada FI now owns 227,902 shares of the company’s stock worth $13,520,000 after acquiring an additional 226,627 shares in the last quarter.  William Howard &amp; Co Financial Advisors Inc bought a new position in shares of  iShares Cohen &amp; Steers REIT ETF in the first quarter valued at approximately $8,991,000.  KFG Wealth Management LLC acquired a new stake in shares of  iShares Cohen &amp; Steers REIT ETF in the fourth quarter worth $5,741,000.  Garde Capital Inc. lifted its stake in shares of  iShares Cohen &amp; Steers REIT ETF by 83.7% in the fourth quarter. Garde Capital Inc. now owns 164,218 shares of the company’s stock worth $9,645,000 after buying an additional 74,801 shares during the last quarter.  Finally, Transcendent Capital Group LLC acquir

In [139]:
business_mirror_df= df.filter(pl.col("SOURCEURLS").str.contains("businessmirror"))
business_mirror_df

DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS
i64,i64,str,str,str,str,str,str,str,str,str
20240816,1,,"""EDUCATION;WB_470_EDUCATION;WB_…","""1#Philippines#RP#RP#13#122#RP;…",,"""manila southcoast development …","""9.47867298578199,9.95260663507…",,"""businessmirror.com.ph""","""https://businessmirror.com.ph/…"
20240816,1,,,"""4#Philippine, Benguet, Philipp…","""manuel v pangilinan;jane jimen…","""metro pacific investments corp…","""5.97014925373134,6.46766169154…",,"""businessmirror.com.ph""","""https://businessmirror.com.ph/…"
20240816,1,,"""EDUCATION;SOC_POINTSOFINTEREST…","""4#Pasig, City Of Pasig, Philip…","""victor ma;bahay bulilit;joseph…","""mcdonald;ronald mcdonald house…","""1.78748758689176,3.37636544190…","""1193069233,1193069234,11930699…","""businessmirror.com.ph""","""https://businessmirror.com.ph/…"
20240816,1,,"""EPU_ECONOMY;EPU_ECONOMY_HISTOR…","""1#Ukraine#UP#UP#49#32#UP;5#Mak…","""abby joseph cohen;ken moelis;j…","""columbia university;ayala muse…","""-1.52542372881356,2.2033898305…",,"""businessmirror.com.ph""","""https://businessmirror.com.ph/…"
20240816,1,,,"""1#Philippines#RP#RP#13#122#RP""","""arsenio nick j lizaso""","""roman senate;creative marketin…","""-1.21703853955375,4.5977011494…",,"""businessmirror.com.ph""","""https://businessmirror.com.ph/…"
…,…,…,…,…,…,…,…,…,…,…
20240816,1,,"""EPU_ECONOMY;EPU_ECONOMY_HISTOR…","""1#Ukraine#UP#UP#49#32#UP;4#Phi…","""abby joseph cohen;sam bankman;…","""columbia university;philippine…","""-1.52542372881356,2.2033898305…",,"""businessmirror.com.ph""","""https://businessmirror.com.ph/…"
20240816,1,"""WOUND#7##1#Thailand#TH#TH#15#1…","""TAX_ETHNICITY;TAX_ETHNICITY_AS…","""1#Philippines#RP#RP#13#122#RP;…","""putrama fahri septian;bhinijde…","""southeast asian volleyball lea…","""-2.30769230769231,1.5384615384…","""1193053013,1193053014,11930534…","""businessmirror.com.ph""","""https://businessmirror.com.ph/…"
20240816,1,,"""EPU_ECONOMY;EPU_ECONOMY_HISTOR…","""4#Philippine, Benguet, Philipp…","""abby joseph cohen;ken moelis;j…","""columbia university;ayala muse…","""-1.52542372881356,2.2033898305…",,"""businessmirror.com.ph""","""https://businessmirror.com.ph/…"
20240816,1,,"""TOURISM;TAX_FNCACT;TAX_FNCACT_…","""4#Dhaka, Dhaka, Bangladesh#BG#…","""ferdinand r marcos jr""","""department of tourism;facebook""","""4.21052631578947,5.86466165413…","""1193069185,1193070501""","""businessmirror.com.ph""","""https://businessmirror.com.ph/…"


In [144]:
business_mirror_df["SOURCEURLS"][0]

'https://businessmirror.com.ph/2024/08/16/sm-foundation-msdc-boost-early-education-in-batangas-with-refurbished-day-care-centers/'

In [150]:
def get_summary_bm(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        print(soup)
        div_class = 'post-main'  
        div_element = soup.find('div', class_=div_class)
        return div_element.text if div_element else None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [151]:
text_business_mirror= business_mirror_df["SOURCEURLS"].head(20).map_elements(get_summary_bm)

  text_business_mirror= business_mirror_df["SOURCEURLS"].head(20).map_elements(get_summary_bm)


<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>

<html>
<head>

In [148]:
text_business_mirror

SOURCEURLS
str
""
""
""
""
""
…
""
""
""
""


In [155]:
menanf_df= df.filter(pl.col("SOURCEURLS").str.contains("menafn"))
menanf_df

DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS
i64,i64,str,str,str,str,str,str,str,str,str
20240816,1,,"""LEADER;TAX_FNCACT;TAX_FNCACT_P…","""4#Istanbul, Istanbul, Turkey#T…","""vik atal;maggie miller;najim m…","""virginia chamber;group inc;oly…","""8.26771653543307,8.66141732283…",,"""menafn.com""","""https://menafn.com/1108560592/…"
20240816,1,,,"""4#Rome, Lazio, Italy#IT#IT07#4…","""rafer johnson;yang chuan-kwang…","""united nations""","""3.55029585798817,5.62130177514…",,"""menafn.com""","""https://menafn.com/1108560961/…"
20240816,1,,"""ECON_BITCOIN;LEADER;TAX_FNCACT…","""1#United States#US#US#39.82817…","""joe biden;tim walz;kamala harr…","""exchange commission;democratic…","""0.575373993095512,3.6823935558…","""1192922517,1192919729,11929471…","""menafn.com""","""https://menafn.com/1108559863/…"
20240816,1,,"""WB_135_TRANSPORT;WB_1803_TRANS…","""1#Russia#RS#RS#60#100#RS;1#Kyr…","""zara araz""","""georgia marabda-kartsakhi rail…","""7.24191063174114,7.85824345146…",,"""menafn.com""","""https://menafn.com/1108561303/…"
20240816,1,"""KILL#73##1#Yemen#YM#YM#15.5#47…","""TAX_FNCACT;TAX_FNCACT_LEADER;T…","""1#United Kingdom#UK#UK#54#-4#U…","""abdul-malik al-houthi""","""xinhua""","""-3.73831775700935,1.4018691588…","""1192919099,1192919100,11929191…","""menafn.com""","""https://menafn.com/1108561289/…"
…,…,…,…,…,…,…,…,…,…,…
20240816,1,,"""GENERAL_HEALTH;MEDICAL;WB_1331…","""1#United States#US#US#39.82817…",,"""ministry of commerce""","""0.245700245700246,1.4742014742…","""1192964270,1192967611,11929645…","""menafn.com""","""https://menafn.com/1108561367/…"
20240816,1,,"""GENERAL_HEALTH;MEDICAL;TAX_FNC…","""4#Kolkata, West Bengal, India#…","""amit malviya;sanjay roy""","""rg kar medical college;kolkata…","""-15.5763239875389,0.3115264797…","""1192925519,1192965861,11929659…","""menafn.com""","""https://menafn.com/1108561335/…"
20240816,1,,,"""1#India#IN#IN#20#77#IN;4#Paris…","""sree bhai;harmanpreet singh;na…","""olympics""","""2.09205020920502,3.55648535564…",,"""menafn.com""","""https://menafn.com/1108561333/…"
20240816,1,,"""TAX_ETHNICITY;TAX_ETHNICITY_CH…","""1#China#CH#CH#35#105#CH;1#Japa…",,"""united nations;chinese academy…","""-2.31481481481481,1.2962962962…","""1192945111,1192945190,11929454…","""menafn.com""","""https://menafn.com/1108558756/…"


In [159]:
text_menanf_df= menanf_df["SOURCEURLS"].head(20).map_elements(multi_reader)

  text_menanf_df= menanf_df["SOURCEURLS"].head(20).map_elements(multi_reader)


In [158]:
 menanf_df["SOURCEURLS"][0]

'https://menafn.com/1108560592/PRA-Group-Leader-Wins-Silver-Stevie-Award-In-2024-International-Business-Awards'

In [161]:
text_menanf_df[0]

'\n"I am humbled to receive this prestigious award and to be recognized among top business leaders worldwide. This truly is a great honor, not just for me, but also for my team, as our shared success is highlighted on the global stage," said Kersey.\nKersey has more than twenty years of public affairs and government relations experience across various sectors. In her role at PRA Group, she leads public policy and communications efforts – internal and external global communications, branding, public and media relations, corporate philanthropy, community engagement and government relations. Kersey is a graduate of Virginia Tech and Old Dominion University. She serves on numerous executive and advisory boards, including the board of directors of Virginia FREE and the Virginia Chamber of Commerce.\nThe International Business Awards are the world\'s premier business awards program. All individuals and organizations worldwide – public and private, for-profit and non-profit, large and small –

In [162]:
etf_daily_df= df.filter(pl.col("SOURCEURLS").str.contains("etfdaily"))
etf_daily_df

DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS
i64,i64,str,str,str,str,str,str,str,str,str
20240816,1,,"""ECON_STOCKMARKET;TAX_FNCACT;TA…","""1#Mexico#MX#MX#23#-102#MX;3#Mi…","""los angeles;inmobiliaria vesta""","""luxurban hotel;corphousing gro…","""3.39256865912762,4.52342487883…",,"""etfdailynews.com""","""https://www.etfdailynews.com/2…"
20240816,1,,"""ENV_MINING;ECON_STOCKMARKET;TA…","""1#Mexico#MX#MX#23#-102#MX;1#Ca…",,"""agnico eagle mines limited;zac…","""2.62257696693273,2.73660205245…",,"""etfdailynews.com""","""https://www.etfdailynews.com/2…"
20240816,1,,"""MANMADE_DISASTER_IMPLIED;EPU_E…","""2#Louisiana, United States#US#…","""robert w baird;zurcher kantona…","""sumitomo mitsui trust holdings…","""4.67005076142132,4.87309644670…",,"""etfdailynews.com""","""https://www.etfdailynews.com/2…"
20240816,1,,"""TAX_ECON_PRICE;TAX_FNCACT;TAX_…","""1#United States#US#US#39.82817…","""raymond jame;raymond james;rog…","""whitcomb hess inc;deuterium ca…","""1.26728110599078,1.95852534562…",,"""etfdailynews.com""","""https://www.etfdailynews.com/2…"
20240816,1,,"""TAX_ECON_PRICE;TAX_FNCACT;TAX_…","""1#United States#US#US#39.82817…","""robert w eddy;jeff desroches;r…","""united states;kayne anderson r…","""1.65700082850041,1.82270091135…",,"""etfdailynews.com""","""https://www.etfdailynews.com/2…"
…,…,…,…,…,…,…,…,…,…,…
20240816,1,,"""TAX_ECON_PRICE;LEADER;TAX_FNCA…","""1#United States#US#US#39.82817…","""robert w baird;donald e bobo j…","""barclays;goldman sachs group;s…","""0.86848635235732,2.10918114143…",,"""etfdailynews.com""","""https://www.etfdailynews.com/2…"
20240816,1,,"""ECON_STOCKMARKET;TAX_FNCACT;TA…","""1#United States#US#US#39.82817…",,"""origin materials company profi…","""1.43312101910828,2.07006369426…",,"""etfdailynews.com""","""https://www.etfdailynews.com/2…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_ANALYST;…","""1#Italy#IT#IT#42.833333#12.833…",,"""john wood group;news ratings f…","""3.65358592692828,4.73612990527…","""1192965669,1192966321,11929663…","""etfdailynews.com""","""https://www.etfdailynews.com/2…"
20240816,1,,"""WB_698_TRADE;TAX_FNCACT;TAX_FN…","""1#United States#US#US#39.82817…","""timothy patrick flynn;alice l …","""royal bank;walmart inc;sam clu…","""2.07650273224044,2.29508196721…","""1192937938""","""etfdailynews.com""","""https://www.etfdailynews.com/2…"


In [171]:
text_etf_daily_df=etf_daily_df["SOURCEURLS"].head(10).map_elements(multi_reader)

  text_etf_daily_df=etf_daily_df["SOURCEURLS"].head(10).map_elements(multi_reader)


In [173]:
print(text_etf_daily_df[0])

Corporación Inmobiliaria Vesta (<a href="https://www.marketbeat.com/stocks/NYSE/VTMX/">NYSE:VTMX</a> – <a href="https://www.marketbeat.com/arnreports/ReportTickerOptin.aspx?RegistrationCode=TickerHyperlink&amp;Prefix=NYSE&amp;Symbol=VTMX" style="font-weight:normal;text-decoration:italic;color:green">Get Free Report</a>) and LuxUrban Hotels (<a href="https://www.marketbeat.com/stocks/NASDAQ/LUXH/">NASDAQ:LUXH</a> – <a href="https://www.marketbeat.com/arnreports/ReportTickerOptin.aspx?RegistrationCode=TickerHyperlink&amp;Prefix=NYSE&amp;Symbol=VTMX" style="font-weight:normal;text-decoration:italic;color:green">Get Free Report</a>) are both finance companies, but which is the superior stock? We will compare the two companies based on the strength of their dividends, risk, profitability, institutional ownership, earnings, analyst recommendations and valuation. 
This table compares Corporación Inmobiliaria Vesta and LuxUrban Hotels’ net margins, return on equity and return on assets.
This t

In [175]:
guardian_df= df.filter(pl.col("SOURCEURLS").str.contains("guardian"))
guardian_df

DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS
i64,i64,str,str,str,str,str,str,str,str,str
20240816,4,,"""NATURAL_DISASTER;NATURAL_DISAS…","""1#Puerto Rico#RQ#RQ#18.2359#-6…","""alejandro granadillo""","""national hurricane centre in m…","""-4.98442367601246,1.2461059190…",,"""greenocktelegraph.co.uk;wharfe…","""https://www.greenocktelegraph.…"
20240816,1,,"""LEADER;TAX_FNCACT;TAX_FNCACT_M…","""4#Winsford, Somerset, United K…","""cllr wayne fletcher""","""compass minerals;winsford town…","""1.99004975124378,1.99004975124…",,"""northwichguardian.co.uk""","""https://www.northwichguardian.…"
20240816,1,,"""GEN_HOLIDAY;TAX_FNCACT;TAX_FNC…","""4#Morecambe Bay, Cumbria, Unit…",,,"""3.6036036036036,4.054054054054…",,"""lancasterguardian.co.uk""","""https://www.lancasterguardian.…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_AUTHOR;T…","""1#Australia#AS#AS#-25#135#AS;4…","""megan doherty;kelly canby;emma…","""university of canberra emma ja…","""3.52220520673813,5.28330781010…",,"""mudgeeguardian.com.au""","""https://www.mudgeeguardian.com…"
20240816,1,"""KILL#100##2#Rhode Island, Unit…","""CRISISLEX_CRISISLEXREC;CRISISL…","""2#Rhode Island, United States#…","""jack russell jack russell;bret…","""instagram;family fund""","""0.553505535055351,4.2435424354…",,"""northwichguardian.co.uk""","""https://www.northwichguardian.…"
…,…,…,…,…,…,…,…,…,…,…
20240816,1,,"""SOC_GENERALCRIME;EPU_CATS_MIGR…","""4#Llandeilo, Carmarthenshire, …",,"""dyfed-powys police;facebook""","""-2.94117647058823,1.1764705882…","""1192919590""","""southwalesguardian.co.uk""","""https://www.southwalesguardian…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_WOMEN;MA…","""4#Port-Of-Spain, Port-Of-Spain…","""mary main""","""bank lim;st jude school""","""0.0679809653297078,0.951733514…",,"""guardian.co.tt""","""https://www.guardian.co.tt/new…"
20240816,1,,"""TAX_FNCACT;TAX_FNCACT_GUARD;SO…","""4#Hiroshima, Hiroshima, Japan#…","""linden blue""","""atomics aeronautical systems i…","""0.819672131147541,3.6885245901…",,"""seapowermagazine.org""","""https://seapowermagazine.org/j…"
20240816,2,,"""PROTEST;EXTREMISM;TAX_FNCACT;T…","""4#London, London, City Of, Uni…","""rex william henry clark""","""westminster magistrate court""","""-9.4017094017094,0.42735042735…","""1193035526,1193035527,11930362…","""wiltshiretimes.co.uk;wimbledon…","""https://www.wiltshiretimes.co.…"


In [None]:
text_guardian_df=guardian_df["SOURCEURLS"].head(10).map_elements(multi_reader)


In [178]:
text_guardian_df[0]



In [3]:
import re

def remove_html_tags_and_content(text: str)->str:
    """
    Remove HTML tags and their content from a given string.
    
    Args:
    text (str): The input string containing HTML content.
    
    Returns:
    str: The string with HTML tags and their content removed.
    """
    pattern = re.compile(r'<[^>]*>.*?</[^>]*>', re.DOTALL)
    clean_text = re.sub(pattern, '', text)
    return clean_text

input_string = 'Here is an example <a href="https://www.marketbeat.com/stocks/NASDAQ/EXC/" rel="noopener" target="_blank">EXC stock</a> in the text.'
cleaned_string = remove_html_tags_and_content(input_string)
print(cleaned_string)



Here is an example  in the text.
