In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

In [3]:
url = "https://www.nytimes.com/2024/10/01/world/middleeast/israel-lebanon-invasions-history.html?smid=url-share"
response = requests.get(url)
soup = BeautifulSoup(response.content)

In [4]:
paragraphs = soup.find_all('p')
text = "".join([para.text for para in paragraphs])

In [5]:
print(text)

Please enable JS and disable any ad blocker


In [42]:
import spacy

In [43]:
from spacy import displacy

In [44]:
nlp = spacy.load('en_core_web_sm')

In [45]:
doc = nlp(text)

In [46]:
# for sent in doc.sents:
#     displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [47]:
import numpy as np
A = np.array([[6,0,0],[0,3,0],[0,0,2]])
U,S,Vt = np.linalg.svd(A)

In [48]:
print("Matrix A:")
print(A)
print("\nU (Left singular vectors)")
print(U)
print("\nS (singular values)")
print(S)
print("\nV (Right singular vectors)")
print(Vt)

Matrix A:
[[6 0 0]
 [0 3 0]
 [0 0 2]]

U (Left singular vectors)
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]

S (singular values)
[6. 3. 2.]

V (Right singular vectors)
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [49]:
sigma = np.diag(S)
A_reconstructed = np.dot(U,np.dot(sigma,Vt))

In [50]:
A_reconstructed

array([[6., 0., 0.],
       [0., 3., 0.],
       [0., 0., 2.]])

In [51]:
### can also do
np.matmul(U, np.matmul(sigma, Vt))

array([[6., 0., 0.],
       [0., 3., 0.],
       [0., 0., 2.]])

In [52]:
import sympy as sp

In [53]:
A_sp = sp.Matrix(A)
A_sp

Matrix([
[6, 0, 0],
[0, 3, 0],
[0, 0, 2]])

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np

In [55]:
documents = [
    "Data mining is the process of discovering patterns in large data sets.",
    "Machine learning algorithms can learn from data to make predictions.",
    "Datamining and machine learning are closely related fields."
]

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

In [57]:
lsa = TruncatedSVD(n_components=2, random_state=42)
X_lsa = lsa.fit_transform(X)

In [58]:
terms = vectorizer.get_feature_names_out()

print("Term Document Matrix (TFIDF)")
print(X.toarray())

print("\nComponents (terms) after SVD")
for i, component in enumerate(lsa.components_):
    term_in_component = zip(terms,component)
    sorted_terms = sorted(term_in_component, key=lambda x: x[1], reverse=True)
    print(f"Component {i}:")
    for term, weight in sorted_terms:
        print(f"{term}: {weight:.4f}")

print("\nDocument Representations in Reduced Space:")
print(X_lsa)

similarity = np.dot(X_lsa, X_lsa.T)
print("\nDocument Similarity Matrix:")
print(similarity)

Term Document Matrix (TFIDF)
[[0.         0.         0.52753275 0.         0.34682109 0.
  0.34682109 0.         0.         0.         0.         0.34682109
  0.34682109 0.         0.34682109 0.         0.34682109]
 [0.41756662 0.         0.31757018 0.         0.         0.
  0.         0.41756662 0.31757018 0.31757018 0.41756662 0.
  0.         0.41756662 0.         0.         0.        ]
 [0.         0.44036207 0.         0.44036207 0.         0.44036207
  0.         0.         0.3349067  0.3349067  0.         0.
  0.         0.         0.         0.44036207 0.        ]]

Components (terms) after SVD
Component 0:
data: 0.4039
learning: 0.3642
machine: 0.3642
algorithms: 0.2619
learn: 0.2619
make: 0.2619
predictions: 0.2619
closely: 0.2170
datamining: 0.2170
fields: 0.2170
related: 0.2170
discovering: 0.1346
large: 0.1346
mining: 0.1346
patterns: 0.1346
process: 0.1346
sets: 0.1346
Component 1:
data: 0.4144
discovering: 0.2725
large: 0.2725
mining: 0.2725
patterns: 0.2725
process: 0.2

In [59]:
from bs4 import BeautifulSoup
import requests
def scrape_wiki_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None

In [60]:
url = "https://en.wikipedia.org/wiki/Nvidia"
def extract_text_from_html(html):
    soup = BeautifulSoup(html,'html.parser')
    paragraphs = soup.find_all('p')
    text = '\n'.join([p.get_text() for p in paragraphs])
    return text

In [114]:
def perform_ner(text):
    doc = nlp(text)
    dates = [(ent.text,ent.start_char,ent.end_char) for ent in doc.ents if ent.label_ == 'DATE']
    persons = [(ent.text,ent.start_char,ent.end_char) for ent in doc.ents if ent.label_ == 'PERSON']
    return dates,persons

In [115]:
def extract_events(text,dates):
    events = []
    for date, start, end in dates:
        start_context = max(0,start-50)
        end_context = min(len(text), end+50)
        event_context = text[start_context:end_context]
        return event_text

In [116]:
content = scrape_wiki_page(url)
print(content)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>United States Military Academy - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-client

In [117]:
text = extract_text_from_html(content)
text

'\n\nThe United States Military Academy (USMA), also referred to metonymically as West Point or simply as Army,[7] is a United States service academy in West Point, New York. It was originally established as a fort during the American Revolutionary War, as it sits on strategic high ground overlooking the Hudson River 50 miles (80\xa0km) north of New York City. It is the oldest of the five American service academies and educates cadets for commissioning into the United States Army.\n\nCandidates for admission must apply directly to the academy and receive a nomination, usually from a member of Congress. Other nomination sources include the president and vice president.[8] Students are officers-in-training and are referred to as "cadets" or collectively as the "United States Corps of Cadets" (USCC). The Army fully funds tuition for cadets in exchange for an active duty service obligation upon graduation. About 1,300 cadets enter the academy each July, with about 1,000 cadets graduating. 

In [118]:
dates,people = perform_ner(text)
dates,people

([('each July', 945, 954),
  ('winter', 1717, 1723),
  ('the early and mid-20th\xa0century', 1836, 1866),
  ('27 January', 2467, 2477),
  ('Between 1778 and 1780', 2568, 2589),
  ('1801', 4243, 4247),
  ('1802', 4605, 4609),
  ('4 July 1802.[24', 4697, 4712),
  ('October 1802', 4790, 4802),
  ('1812', 4845, 4849),
  ('early years', 4881, 4892),
  ('age from 10 years to', 4980, 5000),
  ('37 years', 5001, 5009),
  ('between 6\xa0months to', 5023, 5042),
  ('1812', 5077, 5081),
  ('1817', 5240, 5244),
  ('two years', 5628, 5637),
  ('the first half of the 19th\xa0century', 5971, 6005),
  ('1824', 6275, 6279),
  ('1835', 6447, 6451),
  ('first year', 6471, 6481),
  ('1835', 6769, 6773),
  ('the 1850s', 7459, 7468),
  ('1868', 8749, 8753),
  ('1870', 8759, 8763),
  ('1877', 9061, 9065),
  ('1880', 9212, 9216),
  ('first 65\xa0years', 9624, 9638),
  ('3 December 1900', 10008, 10023),
  ('the late 20th century', 10434, 10455),
  ('1899', 10590, 10594),
  ('1901', 10676, 10680),
  ('15-year',

In [139]:
def extract_events(text,dates):
    events = []
    for date, start, end in dates:
        start_context = max(0,start-100)
        end_context = min(len(text), end+100)
        event_text = text[start_context:end_context]
        events.append(f"Date: {date}\nEvent: {event_text}")
    return events

In [140]:
def scrape_for_events(url):
    html_content = scrape_wiki_page(url)
    if html_content:
        text_content = extract_text_from_html(html_content)
        dates,persons = perform_ner(text_content)
        events = extract_events(text_content, dates)
        event_string = '\n\n'.join(events)
        print(event_string)
        return event_string
    else:
        return None

In [141]:
scrape_for_events(r"https://en.wikipedia.org/wiki/Large_language_model")

Date: August
Event:  self-supervised and semi-supervised training process.[1]

The largest and most capable LLMs, as of August 2024[update], are artificial neural networks built with a decoder-only transformer-based architectu

Date: 2017
Event: models initially released with Watsonx, Anthropic's Claude models, and Mistral AI's models.

Before 2017, there were a few language models that were large as compared to capacities then available. In the 

Date: the 1990s
Event: 2017, there were a few language models that were large as compared to capacities then available. In the 1990s, the IBM alignment models pioneered statistical language modelling. A smoothed n-gram model in 2001

Date: 2001
Event: 990s, the IBM alignment models pioneered statistical language modelling. A smoothed n-gram model in 2001 trained on 0.3 billion words achieved then-SOTA (state of the art) perplexity.[5] In the 2000s, as 

Date: the 2000s
Event:  model in 2001 trained on 0.3 billion words achieved then-SOTA (sta

'Date: August\nEvent:  self-supervised and semi-supervised training process.[1]\n\nThe largest and most capable LLMs, as of August\xa02024[update], are artificial neural networks built with a decoder-only transformer-based architectu\n\nDate: 2017\nEvent: models initially released with Watsonx, Anthropic\'s Claude models, and Mistral AI\'s models.\n\nBefore 2017, there were a few language models that were large as compared to capacities then available. In the \n\nDate: the 1990s\nEvent: 2017, there were a few language models that were large as compared to capacities then available. In the 1990s, the IBM alignment models pioneered statistical language modelling. A smoothed n-gram model in 2001\n\nDate: 2001\nEvent: 990s, the IBM alignment models pioneered statistical language modelling. A smoothed n-gram model in 2001 trained on 0.3 billion words achieved then-SOTA (state of the art) perplexity.[5] In the 2000s, as \n\nDate: the 2000s\nEvent:  model in 2001 trained on 0.3 billion words 