In [2]:
import streamlit as st
import feedparser
#force Python to skip SSL certificate verification for HTTPS connections
import ssl
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

In [4]:
'''
function takes a list of websites as input and looks at all useful key value pairs for all news articles on all websites provided
useful keys are:
'title', 'summary', 'content' as these are the most likely locations to find the key words related to accidents
'link' to output the direct access to the article that is found to be related to an accident
'published' to make sure the date of the article is current/output the date to the user
function outputs "articles" (a list of dictionaries where each dictionary contains the relevant information for one article)
'''
def extract_articles(websites: list):
    articles = [] #list with information of all articles
    for website in websites:
        feed = feedparser.parse(website) #feed rss data in
        for entry in feed.entries:
            article_data = {} #dict for current article

            for key in ("title", "summary", "link", "published"): #loop through the important keys and add these key/value pairs to our current article dict
                if key in entry:
                    article_data[key] = entry[key]

            if "content" in entry and isinstance(entry["content"], list): #make sure content key/value info is available for this article and that its a list. also make sure content has "value" inside since thats where the content informaiton is in the content
                content_item = entry["content"][0]
                if "value" in content_item:
                    article_data["content"] = content_item["value"]

            articles.append(article_data)
    
    return articles
    

'''
function takes "articles" (list of dictionaries where each dictionary contains the relevant information for one article) and "keywords" (list of keywords we want to find articles about) as input
outputs "relevant_articles" list of articles that contain at least one of the provided keywords
'''
def get_relevant_articles(articles: list, keywords: list):   
    relevant_articles = {}
    count = 1
    for article in articles:
        for key, value in article.items(): #for each key value pair for each article, we confirm that the value is a string and then check if any of the keywords are in the value
            if isinstance(value, str) and any(keyword.lower() in value.lower() for keyword in keywords):#if it is we take the article title, link, and publish time and put it in the dict relevant_articles which is the output
                title = article.get('title')
                link = article.get('link')
                published = article.get('published')
                relevant_articles[count] = {'Article Title': title, 'Article Link': link, 'Date and Time Published': published}
                count +=1
                break
            
    return relevant_articles






get_relevant_articles(extract_articles(['https://feeds.nbcnews.com/nbcnews/public/news',"http://rss.cnn.com/rss/cnn_us.rss"]), ["", "fire"])


# # Use this code to see all of the articles
# def view_all(website: str):
#     article_dictionary = feedparser.parse(website) #load in rss news feed site and put all information in dict
#     for i in article_dictionary.entries:
#         print("\n--------------")
#         for key, value in i.items():
#             print(f"{key}: {value}")

# view_all('http://rss.cnn.com/rss/cnn_us.rss')





{1: {'Article Title': '‘What is left to occupy?’: Displaced Gazans question Netanyahu’s expansion plan',
  'Article Link': 'https://www.nbcnews.com/video/displaced-gazans-react-to-netanyahu-occupation-plan-244529733943',
  'Date and Time Published': 'Thu, 07 Aug 2025 18:35:49 GMT'},
 2: {'Article Title': 'Trump Says US Will Be ‘Very Rich’ as Tariffs Go Into Effect',
  'Article Link': 'https://www.today.com/video/trump-s-broad-tariffs-take-effect-against-nearly-100-countries-244505157738',
  'Date and Time Published': 'Thu, 07 Aug 2025 11:24:04 GMT'},
 3: {'Article Title': 'Trump demands resignation of Intel CEO over alleged China ties',
  'Article Link': 'https://www.nbcnews.com/business/business-news/trump-demands-intel-ceo-resign-rcna223594',
  'Date and Time Published': 'Thu, 07 Aug 2025 13:11:17 GMT'},
 4: {'Article Title': "India's Modi says he is ready to 'pay a big price' in the face of U.S. tariffs",
  'Article Link': 'https://www.nbcnews.com/politics/trump-administration/live-

In [None]:
st.set_page_config(page_title="Incident Feed", layout="wide") #set title that is shown in browser tab

st.title("Lab & Environmental Emergency News Monitor") #set title shown at top of webpage

with st.sidebar: #sidebar
    st.header("User Inputs:") #header for the sidebar
    
    rss_input = st.text_area("RSS Feed URLs (one per line)",  #creates a field for the web feed url inputs with a couple links put in by default for the user 
        value="""https://feeds.nbcnews.com/nbcnews/public/news
http://rss.cnn.com/rss/cnn_us.rss""")
    
    keyword_input = st.text_input("Desired Keywords (comma-separated)", value="lab, fire, explosion, chemical, environmental") #creates a field for the user to input keywords, with a list of default words

    run_search = st.button("Run News Scan") #creates a button with text telling the user what the button does

if run_search: #if the user clicks the button
    rss_feeds = [url.strip() for url in rss_input.strip().splitlines() if url.strip()] #we want to take all the text in the rss_feeds input box, and process them into a list of URLs
    keywords = [kw.strip().lower() for kw in keyword_input.split(",") if kw.strip()] #take all the text in keyword_input input box, split them by comma, if keyword.strip is not empty, append the lowercased keyword
    
    with st.spinner("Scanning feeds for relevant articles..."): # when user hits run button give this loading text
        articles = extract_articles(rss_feeds) #then run the two functions to get the relevant articles
        filtered_articles = get_relevant_articles(articles, keywords)

    st.subheader(f"Found {len(filtered_articles)} relevant articles!") #once the above has run, we output the text with the number of articles found

    for counter, article in filtered_articles.items(): #loop through the filtered_articles dictionary from our function
        st.markdown(f"### {counter}. {article['Article Title']}") #output the counter number (key) and the portion of the value that contains each info we want
        st.markdown(f"**Published:** {article['Date and Time Published']}")
        st.markdown(f"[Read Article]({article['Article Link']})") #create a hyperlink, user sees the text inside [] and text in () is the link
        st.markdown("---") #divider

In [None]:
def get_relevant_articles(articles: list, keywords: list):
    relevant_articles = {}
    count = 1

    keyword_patterns = {
        keyword: re.compile(rf'\b{re.escape(keyword)}\b', re.IGNORECASE)
        for keyword in keywords
    }

    for article in articles:
        matched_keywords = set()

        for key, value in article.items():
            if isinstance(value, str):
                for keyword, pattern in keyword_patterns.items():
                    if pattern.search(value):
                        matched_keywords.add(keyword)

        if matched_keywords:
            relevant_articles[count] = {
                'Article Title': article.get('title'),
                'Article Link': article.get('link'),
                'Date and Time Published': article.get('published'),
                'Matched Keywords': sorted(matched_keywords)  # sorted for consistency
            }
            count += 1

    return relevant_articles


In [6]:
import re

In [18]:
def replace_tag_with_boundary(match, text): 
    before = text[:match.start()] 
    if re.search(r'[.!?]"?\s*$', before): 
        return ' '
    else:
        return '. '
    

text = '''</p><p>"As the authors point out, the actual stevia plant does not seem to have any benefit for stopping cancer, so they had to use a chemical process to change the plant and make it stronger with a fermentation process."</p><p>Oberstein recommended approaching this with caution, as it is unknown whether altering the plant will lead to side effects or toxicity.</p><p>"'''


text = re.sub(      
    r'(</p>|<br\s*/?>|</div>)',
    lambda m: replace_tag_with_boundary(m, text),
    text,
    flags=re.IGNORECASE
)
cleaned_value = re.sub(r'<[^>]+>', ' ', text)
cleaned_value = re.sub(r'\s+', ' ', cleaned_value).strip()


print(re.split(r'(?<=[.!?])"?(?=\s+)', cleaned_value) )


print(re.split(r'(?<=[.!?])(?=\s+"?)', cleaned_value))

print(re.split(r'(?<=[.!?]")(?=\s+)|(?<=[.!?])(?=\s+)', cleaned_value))



['.', ' "As the authors point out, the actual stevia plant does not seem to have any benefit for stopping cancer, so they had to use a chemical process to change the plant and make it stronger with a fermentation process.', ' Oberstein recommended approaching this with caution, as it is unknown whether altering the plant will lead to side effects or toxicity.', ' "']
['.', ' "As the authors point out, the actual stevia plant does not seem to have any benefit for stopping cancer, so they had to use a chemical process to change the plant and make it stronger with a fermentation process." Oberstein recommended approaching this with caution, as it is unknown whether altering the plant will lead to side effects or toxicity.', ' "']
['.', ' "As the authors point out, the actual stevia plant does not seem to have any benefit for stopping cancer, so they had to use a chemical process to change the plant and make it stronger with a fermentation process."', ' Oberstein recommended approaching th

In [2]:
import sqlite3

conn = sqlite3.connect("articles.db")
c = conn.cursor()
c.execute("SELECT * FROM articles LIMIT 10")  # show first 10 rows
rows = c.fetchall()
for row in rows:
    print(row)
conn.close()

(1, 'Brazoria Co. officials issue an all-clear after gas pipeline rupture in Hillcrest Village near Alvin', 'https://abc13.com/post/crews-responding-natural-gas-pipeline-rupture-hillcrest-village-alvin-brazoria-county-officials-say/17597340/', '2025-08-20 23:56:22', 'gas leak', 'Brazoria County officials issued an all-clear after they said a natural **gas leak** was contained in Hillcrest Village near Alvin on Wednesday.')
(2, 'All lanes open after multi-vehicle crash involving 18-wheelers caused fuel and oil spill on IH-10', 'https://www.12newsnow.com/article/traffic/multi-vehicle-crash-involving-18-wheelers-causes-fuel-oil-spill-shuts-down-ih-10-westbound/502-3a587fc6-89d9-4e8f-bb8a-0aca6540ca8b', '2025-08-18 03:41:10', 'oil spill', 'Keyword found in article title and/or URL.')
(3, 'Baltimore shipping channel reopens after explosion on coal ship', 'https://www.wmar2news.com/news/region/baltimore-city/baltimore-shipping-channel-reopens-after-explosion-on-coal-ship', '2025-08-19 20:36:

In [1]:
import sqlite3

conn = sqlite3.connect("articles.db")
c = conn.cursor()
c.execute("SELECT * FROM articles")
rows = c.fetchall()
for row in rows:
    print(row)
conn.close()

(1, 'NTSB releases preliminary report into train derailment that seriously injured conductor at Denver railyard', 'https://www.denver7.com/news/front-range/denver/ntsb-releases-preliminary-report-into-train-derailment-that-seriously-injured-conductor-at-denver-railyard', '2025-09-02 22:05:33', 'Train Derailment', 'DENVER The National Transportation Safety Board (NTSB) released its preliminary report into a **train derailment** that seriously injured a conductor at a BNSF railyard in Denver last month. \n\n The National Transportation Safety Board (NTSB) released its preliminary report into a **train derailment** that seriously injured a conductor at a BNSF railyard in Denver last month.')
(2, "Company to store 'hazardous materials' in St. Charles' wellhead district", 'https://www.stltoday.com/news/local/stcharles/article_5e00d472-3d71-480e-b409-8eb5f1c028ab.html', '2025-09-03 07:10:00', 'Hazardous Materials', 'Keyword found in article title and/or URL.')
(3, "EPA: Nearly 2.5 million ga

In [6]:
import sqlite3

conn = sqlite3.connect("articles.db")
c = conn.cursor()

# Example: delete one row by ID
# article_id = 9
# c.execute("DELETE FROM articles WHERE id = ?", (article_id,))
c.execute("DELETE FROM articles WHERE id BETWEEN ? AND ?", (40, 63))

conn.commit()  # Save changes
conn.close()

In [4]:
import os
data_path= "./input.txt"
lbl_path = data_path.replace("input", "labels")
print(lbl_path)

./labels.txt


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Read sentences from input.txt
sentences = []
with open("model_training/input.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:  # skip empty lines
            sentences.append(line)

# Convert sentences to bag-of-words vectors
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences).toarray()

print(X)
print("Shape:", X.shape)  # (num_sentences, vocab_size)
print(X[0])


FileNotFoundError: [Errno 2] No such file or directory: 'model_training/input.txt'