In [None]:
!python -m spacy download en_core_web_sm

In [None]:
!pip install folium pandas ipyleaflet geopy

In [None]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import time
import spacy
import geopy
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import datetime as dt

def clean_url(searched_items, data_filter):
    """
    Build the Google News RSS search URL for the given keywords and date filter.

    OUTPUT : url (str) to be fetched for the searched_items and data_filter
    ---------------------------------------------------
    Parameters:
      searched_items : list of keyword strings. A single string is treated as
          one keyword. Each keyword is stripped and URL-encoded; blank entries
          are ignored.
      data_filter : one of
          'today'      - news published after yesterday's date
          'this_week'  - news from 7 days ago up to today's date
          'this_month' - news from 30 days ago up to today's date
          'this_year'  - news published after the previous calendar year
          number       - int/str number of days to look back
          '' / None / any other value - no date restriction
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote_plus

    # A bare string would otherwise be joined character by character.
    if isinstance(searched_items, str):
        searched_items = [searched_items]

    today = dt.date.today()

    def days_ago(days):
        # ISO date (YYYY-MM-DD) of the day `days` days before today.
        return (today - dt.timedelta(days=days)).isoformat()

    def window(days):
        # "%3A" is the URL-encoded ":" of the after:/before: search operators.
        return 'after%3A' + days_ago(days) + '+before%3A' + today.isoformat()

    if data_filter == 'today':
        date_clause = 'after%3A' + days_ago(1)
    elif data_filter == 'this_week':
        date_clause = window(7)
    elif data_filter == 'this_month':
        date_clause = window(30)
    elif data_filter == 'this_year':
        date_clause = 'after%3A' + str(today.year - 1)
    elif str(data_filter).isdigit():
        date_clause = window(int(data_filter))
    else:
        date_clause = ''

    # Strip the whitespace left over from "a, b, c" style input and encode each
    # keyword so spaces and reserved characters cannot corrupt the URL.
    terms = [quote_plus(str(item).strip()) for item in searched_items if str(item).strip()]
    if date_clause:
        terms.append(date_clause)

    query = '+'.join(terms)
    return f'https://news.google.com/rss/search?q={query}&hl=en-US&gl=US&ceid=US%3Aen'

def extract_locations(text):
    """
    Extracts location entities from the given text using spaCy's NER.

    Parameters:
      text : str to analyse. Empty, whitespace-only or None input yields [].

    OUTPUT : list of entity strings labelled 'GPE' by the "en_core_web_sm"
    model, in document order, excluding 'openai' and 'genai'
    (compared case-insensitively).
    """
    # Nothing to analyse: skip loading/running the model altogether.
    if not text or not text.strip():
        return []

    # Load the spaCy pipeline once and reuse it on later calls; loading it on
    # every call is by far the most expensive step when processing many articles.
    nlp = getattr(extract_locations, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        extract_locations._nlp = nlp

    # Process the text with spaCy
    doc = nlp(text)

    # Names excluded from the result even when tagged as GPE.
    excluded = {'openai', 'genai'}
    return [
        ent.text
        for ent in doc.ents
        if ent.label_ == 'GPE' and ent.text.lower() not in excluded
    ]

def geocode_location(location):
    """
    Geocodes the given location to get latitude and longitude coordinates.

    Parameters:
      location : place name to look up with the Nominatim geocoder.

    OUTPUT : (latitude, longitude) of the match, or ("NA", "NA") when the
    input is empty, no match is found, or the lookup fails.
    """
    not_available = ("NA", "NA")

    # Empty input cannot be geocoded; avoid a pointless network request.
    if not location or not str(location).strip():
        return not_available

    # Build the geocoder once and reuse it across calls.
    geolocator = getattr(geocode_location, "_geolocator", None)
    if geolocator is None:
        geolocator = Nominatim(user_agent="news_app")  # You can change the user_agent
        geocode_location._geolocator = geolocator

    try:
        location_info = geolocator.geocode(location)
    except Exception:
        # Best-effort lookup: any service/network failure maps to "NA".
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return not_available

    # geocode() returns None when nothing matches the query.
    if location_info is None:
        return not_available
    return location_info.latitude, location_info.longitude

def _child_text(item, path, default=''):
    """Return the text of the child element at `path`, or `default` if it is missing or empty."""
    node = item.find(path)
    if node is None or not node.text:
        return default
    return node.text


def _fetch_article_text(url, timeout=10):
    """Return the joined text of all <p> tags at `url`, or '' if the page cannot be fetched."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # One unreachable article must not abort the whole run.
        return ''
    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.text for p in soup.find_all('p'))


def get_news_for_keywords(keywords, data_filter=None):
    """
    Search through Google News with the "keywords" and return the matching
    headlines together with locations extracted from each article.

    Parameters:
      keywords : list of keyword strings; surrounding whitespace is ignored
          and blank entries are dropped. Only items whose title contains
          every keyword (case-insensitive) are kept.
      data_filter : date restriction passed on to clean_url().

    OUTPUT : pandas DataFrame with the columns title, link, date, source,
    locations (comma-joined names or 'NA'), latitude, longitude
    (coordinates of the first location or 'NA') and keywords.

    Raises requests.RequestException if the RSS feed itself cannot be fetched.
    """
    # Normalise once: "ai, ethical" splits into ' ethical', which would never
    # match a title that starts with the word.
    keywords = [kw.strip() for kw in keywords if kw and kw.strip()]
    wanted = [kw.lower() for kw in keywords]

    url = clean_url(keywords, data_filter)
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Parse the raw bytes so the XML declaration decides the encoding.
    root = ET.fromstring(response.content)

    # Filter news articles containing all keywords
    matching_items = []
    for item in root.findall('.//channel/item'):
        lowered_title = _child_text(item, 'title').lower()
        if all(kw in lowered_title for kw in wanted):
            matching_items.append(item)

    # get the required data
    title = [_child_text(item, 'title') for item in matching_items]
    link = [_child_text(item, 'link') for item in matching_items]
    pubDate = [_child_text(item, 'pubDate') for item in matching_items]
    source = [_child_text(item, './/source', 'NA') for item in matching_items]

    # Extract locations from the article bodies
    locations = []
    latitudes = []
    longitudes = []
    geocode_cache = {}  # place name -> (lat, lon); avoids repeated lookups
    for article_url in link:
        # Get the text content of the news article from the source link
        article_text = _fetch_article_text(article_url)

        # Extract locations from the entire article content
        article_locations = extract_locations(article_text) if article_text else []
        locations.append(','.join(article_locations) if article_locations else 'NA')

        if article_locations:
            # Geocode only the first location to get latitude and longitude
            first_place = article_locations[0]
            if first_place not in geocode_cache:
                geocode_cache[first_place] = geocode_location(first_place)
            latitude, longitude = geocode_cache[first_place]
        else:
            latitude, longitude = "NA", "NA"

        latitudes.append(latitude)
        longitudes.append(longitude)

    # set the data frame
    df = pd.DataFrame({
        'title': title,
        'link': link,
        'date': pubDate,
        'source': source,
        'locations': locations,
        'latitude': latitudes,
        'longitude': longitudes,
    })
    df['keywords'] = ','.join(keywords)  # Add a column to identify the keywords
    # pubDate values are date strings, so no numeric epoch `unit` applies.
    df['date'] = pd.to_datetime(df['date'])
    return df

# Script entry point: prompt for keywords, fetch the matching news and save it to CSV.
if __name__ == "__main__":
    start = time.time()
    raw_terms = input('Enter your search terms separated by commas: ')
    # Users naturally type "a, b, c"; strip the space after each comma and
    # drop empty entries so the keywords are clean for searching and display.
    keywords = [term.strip() for term in raw_terms.split(',') if term.strip()]
    data_filter = "this_month"

    data = get_news_for_keywords(keywords, data_filter)
    print(f"News fetched for keywords: {', '.join(keywords)}")

    # utf-8-sig writes a BOM at the start of the file.
    data.to_csv('news_data.csv', encoding='utf-8-sig', index=False)
    elapsed = time.time() - start
    print("Total execution time:", elapsed)

Enter your search terms separated by commas: ai, ethical, issues
News fetched for keywords: ai,  ethical,  issues
Total execution time: 28.209688186645508


In [None]:
import folium
import pandas as pd
from bs4 import BeautifulSoup

# Build an interactive folium map of the saved news articles, grouped by location.
from html import escape

# Load your dataset (pandas reads the 'NA' placeholders as missing values by default)
data = pd.read_csv('news_data.csv')

# Make sure the coordinates are numeric; anything unparsable becomes NaN
data['latitude'] = pd.to_numeric(data['latitude'], errors='coerce')
data['longitude'] = pd.to_numeric(data['longitude'], errors='coerce')

# Create a map centered around an average location of your data
center_lat = data['latitude'].mean()
center_lon = data['longitude'].mean()
if pd.isnull(center_lat) or pd.isnull(center_lon):
    # No row has coordinates: fall back to a world view instead of a NaN centre
    mymap = folium.Map(location=[0, 0], zoom_start=2)
else:
    mymap = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Group articles by their first location, keeping the coordinates already
# stored in the CSV so no second round of geocoding requests is needed
articles_by_location = {}
for index, row in data.iterrows():
    # Skip rows without valid coordinates
    if pd.isnull(row['latitude']) or pd.isnull(row['longitude']):
        continue
    raw_locations = row['locations']
    if not isinstance(raw_locations, str):
        continue
    # Extract the first location from the list
    location = raw_locations.split(',')[0].strip()
    if not location or location == 'NA':
        continue
    entry = articles_by_location.setdefault(
        location,
        {'coords': [float(row['latitude']), float(row['longitude'])], 'articles': []},
    )
    entry['articles'].append({'title': row['title'], 'date': row['date'], 'link': row['link']})

# Add circles to the map with customized popups
for location, entry in articles_by_location.items():
    popup_parts = [f'<div><strong>Location:</strong> {escape(location)}<br>']
    for article in entry['articles']:
        # Titles and links come from an external feed: escape them so they
        # cannot inject markup into the generated page
        safe_title = escape(str(article['title']))
        safe_date = escape(str(article['date']))
        safe_link = escape(str(article['link']), quote=True)
        popup_parts.append(f'''
            <p><strong>Title:</strong> {safe_title}</p>
            <p><strong>Date:</strong> {safe_date}</p>
            <p><a href="{safe_link}" target="_blank">Go to Source</a></p>
            ''')
    popup_parts.append('</div>')
    popup = folium.Popup(''.join(popup_parts), max_width=300)  # Max width for better display
    folium.CircleMarker(location=entry['coords'], radius=10, popup=popup, color='blue', fill=True).add_to(mymap)

# Save the map to an HTML file
html_file = 'map_with_custom_tooltips_clickable.html'
mymap.save(html_file)

# Read the HTML file and parse it using BeautifulSoup.
# Explicit UTF-8 keeps non-ASCII titles intact regardless of the platform default.
with open(html_file, 'r', encoding='utf-8') as f:
    html_content = f.read()
soup = BeautifulSoup(html_content, 'html.parser')

# Insert the header content with a white opaque box
header_content = '''
<div style="position: absolute; top: 15px; left: 50%; transform: translateX(-50%); z-index: 1000;">
    <div style="background-color: rgba(255, 255, 255, 0.6); padding: 8px; border-radius: 14px;">
        <h1 style="color: black; font-weight: bold; font-family: Helvetica, sans-serif;font-size: 28px;">Global AI Ethical Issues</h1>
    </div>
</div>
'''
header_tag = soup.new_tag('div')
header_tag.append(BeautifulSoup(header_content, 'html.parser'))
soup.body.insert(0, header_tag)

# Write the updated HTML content back to the file
with open(html_file, 'w', encoding='utf-8') as f:
    f.write(str(soup))
