In [5]:
import os
import sys
import pandas as pd
from tqdm import tqdm
import requests
from sqlalchemy.orm import Session
from dotenv import load_dotenv

load_dotenv()
News_API_KEYS = os.environ.get('News_API_KEYS')
News_API_KEYS = News_API_KEYS.split(',')
News_API_KEYS = [key.strip() for key in News_API_KEYS]

# modify sys.path for it to contain the main repo path so we can import modules such as below
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from db.db_conn import engine, session_scope, ping_db
from db.models import DimensionOrganization, DimensionNews

In [2]:
ping_db(engine)

True

In [8]:
def get_organisation(organisation: str, next_api_key_index: int) -> pd.DataFrame:
    if organisation is None:
        raise ValueError('No organisation provided.')
    
    while next_api_key_index < len(News_API_KEYS):
        try:
            # Define the API endpoint and parameters
            url = "https://api.thenewsapi.com/v1/news/all"
           
            params = {
            'api_token': News_API_KEYS[next_api_key_index],  # Use the current API key
            'language': 'en',
            'search': f'"{organisation}"',
            'search_fields': ["title", "description"],
            'categories': ["business"],
            'published_after': '2022-01-01'
            }


            # Make the GET request
            response = requests.get(url, params=params)
            response.raise_for_status()  # Raises an error for bad responses
            news_data = response.json().get('data', [])
            
            return pd.DataFrame(news_data), next_api_key_index

        except Exception as e:
            if response.status_code == 402:
                print(f"API rate limit exceeded for key {next_api_key_index}. Switching to next API key.")
                next_api_key_index += 1
            else:
                print(f"Could not get news for {organisation} due to error: {e}")
                break

    return pd.DataFrame(), next_api_key_index  # Return empty DataFrame on error
    
def collect_news_data(session):
    news_data = []
    next_api_key_index = 0

    try:
        # Retrieve all organizations
        records = session.query(DimensionOrganization).all()
        for record in tqdm(records, total=len(records), desc="Collecting news data..."):
            organization_name = record.organization_name
            news_df, next_api_key_index = get_organisation(organization_name, next_api_key_index)

            # Ensure we have data to insert
            if not news_df.empty:
                for _, row in news_df.iterrows():
                    # Append each news item as a dictionary to the list, including uuid
                    news_data.append({
                        "organization_id": record.organization_id,  # Using organization_id for relationship
                        "uuid": row.get("uuid"),  # Add uuid field here
                        "title": row.get("title"),
                        "description": row.get("description"),
                        "keywords": row.get("keywords"),
                        "snippet": row.get("snippet"),
                        "url": row.get("url"),
                        "image_url": row.get("image_url"),
                        "language": row.get("language"),
                        "published_at": row.get("published_at"),
                        "source": row.get("source"),
                        "categories": row.get("categories")
                    })
    
    except Exception as e:
        print(e)
        print("Error occurred while collecting news data.")

    # Convert the list of dictionaries to a DataFrame
    news_df_final = pd.DataFrame(news_data)
    return news_df_final

def add_news_data_to_db(news_df, session: Session):
    try:
        # Iterate over each row in the DataFrame and insert into DimensionNews table if not a duplicate
        for _, row in news_df.iterrows():
            # Check if the news entry already exists using the uuid
            existing_entry = session.query(DimensionNews).filter_by(uuid=row.get("uuid")).first()

            # Only add the entry if it does not already exist
            if not existing_entry:
                news_entry = DimensionNews(
                    organization_id=row["organization_id"],
                    uuid=row.get("uuid"),  # Storing uuid from API
                    title=row.get("title"),
                    description=row.get("description"),
                    keywords=row.get("keywords"),
                    snippet=row.get("snippet"),
                    url=row.get("url"),
                    image_url=row.get("image_url"),
                    language=row.get("language"),
                    published_at=row.get("published_at"),
                    source=row.get("source"),
                    categories=row.get("categories")
                )
                session.add(news_entry)  # Add each news item to the session

        # Commit all entries at once after adding them to the session
        session.commit()
        print("News data added to DimensionNews table successfully.")

    except Exception as e:
        session.rollback()  # Rollback in case of error
        print(f"Error occurred while adding news data to database: {e}")

In [9]:
# Collect News Data
with session_scope() as session:
    news_df = collect_news_data(session)

Collecting news data...:   2%|▏         | 40/2593 [00:23<24:13,  1.76it/s]

API rate limit exceeded for key 0. Switching to next API key.


Collecting news data...:   5%|▌         | 140/2593 [01:24<25:16,  1.62it/s]

API rate limit exceeded for key 1. Switching to next API key.


Collecting news data...:   9%|▉         | 240/2593 [02:21<14:43,  2.66it/s]

API rate limit exceeded for key 2. Switching to next API key.


Collecting news data...:  13%|█▎        | 340/2593 [03:26<18:34,  2.02it/s]  

API rate limit exceeded for key 3. Switching to next API key.


Collecting news data...:  17%|█▋        | 440/2593 [04:26<21:22,  1.68it/s]

API rate limit exceeded for key 4. Switching to next API key.


Collecting news data...:  21%|██        | 540/2593 [05:28<25:24,  1.35it/s]

API rate limit exceeded for key 5. Switching to next API key.


Collecting news data...:  25%|██▍       | 640/2593 [06:18<13:34,  2.40it/s]

API rate limit exceeded for key 6. Switching to next API key.


Collecting news data...: 100%|██████████| 2593/2593 [07:16<00:00,  5.94it/s]  

API rate limit exceeded for key 7. Switching to next API key.





In [11]:
# Display or print the DataFrame for verification
news_df.head()

Unnamed: 0,organization_id,uuid,title,description,keywords,snippet,url,image_url,language,published_at,source,categories
0,69,418c9b39-ff7e-46dd-9a2b-13b08a47c88e,"Invert, Always Invert","Invert, Always Invert \r\n 05 May 2...",,"Background\n\nMunger, 99, is Buffett’s long-ti...",https://www.theinvestorspodcast.com/newsletter...,https://www.theinvestorspodcast.com/wp-content...,en,2023-05-08T08:02:08.000000Z,theinvestorspodcast.com,[business]
1,69,bb9c4f7d-99a9-4557-b39e-6264480b0c90,Invert Adds Further Carbon Credit Experience a...,"OTTAWA, Ontario — Invert Inc. (“Invert”), a s...",,"This advertisement has not loaded yet, but you...",https://financialpost.com/pmn/press-releases-p...,https://storage.googleapis.com/pmd-stage-north...,en,2022-03-03T13:24:04.000000Z,financialpost.com,"[business, general]"
2,69,f9f7b226-4073-4695-a298-9a0b65cacb4c,There's nothing stopping the 10-year Treasury ...,The Treasury yield curve looks on the path to ...,,There's nothing stopping bond yields from cont...,https://markets.businessinsider.com/news/bonds...,https://i.insider.com/641de48623738f0018bb0765...,en,2023-10-24T18:33:36.000000Z,businessinsider.com,"[business, tech]"
3,192,4a599c5e-3f4f-416a-8167-f94ce6edfc3b,Romania annuls election after alleged Russian ...,Top court orders rerun of presidential vote,,"Print this page\n\nRoula Khalaf, Editor of the...",https://www.ft.com/content/2248c05d-5536-4525-...,https://www.ft.com/__origami/service/image/v2/...,en,2024-12-06T14:11:14.000000Z,ft.com,"[general, business]"
4,192,02ae5916-0284-4b35-b675-b5ae734288c5,Goldman’s big bet,Nigerian opposition parties call for an electi...,,This is an audio transcript of the FT News Bri...,https://www.ft.com/content/f8e17310-485e-4554-...,https://www.ft.com/__origami/service/image/v2/...,en,2023-03-01T05:17:51.000000Z,ft.com,"[general, business]"


In [12]:
# Populate News to new fact table
with session_scope() as session:
    add_news_data_to_db(news_df, session)

News data added to DimensionNews table successfully.
