In [1]:
import os
import sys
import pandas as pd
from tqdm import tqdm
import requests
from sqlalchemy.orm import Session
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment
load_dotenv()

# News_API_KEYS is expected to be a comma-separated list of API tokens.
# Blank entries (e.g. from a trailing comma) are dropped: an empty token can never
# succeed and would not be rotated out by the key-switching logic.
News_API_KEYS = [
    key.strip()
    for key in os.environ.get('News_API_KEYS', '').split(',')
    if key.strip()
]
if not News_API_KEYS:
    # Fail here with a clear message instead of an AttributeError on None.split()
    raise RuntimeError(
        "Environment variable 'News_API_KEYS' is missing or empty; "
        "set it to a comma-separated list of API keys."
    )

# Put the main repo path (two directory levels above the current working directory)
# at the front of sys.path so the project modules imported below can be resolved
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from db.db_conn import engine, session_scope, ping_db
from db.models import DimensionOrganization, DimensionNews

In [2]:
# Presumably a connectivity check for the engine imported from db.db_conn (TODO confirm);
# the recorded run of this cell displayed True.
ping_db(engine)

True

In [3]:
def get_organisation(organisation: str, next_api_key_index: int, timeout: float = 30.0) -> "tuple[pd.DataFrame, int]":
    """Fetch English business news mentioning an organisation from api.thenewsapi.com.

    The request is attempted with News_API_KEYS[next_api_key_index]. When the API
    answers with HTTP 402 the next key in the list is tried; any other failure is
    printed and ends the attempt for this organisation.

    Args:
        organisation: Name searched for as an exact (quoted) phrase.
        next_api_key_index: Index into News_API_KEYS of the key to start with.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A tuple (news, key_index). `news` is a DataFrame built from the response's
        'data' list, or an empty DataFrame when the request failed or no key is
        left. `key_index` is the index the caller should pass on the next call; it
        equals len(News_API_KEYS) once every key has been rejected with 402.

    Raises:
        ValueError: If `organisation` is None.
    """
    if organisation is None:
        raise ValueError('No organisation provided.')

    # Define the API endpoint
    url = "https://api.thenewsapi.com/v1/news/all"

    while next_api_key_index < len(News_API_KEYS):
        # Reset on every attempt: the handler below must never look at an unbound
        # name (request failed before a response existed) or at the response of a
        # previous attempt.
        response = None
        try:
            params = {
                'api_token': News_API_KEYS[next_api_key_index],  # Use the current API key
                'language': 'en',
                'search': f'"{organisation}"',
                # Multi-value filters are sent as comma-separated strings; a Python
                # list would be encoded by requests as repeated query parameters.
                'search_fields': 'title,description',
                'categories': 'business',
                'published_after': '2024-01-01'
            }

            # Make the GET request
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()  # Raises an error for bad responses
            news_data = response.json().get('data', [])

            return pd.DataFrame(news_data), next_api_key_index

        except Exception as e:
            if response is not None and response.status_code == 402:
                print(f"API rate limit exceeded for key {next_api_key_index}. Switching to next API key.")
                next_api_key_index += 1
            else:
                print(f"Could not get news for {organisation} due to error: {e}")
                break

    return pd.DataFrame(), next_api_key_index  # Return empty DataFrame on error
    
def collect_news_data(session):
    """Collect news for every organisation stored in DimensionOrganization.

    Each organisation name is looked up through get_organisation(); the API key
    index it returns is carried over to the next organisation. Collection stops
    early once every key in News_API_KEYS has been used up, because no further
    request could succeed.

    Args:
        session: Database session used to query DimensionOrganization.

    Returns:
        A DataFrame with one row per collected article: the owning
        `organization_id` followed by the article fields listed in
        `article_fields`. Fields absent from the API response are None. If an
        error interrupts collection, the rows gathered so far are returned.
    """
    # Article attributes copied from each API result row, in output column order
    article_fields = (
        "uuid", "title", "description", "keywords", "snippet", "url",
        "image_url", "language", "published_at", "source", "categories",
    )
    news_data = []
    next_api_key_index = 0

    try:
        # Retrieve all organizations
        records = session.query(DimensionOrganization).all()
        progress = tqdm(records, total=len(records), desc="Collecting news data...")
        for position, record in enumerate(progress):
            # With no key left every remaining lookup would return nothing;
            # say so explicitly instead of silently iterating to the end.
            if next_api_key_index >= len(News_API_KEYS):
                print(f"All API keys are exhausted; stopping after {position} of {len(records)} organisations.")
                break

            news_df, next_api_key_index = get_organisation(record.organization_name, next_api_key_index)

            # Ensure we have data to insert
            if news_df.empty:
                continue

            for _, row in news_df.iterrows():
                # organization_id links the article to its organisation
                entry = {"organization_id": record.organization_id}
                entry.update((field, row.get(field)) for field in article_fields)
                news_data.append(entry)

    except Exception as e:
        # Best effort: report the problem and return whatever was collected
        print(e)
        print("Error occurred while collecting news data.")

    # Convert the list of dictionaries to a DataFrame
    return pd.DataFrame(news_data)

def add_news_data_to_db(news_df, session: Session):
    """Insert collected news rows into the DimensionNews table, skipping duplicates.

    A row is skipped when its uuid already exists in DimensionNews or when an
    earlier row of `news_df` with the same uuid has already been queued in this
    call. All new rows are committed together; on any error the session is
    rolled back and the error is printed instead of being raised.

    Args:
        news_df: DataFrame with an `organization_id` column plus the article
            columns read below (uuid, title, description, ...).
        session: SQLAlchemy session used for the lookups and the insert.
    """
    try:
        # uuids queued during this call. The database lookup alone only sees
        # pending rows if the session flushes before each query, so duplicates
        # inside news_df are tracked explicitly.
        queued_uuids = set()

        # Iterate over each row in the DataFrame and insert into DimensionNews table if not a duplicate
        for _, row in tqdm(news_df.iterrows(), total=len(news_df)):
            uuid = row.get("uuid")
            if uuid in queued_uuids:
                continue

            # Check if the news entry already exists using the uuid
            existing_entry = session.query(DimensionNews).filter_by(uuid=uuid).first()
            if existing_entry:
                continue

            news_entry = DimensionNews(
                organization_id=row["organization_id"],
                uuid=uuid,  # Storing uuid from API
                title=row.get("title"),
                description=row.get("description"),
                keywords=row.get("keywords"),
                snippet=row.get("snippet"),
                url=row.get("url"),
                image_url=row.get("image_url"),
                language=row.get("language"),
                published_at=row.get("published_at"),
                source=row.get("source"),
                categories=row.get("categories")
            )
            session.add(news_entry)  # Add each news item to the session
            queued_uuids.add(uuid)

        # Commit all entries at once after adding them to the session
        session.commit()
        print("News data added to DimensionNews table successfully.")

    except Exception as e:
        session.rollback()  # Rollback in case of error
        print(f"Error occurred while adding news data to database: {e}")

In [4]:
# Collect News Data
# collect_news_data() looks up every organisation through the news API, so this cell
# is slow and consumes API quota; the collected rows are kept in `news_df` for the
# cells below.
with session_scope() as session:
    news_df = collect_news_data(session)

Collecting news data...:   5%|▌         | 100/1890 [00:39<08:42,  3.43it/s]

API rate limit exceeded for key 0. Switching to next API key.


Collecting news data...:  11%|█         | 200/1890 [01:14<08:41,  3.24it/s]

API rate limit exceeded for key 1. Switching to next API key.


Collecting news data...:  16%|█▌        | 300/1890 [01:55<09:04,  2.92it/s]

API rate limit exceeded for key 2. Switching to next API key.


Collecting news data...:  21%|██        | 400/1890 [02:40<07:49,  3.17it/s]

API rate limit exceeded for key 3. Switching to next API key.


Collecting news data...:  26%|██▋       | 500/1890 [03:17<11:05,  2.09it/s]

API rate limit exceeded for key 4. Switching to next API key.


Collecting news data...:  32%|███▏      | 600/1890 [04:26<13:06,  1.64it/s]

API rate limit exceeded for key 5. Switching to next API key.


Collecting news data...:  37%|███▋      | 700/1890 [05:02<06:57,  2.85it/s]

API rate limit exceeded for key 6. Switching to next API key.


Collecting news data...: 100%|██████████| 1890/1890 [05:38<00:00,  5.59it/s]

API rate limit exceeded for key 7. Switching to next API key.





In [9]:
# Display the DataFrame summary and its first rows for verification.
# DataFrame.info() prints its report itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line to the output).
news_df.info()
news_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   organization_id  317 non-null    int64 
 1   uuid             317 non-null    object
 2   title            317 non-null    object
 3   description      317 non-null    object
 4   keywords         317 non-null    object
 5   snippet          317 non-null    object
 6   url              317 non-null    object
 7   image_url        317 non-null    object
 8   language         317 non-null    object
 9   published_at     317 non-null    object
 10  source           317 non-null    object
 11  categories       317 non-null    object
dtypes: int64(1), object(11)
memory usage: 29.8+ KB
None


Unnamed: 0,organization_id,uuid,title,description,keywords,snippet,url,image_url,language,published_at,source,categories
0,20,7b14fcda-d4f2-4014-bd3d-d13bbffa6e80,"Conway, Opto to Bring Vintage Private Markets ...",Opto Investments and Conway Investment Solutio...,,Conway Investment Solutions and private market...,https://www.wealthmanagement.com/alternative-i...,https://www.wealthmanagement.com/sites/wealthm...,en,2024-08-15T09:45:00.000000Z,wealthmanagement.com,[business]
1,20,3f9665b2-a17c-472e-8003-8aaecd18d22a,Mercer Expands Private Market Access with Laun...,Mercer built the Aspen Partners platform for q...,,"Mercer Advisors, a wealth management and finan...",https://www.wealthmanagement.com/alternative-i...,https://www.wealthmanagement.com/sites/wealthm...,en,2024-06-03T13:34:00.000000Z,wealthmanagement.com,[business]
2,20,ea1ef67a-77f8-4431-8980-e904bad7e0df,Polymatech Enters MENA with $16.2-million Bahr...,"Polymatech designs, manufactures, packages, an...","Polymatech, Bahrain, MENA, investment, microel...",Chennai-based opto-semiconductor chip maker Po...,https://economictimes.indiatimes.com/tech/fund...,"https://img.etimg.com/thumb/msid-113332826,wid...",en,2024-09-14T00:31:00.000000Z,economictimes.indiatimes.com,"[tech, business, general]"
3,28,73e54ac1-0e45-4fb9-acc1-3d39ac01f739,"AI-Powered Healthcare Apps: Benefits, Challeng...",This article offers unique perspectives on the...,Hemant Madaan,Hemant Madaan is CEO of JumpGrowth with 20+ ye...,https://www.forbes.com/sites/forbestechcouncil...,https://imageio.forbes.com/specials-images/ima...,en,2024-08-05T10:30:00.000000Z,forbes.com,"[tech, general, business]"
4,51,3aa857ca-9233-44cb-b69f-74fb2cf1854e,Munich-based Reverion raises €56 million for s...,"Reverion, a company building reversible, carbo...",,"Reverion, a company building reversible, carbo...",https://www.eu-startups.com/2024/09/munich-bas...,https://www.eu-startups.com/wp-content/uploads...,en,2024-09-13T07:23:41.000000Z,eu-startups.com,"[business, tech]"


In [10]:
# Insert the collected news into the DimensionNews table
# (rows whose uuid is already present in the table are skipped)
with session_scope() as session:
    add_news_data_to_db(news_df, session)

News data added to DimensionNews table successfully.
