# Gather News via API

This notebook gathers the most recent week of US tech news via NewsAPI.  
Credentials are stored in a JSON file outside the code for security reasons.  
Because of the limits of the NewsAPI free plan, I use simple pagination.  
The articles are stored in an Azure SQL database.

In [1]:
import datetime
import json
from configparser import ConfigParser
from urllib.parse import quote

import pandas as pd
import sqlalchemy
from newsapi.newsapi_client import NewsApiClient
from sqlalchemy import create_engine
from tqdm.notebook import tqdm_notebook
#from tqdm import tqdm

## Extract from API

In [None]:
# Load credentials
def load_credentials(path='credentials.json'):
    """Load client credentials from a JSON file.

    Args:
        path: Location of the JSON credentials file.

    Returns:
        The parsed JSON content, or ``None`` when the file does not exist or
        cannot be decoded as JSON. An error message is printed in both
        failure cases.
    """
    try:
        # JSON is UTF-8 by specification; read it explicitly as UTF-8 instead
        # of relying on the platform's locale encoding.
        with open(path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: The file '{path}' does not exist.")
        return None
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Bytes that are not valid UTF-8 are reported the same way as
        # malformed JSON: the file is not usable as a credentials file.
        print(f"Error: The file '{path}' is not a valid JSON.")
        return None

In [None]:
# load_credentials() returns None when the file is missing or invalid; stop
# here with a clear message instead of failing later on a None lookup.
credentials = load_credentials()
if credentials is None:
    raise RuntimeError("Credentials could not be loaded; see the error message printed above.")

In [None]:
# Pull the NewsAPI key out of the loaded credentials (stored under 'api_key1')
api_key1 = credentials['api_key1']

In [4]:
# Create the NewsAPI client, authenticating with the key read from the credentials
newsapi = NewsApiClient(api_key1)

In [5]:
# Get list of sources, restricted to language 'en', country 'us' and category 'technology'
sources = newsapi.get_sources(language='en',country='us',category='technology')

In [6]:
# Unnest json: flatten the records under the 'sources' key into one row per source
sources_df = pd.json_normalize(sources,record_path=['sources'])

In [7]:
# Preview the first rows of the sources table
sources_df.head()

Unnamed: 0,id,name,description,url,category,language,country
0,ars-technica,Ars Technica,The PC enthusiast's resource. Power users and ...,https://arstechnica.com,technology,en,us
1,crypto-coins-news,Crypto Coins News,Providing breaking cryptocurrency news - focus...,https://www.ccn.com,technology,en,us
2,engadget,Engadget,Engadget is a web magazine with obsessive dail...,https://www.engadget.com,technology,en,us
3,hacker-news,Hacker News,Hacker News is a social news website focusing ...,https://news.ycombinator.com,technology,en,us
4,recode,Recode,"Get the latest independent tech news, reviews ...",http://www.recode.net,technology,en,us


In [8]:
# API call parameters
# Take one snapshot of "today" so both bounds are derived from the same day,
# even if this cell happens to run across midnight.
# NOTE(review): the notebook intro speaks of the "most recent week", while this
# window spans from 5 days ago through yesterday — confirm which is intended.
today = datetime.date.today()
from_date = (today - datetime.timedelta(days=5)).isoformat()  # window start, YYYY-MM-DD
to_date = (today - datetime.timedelta(days=1)).isoformat()    # window end (yesterday), YYYY-MM-DD
pages = range(1, 6)  # result pages 1 through 5

In [9]:
# Get 1 page of news
def get_news(p):
    """Fetch one result page of articles for every source in ``sources_df``.

    Relies on the notebook-level names ``newsapi``, ``sources_df``,
    ``from_date`` and ``to_date``.

    Args:
        p: Page number passed to ``newsapi.get_everything``.

    Returns:
        A DataFrame holding the flattened ``articles`` records of all sources,
        re-indexed from 0. An empty DataFrame is returned when ``sources_df``
        contains no sources.
    """
    frames = []
    for source_id in sources_df['id']:
        response = newsapi.get_everything(sources=source_id,
                                          page=p,
                                          from_param=from_date,
                                          to=to_date,
                                          language='en',
                                          sort_by='popularity')
        # Flatten the nested article records (e.g. source -> source.id / source.name).
        frames.append(pd.json_normalize(response, record_path=['articles']))
    if not frames:
        # pd.concat raises on an empty list; an empty frame is the natural result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [10]:
# Get multiple pages of news
def get_news_multiple_pages(pages):
    """Fetch several result pages and combine them into a single DataFrame.

    Args:
        pages: Iterable of page numbers; each one is passed to ``get_news``.
            It is consumed exactly once, so one-shot iterators and generators
            are supported as well as ranges and lists.

    Returns:
        A DataFrame with the rows of all pages in iteration order, re-indexed
        from 0. An empty DataFrame is returned when ``pages`` is empty.
    """
    # Collect while iterating (with a progress bar) rather than looping over
    # `pages` a second time, which would find an iterator already exhausted.
    frames = [get_news(p) for p in tqdm_notebook(pages)]
    if not frames:
        # pd.concat raises on an empty list; an empty frame is the natural result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [11]:
# Fetch every page in `pages` for all sources and combine them into one DataFrame
news_df = get_news_multiple_pages(pages)

  0%|          | 0/5 [00:00<?, ?it/s]

In [12]:
# Preview the fetched articles (rows titled '[Removed]' are filtered out before saving)
news_df.head()

Unnamed: 0,author,title,description,url,urlToImage,publishedAt,content,source.id,source.name
0,,[Removed],[Removed],https://removed.com,,2025-01-10T16:30:20Z,[Removed],,[Removed]
1,,[Removed],[Removed],https://removed.com,,2025-01-10T19:28:42Z,[Removed],,[Removed]
2,,[Removed],[Removed],https://removed.com,,2025-01-10T20:40:31Z,[Removed],,[Removed]
3,,[Removed],[Removed],https://removed.com,,2025-01-10T13:45:14Z,[Removed],,[Removed]
4,,[Removed],[Removed],https://removed.com,,2025-01-10T17:15:58Z,[Removed],,[Removed]


## Save in database

In [13]:
# SQL Server connection details, taken from the same loaded credentials as the API key
server = credentials['server']
database = credentials['database']
username = credentials['username']
password = credentials['password']

In [14]:
# Build the SQLAlchemy connection URL for the pyodbc driver.
# The username and password are percent-encoded so that characters such as
# '@', ':', '/' or '%' inside them cannot be mistaken for URL delimiters.
conn_str = (
    f'mssql+pyodbc://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{server}/{database}?driver=ODBC+Driver+18+for+SQL+Server'
)

In [15]:
# Connect to database
engine = create_engine(conn_str)

In [18]:
# Append new entries to news, drop duplicates
# Load everything already stored so old and new rows can be de-duplicated together.
# NOTE(review): presumably the 'news' table must already exist for this SELECT to succeed — confirm for a first run.
existing_df = pd.read_sql_query('SELECT * FROM news', engine)
# drop_duplicates() is called without a subset, so only rows identical in every column are removed.
merged_df = pd.concat([existing_df, news_df]).drop_duplicates()
# Discard placeholder articles whose title is '[Removed]'.
merged_df = merged_df[merged_df['title'] != '[Removed]']
# NOTE(review): if_exists='replace' rewrites the whole 'news' table on every run
# instead of inserting only the new rows — confirm this is intended.
merged_df.to_sql('news', con=engine, if_exists='replace', index=False)

217

## Read from database

In [19]:
# Read from the database and turn into dataframe
# Both names are rebound here: from this point on news_df and sources_df hold
# the database contents rather than the API results fetched earlier.
news_df = pd.read_sql_query('select * from "news"',con=engine)
# NOTE(review): no visible cell writes a 'sources' table — presumably it was created elsewhere; confirm.
sources_df = pd.read_sql_query('select * from "sources"',con=engine)

In [20]:
# Preview the articles as read back from the database
news_df.head()

Unnamed: 0,author,title,description,url,urlToImage,publishedAt,content,source.id,source.name
0,,Openlayer (YC S21) is looking for a customer e...,About us\nOpenlayer is solving the AI reliabil...,https://www.ycombinator.com/companies/openlaye...,https://www.ycombinator.com/images/original/mi...,2024-12-19T21:00:59Z,About us\r\nOpenlayer is solving the AI reliab...,hacker-news,Hacker News
1,,Ocular AI (YC W24) Is Hiring,Ocular AI\nOcular AI is the data annotation en...,https://www.ycombinator.com/companies/ocular-a...,https://www.ycombinator.com/images/original/mi...,2024-12-26T17:33:54Z,Ocular AI\r\nOcular AI is the data annotation ...,hacker-news,Hacker News
2,cothrun,Decoding the telephony signals in Pink Floyd's...,,https://news.ycombinator.com/item?id=42485795,,2024-12-24T05:47:01Z,(Author here) No contradiction.I think what ha...,hacker-news,Hacker News
3,pmigdal,Show HN: I made a website to semantically sear...,,https://news.ycombinator.com/item?id=42507116,,2024-12-26T05:47:01Z,Thank you for the appreciation and great feedb...,hacker-news,Hacker News
4,lilulo,Build a Low-Cost Drone Using ESP32 | Hacker News,,https://news.ycombinator.com/item?id=42498648,,2024-12-25T05:46:58Z,You can start for ~$400 - I just bought parts ...,hacker-news,Hacker News


In [21]:
# Preview the sources as read back from the database
sources_df.head()

Unnamed: 0,id,name,description,url,category,language,country
0,ars-technica,Ars Technica,The PC enthusiast's resource. Power users and ...,https://arstechnica.com,technology,en,us
1,crypto-coins-news,Crypto Coins News,Providing breaking cryptocurrency news - focus...,https://www.ccn.com,technology,en,us
2,engadget,Engadget,Engadget is a web magazine with obsessive dail...,https://www.engadget.com,technology,en,us
3,hacker-news,Hacker News,Hacker News is a social news website focusing ...,https://news.ycombinator.com,technology,en,us
4,recode,Recode,"Get the latest independent tech news, reviews ...",http://www.recode.net,technology,en,us


In [22]:
# Summarise the columns, dtypes and non-null counts of the stored articles
news_df.info()
# Row count at the last recorded run: 2547

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2547 entries, 0 to 2546
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       2531 non-null   object
 1   title        2547 non-null   object
 2   description  2510 non-null   object
 3   url          2547 non-null   object
 4   urlToImage   2432 non-null   object
 5   publishedAt  2547 non-null   object
 6   content      2547 non-null   object
 7   source.id    2547 non-null   object
 8   source.name  2547 non-null   object
dtypes: object(9)
memory usage: 179.2+ KB
