In [1]:
import json
import os
import re
import time
from datetime import datetime, date, timedelta

import pandas as pd
import requests
from newspaper import Article, Config

In [2]:
def get_query_dates(start_date=date(2021, 1, 1), end_date=date(2024, 2, 16)):
    """Return every calendar day from ``start_date`` to ``end_date``, inclusive.

    Args:
        start_date: First ``datetime.date`` in the range. Defaults to 2021-01-01.
        end_date: Last ``datetime.date`` in the range. Defaults to 2024-02-16.

    Returns:
        A list of ``datetime.date`` objects in ascending order. The list is
        empty when ``end_date`` is earlier than ``start_date``.
    """
    # A negative span yields an empty range, matching the old while-loop behaviour.
    day_span = (end_date - start_date).days
    return [start_date + timedelta(days=offset) for offset in range(day_span + 1)]

def format_date(date):
    """Convert an ISO-8601 timestamp string to ``YYYY-MM-DD HH:MM:SS``.

    The string is parsed with ``datetime.fromisoformat``; any UTC offset it
    carries is not included in the formatted result.
    """
    return datetime.fromisoformat(date).strftime("%Y-%m-%d %H:%M:%S")

def format_categories(categories):
    """Join the ``'name'`` value of each category mapping into one string.

    Names are separated by ``', '``; an empty iterable gives an empty string.
    """
    return ', '.join(entry['name'] for entry in categories)

def clean_title(title):
    """Drop a leading ``"Live updates |"`` marker and trim surrounding whitespace."""
    live_prefix = "Live updates |"
    if title.startswith(live_prefix):
        title = title[len(live_prefix):]
    return title.strip()

def clean_text(text):
    """Return ``text`` with every underscore character removed."""
    return "".join(text.split("_"))

def clean_author(raw_author_string):
    """Normalise a raw author byline into a title-cased, comma-separated name list.

    Args:
        raw_author_string: The byline text. Any non-string value (for
            example ``None`` or NaN) is treated as "no author".

    Returns:
        ``""`` when the input is not a string or contains an email address,
        a URL, or any digit. Otherwise the byline with a leading "by "
        removed, co-authors joined by ``", "``, anything after a ``|``
        dropped, surrounding whitespace trimmed, and ``str.title()`` applied.
    """
    # Guard first: re.search raises TypeError on None/NaN.
    if not isinstance(raw_author_string, str):
        return ""

    # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal '|'.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'
    url_pattern = r"(?i)\b(?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])\b"
    num_pattern = r'\d'

    if any(re.search(pattern, raw_author_string) for pattern in [email_pattern, url_pattern, num_pattern]):
        return ""

    # Strip only a leading "by ". A blanket replace("by ", "") also ate the
    # end of names such as "Bobby Smith" (-> "Bobsmith").
    author = re.sub(r'^\s*by\s+', '', raw_author_string.lower())

    if " and " in author or " & " in author:
        author_list = re.split(r'\s+and\s+|\s+&\s+', author)
        author = ", ".join(author_list)
    elif "|" in author:
        # Keep only the part before the first pipe (e.g. "name | outlet").
        author = author.split("|")[0]

    # strip() removes the whitespace left in front of a dropped "| ..." suffix.
    return author.strip().title()

In [3]:
# Scrape one page (up to 100 articles) per day from the Perigon API, fill in
# missing titles/bodies by downloading the article itself, and write the
# combined result to a CSV file.

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
PERIGON_ENDPOINT = "https://api.goperigon.com/v1/all"
API_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection blocks the run forever
OUTPUT_CSV = "offline-data-retrieved.csv"
OUTPUT_COLUMNS = [
    'date', 'author', 'title', 'article', 'url', 'section', 'publication',
    'imageURL', 'positive_score', 'negative_score', 'neutral_score',
]

# The API key comes from the environment so it is never stored in the notebook.
PERIGON_API_KEY = os.environ.get("PERIGON_API_KEY")
if not PERIGON_API_KEY:
    raise RuntimeError("Set the PERIGON_API_KEY environment variable before running this cell.")

day_count = 0
skipped_count = 0  # rows dropped because parsing or downloading raised
print("Start Time =", datetime.now().strftime("%H:%M:%S"))
print("\n")

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

# Rows are collected in a list and turned into a DataFrame once at the end;
# calling pd.concat for every row copies the whole frame each time.
records = []
dates = get_query_dates()

for queryDate in dates:
    query_params = [
        ("apiKey", PERIGON_API_KEY),
        ("from", str(queryDate)),
        ("to", str(queryDate)),
        ("sourceGroup", "top100"),
        ("paywall", "false"),
        ("excludeLabel", "Non-news"),
        ("excludeLabel", "Opinion"),
        ("excludeLabel", "Roundup"),
        ("sortBy", "relevance"),
        ("language", "en"),
        ("medium", "Article"),
        ("page", 0),
        ("size", 100),
    ]

    try:
        response = requests.get(PERIGON_ENDPOINT, params=query_params, timeout=API_TIMEOUT_SECONDS)
        response.raise_for_status()
        parsed_json = response.json()
        raw_df = pd.json_normalize(parsed_json['articles'])
    except Exception as e:
        # requests error messages embed the full URL; redact the key before printing.
        reason = str(e).replace(PERIGON_API_KEY, "***")
        print(f"Unable to retrieve articles for {queryDate}. Reason: {reason}")
        continue

    for _, row in raw_df.iterrows():
        try:
            url = row.get('url', '')
            title = row.get('title', '')
            text = row.get('content', '')
            imageURL = row.get('imageUrl', '')
            publication = row.get('source.domain', '')
            positive_score = row.get('sentiment.positive', '')
            negative_score = row.get('sentiment.negative', '')
            neutral_score = row.get('sentiment.neutral', '')

            # Null API fields arrive as None/NaN; treat them as empty so the
            # scraped article can fill them in instead of len() raising.
            if not isinstance(title, str):
                title = ''
            if not isinstance(text, str):
                text = ''

            # Named pub_date (not date) so the imported datetime.date class is
            # not overwritten, which would break get_query_dates on a re-run.
            pub_date = format_date(row['pubDate'])
            author = clean_author(row['authorsByline'])
            section = format_categories(row['categories'])

            article = Article(url=url, config=config)
            article.download()
            article.parse()

            if len(article.title) > len(title):
                title = clean_title(article.title)

            if len(text) == 0:
                text = clean_text(article.text)

            if len(title) == 0 or len(text) == 0:
                continue

            records.append({
                'date': pub_date,
                'author': author,
                'title': title,
                'article': text,
                'url': url,
                'section': section,
                'publication': publication,
                'imageURL': imageURL,
                'positive_score': positive_score,
                'negative_score': negative_score,
                'neutral_score': neutral_score,
            })
        except Exception:
            # Best effort: one bad row or failed download must not stop the run,
            # but keep a tally so silent data loss is visible afterwards.
            skipped_count += 1
            continue

    day_count += 1
    print(f"Retrieved articles for {queryDate}. Current output size: {len(records)}. Success day count: {day_count}")

output_df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)
output_df.to_csv(OUTPUT_CSV, index=False)

print(f"Rows skipped because of errors: {skipped_count}")
print("End Time =", datetime.now().strftime("%H:%M:%S"))
print("\n")

# Last expression of the cell, so the preview is actually displayed.
output_df.head()

Start Time = 02:02:15


Retrieved articles for 2021-01-01. Current output size: 60. Success day count: 1
Retrieved articles for 2021-01-02. Current output size: 119. Success day count: 2
Retrieved articles for 2021-01-03. Current output size: 190. Success day count: 3
Retrieved articles for 2021-01-04. Current output size: 249. Success day count: 4
Retrieved articles for 2021-01-05. Current output size: 309. Success day count: 5
Retrieved articles for 2021-01-06. Current output size: 345. Success day count: 6
Retrieved articles for 2021-01-07. Current output size: 400. Success day count: 7
Retrieved articles for 2021-01-08. Current output size: 461. Success day count: 8
Retrieved articles for 2021-01-09. Current output size: 528. Success day count: 9
Retrieved articles for 2021-01-10. Current output size: 587. Success day count: 10
Retrieved articles for 2021-01-11. Current output size: 648. Success day count: 11
Retrieved articles for 2021-01-12. Current output size: 696. Success day 