In [2]:
import requests
import time
import json
import logging
import os
from bs4 import BeautifulSoup
from datetime import datetime, timezone


In [None]:
quotes_url = "https://quotes.toscrape.com"
github_api = "https://api.github.com/search/repositories"
user_agent = "kevin-atna/1.0" # this is to identify the script

- The assignment requires a configurable User-Agent header, so I added one to clearly identify the script and avoid request blocking.

In [None]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger("atna.log")


#### Logging Setup

logging for the script.

- It configures logging to show informational messages and above -- Is used instead of `print()`
- Each log message includes the time, log level, and message
- logger is used to track what the program is doing
- Helps to understand the program flow and debug issues easily


In [5]:
def get_with_retry(url, headers=None, params=None, retries=3):
    wait = 1
    for i in range(retries):
        try:
            r = requests.get(url, headers=headers, params=params, timeout=10)
            if r.status_code >= 500:
                raise Exception(f"Server error {r.status_code}")
            return r
        except Exception as e:
            logger.warning(f"Retry {i+1}/{retries} failed: {e}")
            if i == retries - 1:
                raise
            time.sleep(wait)
            wait *= 2


#### get_with_retry function is used to safely make an HTTP GET request.

- It tries to call the given URL
- If the request fails due to a network issue or server error, it retries
- It retries up to 3 times by default
- Between retries, it waits for some time and increases the wait each time
- This handles temporary issues like slow network or server downtime
- If all retries fail, the function shows an error
- If succeeds, it returns the response

In [None]:

def scrape_quotes():
    quotes = []
    url = quotes_url
    headers = {"User-Agent": user_agent}
    while url: #Starting a loop that runs as long as there is a valid page URL.
        r = get_with_retry(url, headers=headers)
        soup = BeautifulSoup(r.text, "html.parser") # fetching the html content of the page.
        for q in soup.select(".quote"): # Scraping the quote text, author, and author URL.
            quotes.append({
                "text": q.select_one(".text").get_text(strip=True),
                "author": q.select_one(".author").get_text(strip=True),
                "author_url": quotes_url + q.select_one("a")["href"]
            })
        next_btn = soup.select_one("li.next > a") # selecting the next button.
        url = quotes_url + next_btn["href"] if next_btn else None
    return quotes 


In [None]:
def fetch_github_repos():
    logger.info("Fetching GitHub repo")
    repos = []

    headers = {
        "User-Agent": user_agent,
        "Accept": "application/vnd.github.v3+json" #requesting for a json response
    }

    params = {
        "q": "language:python", # searches for python repositories
        "sort": "stars",
        "order": "desc",
        "per_page": 30,   # setting up safe limit to avoid rate limiting
        "page": 1
    }

    try:
        resp = get_with_retry(github_api, headers=headers, params=params)
        data = resp.json()
    except Exception:
        logger.error("GitHub API request failed")
        return repos

    items = data.get("items", [])
    if not items:
        logger.warning("No GitHub repositories returned (rate limit)")
        return repos

    for repo in items:
        repos.append({
            "name": repo.get("name"),
            "owner": repo.get("owner", {}).get("login"),
            "stars": repo.get("stargazers_count", 0),
            "url": repo.get("html_url")
        })

    return repos


In [24]:
import time
start = time.time()
quotes = scrape_quotes()
repos = fetch_github_repos()

output = {
    "quotes": quotes,
    "github_repos": repos,
    "meta": {
        "run_time": datetime.now(timezone.utc).isoformat(),
        "total_quotes": len(quotes),
        "total_repos": len(repos),
        "failures": 0
    }
}

len(quotes), len(repos)


2026-02-10 12:51:24,932 | INFO | Fetching GitHub repo


(100, 30)

In [25]:
output

{'quotes': [{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
   'author': 'Albert Einstein',
   'author_url': 'https://quotes.toscrape.com/author/Albert-Einstein'},
  {'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
   'author': 'J.K. Rowling',
   'author_url': 'https://quotes.toscrape.com/author/J-K-Rowling'},
  {'text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
   'author': 'Albert Einstein',
   'author_url': 'https://quotes.toscrape.com/author/Albert-Einstein'},
  {'text': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
   'author': 'Jane Austen',
   'author_url': 'https://quotes.toscrape.com/author/Jane-Austen'},
  {'text': "“Imperfection is beauty, madness is genius and it's better to be absolutely ridicu

In [27]:
with open("final_output.json", "w") as f:
    json.dump(output, f, indent=2)

print("Saved test_output.json")


Saved test_output.json
