In [7]:
# Business Problem: From the Rapid API source, we want to understand what keywords or domains are most frequently scraped, and whether there are repeat content patterns (e.g., from specific authors, tags, or days). This can help prioritize monitoring for high-frequency sources or trending topics.

import os

import pandas as pd
import psycopg2
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
load_dotenv()  # Must run before the os.getenv calls so values from .env are visible

# Connection settings are read from the environment (optionally populated from .env).
db_user = os.getenv("PG_USER")
db_password = os.getenv("PG_PASSWORD")
db_host = os.getenv("PG_HOST")
db_port = os.getenv("PG_PORT")
db_name = os.getenv("PG_DB")

# Fail early with a readable message: an unset variable would otherwise be
# formatted into the connection string as the literal text "None" and only
# surface later as a confusing connection/parsing error.
_required_settings = {"PG_USER": db_user, "PG_HOST": db_host, "PG_DB": db_name}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Create connection
# URL.create takes each component separately and handles escaping itself, so
# credentials containing characters such as "@", ":" or "/" cannot corrupt the
# connection string the way plain f-string interpolation can.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=db_user,
        password=db_password,
        host=db_host,
        port=int(db_port) if db_port else None,  # None lets the driver use its default port
        database=db_name,
    )
)



In [10]:
# Descriptive

# Business Question: What are the most frequent article sources and how many articles does each source contribute?

# The query below counts rows per `keyword` in sql_project.api_keywords and keeps
# the 20 most frequent values.
# NOTE(review): the recorded output of this cell is
#   psycopg2.errors.UndefinedColumn: column "keyword" does not exist
# so this cell currently fails. Check the actual column names of
# sql_project.api_keywords and update the SELECT / GROUP BY accordingly.
# NOTE(review): the business question above is about article *sources*, while the
# query groups by *keyword* -- confirm which of the two is intended.
sql_query = '''
SELECT keyword, COUNT(*) AS frequency
FROM sql_project.api_keywords
GROUP BY keyword
ORDER BY frequency DESC
LIMIT 20;
'''


# Run the query through the shared SQLAlchemy engine and show the result as the
# cell output.
df = pd.read_sql(sql_query, engine)
df



ProgrammingError: (psycopg2.errors.UndefinedColumn) column "keyword" does not exist
LINE 2: SELECT keyword, COUNT(*) AS frequency
               ^

[SQL: 
SELECT keyword, COUNT(*) AS frequency
FROM sql_project.api_keywords
GROUP BY keyword
ORDER BY frequency DESC
LIMIT 20;
]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
# Diagnostic

# Business Question: Is there a pattern in how often each source posts over time?

# Query outline:
#   daily_counts - number of rows per source per calendar date of scraped_at.
#   ranked       - numbers each source's dates from the most recent (rank 1) backwards.
#   final SELECT - keeps the 5 most recent dates per source, ordered so that each
#                  source's days appear together, newest first. Without an ORDER BY
#                  the database is free to return the rows in any order.
# NOTE(review): the counts are keyed on scraped_at (when the row was scraped),
# which may differ from when the article was published -- confirm this is the
# intended notion of "how often each source posts".
sql_query = '''
WITH daily_counts AS (
    SELECT
        source,
        DATE(scraped_at) AS scrape_date,
        COUNT(*) AS daily_articles
    FROM sql_project.techcrunch_articles
    GROUP BY source, DATE(scraped_at)
),
ranked AS (
    SELECT *,
        RANK() OVER (PARTITION BY source ORDER BY scrape_date DESC) AS recency_rank
    FROM daily_counts
)
SELECT *
FROM ranked
WHERE recency_rank <= 5
ORDER BY source, scrape_date DESC;
'''
df = pd.read_sql(sql_query, engine)

# Lift the row limit for this one display only. option_context restores the
# previous setting on exit, so later cells are not switched to printing entire
# DataFrames. The frame must be rendered inside the `with` block (a bare trailing
# `df` would be rendered after the option has already been restored); `display`
# is the builtin that IPython/Jupyter kernels provide.
with pd.option_context('display.max_rows', None):
    display(df)
