In [8]:
# Business question: Which 10 article titles occur most frequently across all scrapes?

In [2]:

import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

PG_USER = os.getenv("PG_USER")
PG_PASSWORD = os.getenv("PG_PASSWORD")
PG_HOST = os.getenv("PG_HOST")
PG_PORT = os.getenv("PG_PORT")
PG_DB = os.getenv("PG_DB")

# Fail fast with a clear message: an unset variable would otherwise be
# interpolated into the URL as the literal string "None" and only surface
# later as a confusing connection error.
_missing_pg_vars = [
    name
    for name, value in {
        "PG_USER": PG_USER,
        "PG_PASSWORD": PG_PASSWORD,
        "PG_HOST": PG_HOST,
        "PG_PORT": PG_PORT,
        "PG_DB": PG_DB,
    }.items()
    if value is None
]
if _missing_pg_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_pg_vars)
    )

# Create the engine.
# The user name and password are percent-encoded so that reserved characters
# such as '@', ':', '/' or '#' cannot corrupt the connection URL.
engine = create_engine(
    f"postgresql+psycopg2://{quote(PG_USER, safe='')}:{quote(PG_PASSWORD, safe='')}"
    f"@{PG_HOST}:{PG_PORT}/{PG_DB}"
)


In [3]:
# Count how often each normalised title (trimmed, lower-cased) occurs and keep
# the 10 highest-ranked ones.
#  - NULL / blank titles are excluded so they cannot show up as a "title".
#  - RANK() gives tied frequencies the same freq_rank, so cleaned_title (unique
#    per group) is used as a tie-breaker to make LIMIT 10 deterministic.
sql_query = '''
WITH title_counts AS (
    SELECT
        LOWER(TRIM(title)) AS cleaned_title,
        COUNT(*) AS frequency,
        MAX(published_at) AS most_recent
    FROM sql_project.techcrunch_articles
    WHERE title IS NOT NULL
      AND TRIM(title) <> ''
    GROUP BY LOWER(TRIM(title))
),
ranked_titles AS (
    SELECT *,
           RANK() OVER (ORDER BY frequency DESC) AS freq_rank
    FROM title_counts
)
SELECT *
FROM ranked_titles
ORDER BY freq_rank, cleaned_title
LIMIT 10;
'''

df = pd.read_sql(sql_query, engine)

# Show every row when the frame is rendered (this is a global pandas option).
pd.set_option('display.max_rows', None)

# Export to CSV; create the output directory first so a fresh checkout
# without a reports/ folder does not fail.
os.makedirs('reports', exist_ok=True)
df.to_csv('reports/techcrunch_top_titles.csv', index=False)  # For TechCrunch

# Keep the frame as the cell's last expression so the notebook displays it.
df


In [15]:
# Insight: Titles that repeat across scrapes suggest TechCrunch frequently updates or republishes key stories over time; note that a title can also repeat simply because the same article was captured in more than one scrape.
# Recommendation: Monitor these titles to track evolving narratives and editorial priorities.
# Prediction: These articles will likely resurface during major tech events, signaling trend continuity.