In [8]:
#Business Question: What are the top 10 most frequently occurring article titles across all scrapes?

In [13]:

import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

# Load env variables
load_dotenv()

PG_USER = os.getenv("PG_USER")
PG_PASSWORD = os.getenv("PG_PASSWORD")
PG_HOST = os.getenv("PG_HOST")
PG_PORT = os.getenv("PG_PORT")
PG_DB = os.getenv("PG_DB")

# Create the engine
engine = create_engine(
    f"postgresql+psycopg2://{PG_USER}:{PG_PASSWORD}@{PG_HOST}:{PG_PORT}/{PG_DB}"
)


In [14]:
sql_query = '''
WITH title_counts AS (
    SELECT
        LOWER(TRIM(title)) AS cleaned_title,
        COUNT(*) AS frequency,
        MAX(published_at) AS most_recent
    FROM sql_project.techcrunch_articles
    GROUP BY LOWER(TRIM(title))
),
ranked_titles AS (
    SELECT *,
           RANK() OVER (ORDER BY frequency DESC) AS freq_rank
    FROM title_counts
)
SELECT *
FROM ranked_titles
ORDER BY freq_rank
LIMIT 10;
'''

df = pd.read_sql(sql_query, engine)
pd.set_option('display.max_rows', None)
df


Unnamed: 0,cleaned_title,frequency,most_recent,freq_rank
0,skype shuts down after 23 years,1,NaT,1
1,epic games and spotify test apple’s new app st...,1,2025-05-09,1
2,microsoft employees are banned from using deep...,1,2025-05-08,1
3,hugging face releases a free operator-like age...,1,2025-05-06,1
4,"ai has opened a new era in venture capital, ac...",1,NaT,1
5,one of elon musk’s longtime vcs is suing his f...,1,2025-05-08,1
6,is duolingo the face of an ai jobs crisis?,1,2025-05-04,1
7,"alphabet earnings live updates: ai, gemini 2.0...",1,NaT,1
8,the department of labor just dropped its inves...,1,2025-05-09,1
9,"google i/o 2025: what to expect, including upd...",1,2025-05-09,1


In [15]:
#Insight: Repeated article titles suggest TechCrunch frequently updates or republishes key stories over time.
#Recommendation: Monitor these titles to track evolving narratives and editorial priorities.
#Prediction: These articles will likely resurface during major tech events, signaling trend continuity.