In [None]:
# Descriptive Business Question: What are the top 10 most frequently occurring article titles across all scrapes?

In [None]:

import os
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load connection settings from a .env file (if present) into the process
# environment, then read the PostgreSQL settings from it.
load_dotenv()

PG_USER = os.getenv("PG_USER")
PG_PASSWORD = os.getenv("PG_PASSWORD")
PG_HOST = os.getenv("PG_HOST")
PG_PORT = os.getenv("PG_PORT")
PG_DB = os.getenv("PG_DB")

# Fail early with a readable message: os.getenv returns None for unset
# variables, which would otherwise be formatted into the URL as the literal
# text "None" and surface later as an obscure parsing/connection error.
missing_settings = [
    name
    for name, value in (
        ("PG_USER", PG_USER),
        ("PG_HOST", PG_HOST),
        ("PG_PORT", PG_PORT),
        ("PG_DB", PG_DB),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Create the engine.
# Credentials are percent-encoded so that characters such as "@", ":" or "/"
# inside the user name or password are not mistaken for URL delimiters.
# An unset password is sent as empty rather than as the string "None".
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(PG_USER)}:{quote_plus(PG_PASSWORD or '')}"
    f"@{PG_HOST}:{PG_PORT}/{PG_DB}"
)


In [None]:
# Top 10 titles by number of occurrences across all scrapes.
# Titles are compared case-insensitively with surrounding whitespace removed.
# RANK() gives tied frequencies the same rank, so the final ORDER BY adds
# explicit tie-breakers (most recently published first, then title) to make
# the 10 rows kept by LIMIT deterministic from run to run.
sql_query = '''
WITH title_counts AS (
    SELECT
        LOWER(TRIM(title)) AS cleaned_title,
        COUNT(*) AS frequency,
        MAX(published_at) AS most_recent
    FROM sql_project.techcrunch_articles
    GROUP BY LOWER(TRIM(title))
),
ranked_titles AS (
    SELECT *,
           RANK() OVER (ORDER BY frequency DESC) AS freq_rank
    FROM title_counts
)
SELECT *
FROM ranked_titles
ORDER BY freq_rank, most_recent DESC NULLS LAST, cleaned_title
LIMIT 10;
'''

df = pd.read_sql(sql_query, engine)
pd.set_option('display.max_rows', None)
df




In [None]:
# Diagnostic Business Question - Are the most frequently repeated article titles being published around the same time, possibly linked to specific news cycles or events?

In [4]:
# List every article whose normalized title (lower-cased, trimmed) occurs more
# than once, with the day it was published, how often the title occurs, and
# its chronological position (repeat_order) among articles sharing that title.
# The final sort uses the same normalized title that drives the grouping and
# the full timestamp rather than the truncated day, so case/whitespace
# variants of one title stay adjacent and rows follow repeat_order.
sql_query = '''
WITH title_counts AS (
    SELECT
        LOWER(TRIM(title)) AS cleaned_title,
        COUNT(*) AS frequency
    FROM sql_project.techcrunch_articles
    GROUP BY LOWER(TRIM(title))
    HAVING COUNT(*) > 1
),
repeated_articles AS (
    SELECT
        a.title,
        a.published_at,
        tc.frequency,
        DENSE_RANK() OVER (PARTITION BY LOWER(TRIM(a.title)) ORDER BY a.published_at) AS repeat_order
    FROM sql_project.techcrunch_articles a
    JOIN title_counts tc
        ON LOWER(TRIM(a.title)) = tc.cleaned_title
)
SELECT
    title,
    published_at::date AS publish_day,
    frequency,
    repeat_order
FROM repeated_articles
ORDER BY LOWER(TRIM(title)), published_at, title;
'''

df = pd.read_sql(sql_query, engine)
pd.set_option('display.max_rows', None)
df

Unnamed: 0,title,publish_day,frequency,repeat_order


In [None]:
# Descriptive --

#Insight: Repeated article titles suggest TechCrunch frequently updates or republishes key stories over time.

#Recommendation: Monitor these titles to track evolving narratives and editorial priorities.

#Prediction: These articles will likely resurface during major tech events, signaling trend continuity.

In [None]:
# Diagnostic --

# Insight: Repeated titles cluster around similar publish dates, suggesting editorial republishing cycles or coverage of ongoing stories.

# Recommendation: Monitor these repeat patterns to anticipate when certain stories may resurface and prepare related content or responses.

# Prediction: Repeated article topics will likely reappear during future high-traffic tech events (e.g., major product launches, IPOs).
