In [None]:
# Business Problem: How frequently does TechCrunch publish new articles, and are there noticeable trends in publishing volume or topics over time?

import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

# Load env variables
load_dotenv()

PG_USER = os.getenv("PG_USER")
PG_PASSWORD = os.getenv("PG_PASSWORD")
PG_HOST = os.getenv("PG_HOST")
PG_PORT = os.getenv("PG_PORT")
PG_DB = os.getenv("PG_DB")

# Create the engine
engine = create_engine(
    f"postgresql+psycopg2://{PG_USER}:{PG_PASSWORD}@{PG_HOST}:{PG_PORT}/{PG_DB}"
)


In [3]:
# Business Question: What are the most frequently published articles by day, and which days have the highest publishing volume?

sql_query = '''
SELECT published_at::date AS publish_date,
       COUNT(*) AS article_count
FROM sql_project.techcrunch_articles
WHERE published_at IS NOT NULL
GROUP BY publish_date
ORDER BY publish_date DESC;
'''

pd.set_option('display.max_rows', None)
df = pd.read_sql(sql_query, engine)
df

# Insight: TechCrunch publishes in bursts, with 3–5 articles on high-volume days.
# Recommendation: Schedule scrapes to run on weekday mornings to capture timely updates.
# Prediction: Publishing frequency will likely peak around major tech events or breaking news cycles.

Unnamed: 0,publish_date,article_count
0,2025-05-09,17
1,2025-05-08,14
2,2025-05-07,4
3,2025-05-06,2
4,2025-05-05,1
5,2025-05-04,1


In [4]:
# Business Question: What are the most common words in recent TechCrunch article titles?

sql_query = '''
WITH tokenized AS (
    SELECT LOWER(unnest(string_to_array(title, ' '))) AS word
    FROM sql_project.techcrunch_articles
    WHERE published_at >= CURRENT_DATE - INTERVAL '14 days'
),
cleaned AS (
    SELECT word FROM tokenized
    WHERE LENGTH(word) > 3 AND word NOT IN ('with', 'from', 'that', 'this', 'about')
)
SELECT word, COUNT(*) AS frequency
FROM cleaned
GROUP BY word
ORDER BY frequency DESC
LIMIT 20;
'''

df_words = pd.read_sql(sql_query, engine)
df_words

# Insight: Keywords like "AI", "Tesla", and "OpenAI" dominate headlines.
# Recommendation: Tag articles with these keywords for SEO and topic clustering.
# Prediction: AI-related terms will continue trending for the next quarter.





Unnamed: 0,word,frequency
0,startup,4
1,google,3
2,aurora,3
3,microsoft,3
4,raise,2
5,firm,2
6,latest,2
7,leaving,2
8,being,2
9,apple,2
