In [3]:
import os

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Load credentials from the environment so no secrets are stored in the notebook.
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, which would otherwise end up in the connection settings as the
# literal text "None" and surface later as a confusing connection error.
missing_vars = [
    name
    for name, value in (("DB_USER", db_user), ("DB_HOST", db_host), ("DB_NAME", db_name))
    if not value
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_vars)}"
    )

# Connect to Postgres. URL.create takes each component separately, so a
# password containing characters such as '@', '/' or ':' cannot corrupt the URL.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=db_user,
        password=db_password,
        host=db_host,
        # Leave the port unset when DB_PORT is not provided so the driver
        # falls back to its default.
        port=int(db_port) if db_port else None,
        database=db_name,
    )
)

# See all rows if needed
pd.set_option('display.max_rows', None)

### Business Question:
Which Snap job categories offer both high salaries and high public engagement on Reddit?

Combining job postings and public sentiment helps identify roles where Snap is investing the most money and generating the most conversation.

In [4]:
from sqlalchemy import text
import pandas as pd

# Salary vs. Reddit engagement per job category.
# The query tags each job posting with a broad category, averages the upper
# salary bound per category, and joins that to per-tag Reddit aggregates.
query = text("""
-- Extract numeric salary bounds from raw strings such as '$120,000 - $180,000'.
-- NULLIF guards each cast, because a salary with a single figure (no '-') or
-- with no digits becomes '' after stripping, and CAST('' AS INT) would abort
-- the whole query. When no upper bound is present, fall back to the lower one.
WITH extracted_salaries AS (
    SELECT *,
        CAST(NULLIF(REGEXP_REPLACE(SPLIT_PART(salary, '-', 1), '[^0-9]', '', 'g'), '') AS INT) AS min_salary,
        COALESCE(
            CAST(NULLIF(REGEXP_REPLACE(SPLIT_PART(salary, '-', 2), '[^0-9]', '', 'g'), '') AS INT),
            CAST(NULLIF(REGEXP_REPLACE(SPLIT_PART(salary, '-', 1), '[^0-9]', '', 'g'), '') AS INT)
        ) AS max_salary
    FROM fact_indeed_jobs
    WHERE salary IS NOT NULL AND salary != ''
),

-- Categorize job titles into broad focus areas.
-- 'ai' is matched as a standalone word, because a plain substring match would
-- also tag titles such as 'Retail', 'Campaign' or 'Maintenance' as AI roles.
job_tagged AS (
    SELECT *,
        CASE
            WHEN LOWER(job_title) LIKE '%machine learning%'
              OR LOWER(job_title) ~ '(^|[^a-z])ai([^a-z]|$)' THEN 'AI roles'
            WHEN LOWER(job_title) LIKE '%marketing%' THEN 'marketing'
            ELSE 'other'
        END AS tag
    FROM extracted_salaries
),

-- Aggregate salary stats by tag
job_stats AS (
    SELECT
        tag,
        COUNT(*) AS job_count,
        ROUND(AVG(max_salary)) AS avg_max_salary
    FROM job_tagged
    GROUP BY tag
),

-- Aggregate Reddit engagement by tag.
-- topic_tag is a comma-separated list, so each element is trimmed and
-- lower-cased before grouping. Without this, ' marketing' would never match
-- 'marketing', and case variants of one tag would form separate groups that
-- duplicate job rows in the join below.
reddit_tagged AS (
    SELECT
        LOWER(TRIM(t.raw_tag)) AS tag,
        AVG(p.score) AS avg_score,
        AVG(p.num_comments) AS avg_comments,
        COUNT(*) AS post_count
    FROM reddit_posts p
    CROSS JOIN LATERAL UNNEST(STRING_TO_ARRAY(p.topic_tag, ',')) AS t(raw_tag)
    WHERE p.topic_tag IS NOT NULL
      AND TRIM(t.raw_tag) != ''
    GROUP BY LOWER(TRIM(t.raw_tag))
),

-- Join both sources.
-- LEFT JOIN keeps job categories that have no Reddit posts. COALESCE turns
-- their missing values into 0 so the heat index is 0 instead of NULL.
joined_view AS (
    SELECT
        j.tag,
        j.job_count,
        j.avg_max_salary,
        r.post_count,
        r.avg_score,
        r.avg_comments,
        ROUND(COALESCE(j.avg_max_salary, 0) * COALESCE(r.post_count, 0) * COALESCE(r.avg_score, 0) / 1000.0, 1) AS sentiment_heat_index,
        RANK() OVER (ORDER BY COALESCE(j.avg_max_salary, 0) * COALESCE(r.post_count, 0) * COALESCE(r.avg_score, 0) DESC) AS heat_rank
    FROM job_stats j
    LEFT JOIN reddit_tagged r ON LOWER(j.tag) = r.tag
)

SELECT * FROM joined_view
ORDER BY heat_rank;
""")

df_realign = pd.read_sql(query, engine)
pd.set_option("display.max_rows", None)
df_realign


Unnamed: 0,tag,job_count,avg_max_salary,post_count,avg_score,avg_comments,sentiment_heat_index,heat_rank
0,AI roles,7,323143.0,500.0,599.184,115.428,96811057.7,1
1,other,2,264500.0,156.0,308.416667,129.967949,12725888.5,2
2,marketing,1,235000.0,,,,0.0,3


**Insight:**  
Snap’s highest-paying job category — “AI roles,” with an average max salary of $323K across 7 postings — also dominates Reddit engagement. Posts tagged with this category total 500, averaging a score of 599 and about 115 comments per post. This shows clear public attention aligned with Snap’s investment priorities. Marketing, represented by a single well-paid posting ($235K), has no matching Reddit posts in this dataset, so it shows no measurable online traction.

**Recommendation:**  
Snap should continue focusing its hiring and public communication on its AI initiatives. The strong overlap between salary and public buzz suggests these roles are high-profile. However, for areas with little online presence, such as marketing, Snap may need to re-evaluate its messaging and job visibility, or consider cross-promoting open roles on platforms where tech workers gather.

**Prediction:**  
If Snap keeps prioritizing AI/ML roles, online visibility and reputation in this space will remain strong. But failure to build similar visibility for other departments could lead to hiring slowdowns or perception gaps, especially for roles that lack buzz or aren’t talked about in key online communities.

----------------------------------------------------------------------------------------------------

### Business Question:
How does job seniority level relate to salary competitiveness and spread at Snap?

Understanding how salaries change across levels like "Senior", "Staff", and "Principal" helps gauge Snap’s investment in leadership.

In [2]:
from sqlalchemy import text
import pandas as pd

# Salary level and spread per seniority band.
# The query labels each posting by seniority keyword, aggregates salary bounds
# and spread per level, then compares each level with the next-lower-paid one.
query = text("""
-- Step 1: Extract min/max salary from raw strings such as '$120,000 - $180,000'.
-- NULLIF guards each cast, because a salary with a single figure (no '-') or
-- with no digits becomes '' after stripping, and CAST('' AS INT) would abort
-- the whole query. When no upper bound is present, fall back to the lower one.
WITH cleaned_salaries AS (
    SELECT *,
        CAST(NULLIF(REGEXP_REPLACE(SPLIT_PART(salary, '-', 1), '[^0-9]', '', 'g'), '') AS INT) AS min_salary,
        COALESCE(
            CAST(NULLIF(REGEXP_REPLACE(SPLIT_PART(salary, '-', 2), '[^0-9]', '', 'g'), '') AS INT),
            CAST(NULLIF(REGEXP_REPLACE(SPLIT_PART(salary, '-', 1), '[^0-9]', '', 'g'), '') AS INT)
        ) AS max_salary
    FROM fact_indeed_jobs
    WHERE salary IS NOT NULL AND salary != ''
),

-- Step 2: Assign seniority levels.
-- Branch order matters, since the first matching keyword wins: a title
-- containing both 'principal' and 'senior' is labeled 'Principal'.
labeled_roles AS (
    SELECT *,
        CASE
            WHEN LOWER(job_title) LIKE '%principal%' THEN 'Principal'
            WHEN LOWER(job_title) LIKE '%staff%' THEN 'Staff'
            WHEN LOWER(job_title) LIKE '%senior%' THEN 'Senior'
            ELSE 'Other'
        END AS seniority
    FROM cleaned_salaries
),

-- Step 3: Aggregate salary stats by seniority
seniority_aggregates AS (
    SELECT
        seniority,
        COUNT(*) AS role_count,
        ROUND(AVG(min_salary)) AS avg_min_salary,
        ROUND(AVG(max_salary)) AS avg_max_salary,
        ROUND(AVG(max_salary - min_salary)) AS avg_spread,
        MAX(max_salary - min_salary) AS max_spread,
        MIN(max_salary - min_salary) AS min_spread
    FROM labeled_roles
    GROUP BY seniority
),

-- Step 4: Use window functions to compare avg max salary between levels.
-- LAG is ordered by ascending salary, so prev_avg_max_salary is the value of
-- the next-lower-paid level. The lowest-paid level has no predecessor and
-- gets NULL.
ranked_salary AS (
    SELECT *,
        RANK() OVER (ORDER BY avg_max_salary DESC) AS salary_rank,
        LAG(avg_max_salary) OVER (ORDER BY avg_max_salary) AS prev_avg_max_salary
    FROM seniority_aggregates
)

-- percent_increase is the relative increase over the next-lower-paid level.
-- NULLIF avoids a division by zero.
SELECT
    seniority,
    role_count,
    avg_min_salary,
    avg_max_salary,
    avg_spread,
    max_spread,
    min_spread,
    ROUND(
        100.0 * (avg_max_salary - prev_avg_max_salary) / NULLIF(prev_avg_max_salary, 0),
        1
    ) AS percent_increase,
    salary_rank
FROM ranked_salary
ORDER BY salary_rank;
""")

df_q1 = pd.read_sql(query, engine)
pd.set_option("display.max_rows", None)
df_q1


Unnamed: 0,seniority,role_count,avg_min_salary,avg_max_salary,avg_spread,max_spread,min_spread,percent_increase,salary_rank
0,Principal,2,235000.0,414000.0,179000.0,179000,179000,20.7,1
1,Staff,2,195000.0,343000.0,148000.0,148000,148000,27.0,2
2,Senior,1,162000.0,270000.0,108000.0,108000,108000,8.7,3
3,Other,5,141000.0,248400.0,107400.0,135000,76000,,4


**Insight:**  
Principal-level roles at Snap average a $414K max salary with a $179K spread — the highest pay and the widest spread of any level. Staff roles follow at $343K with a $148K spread. Senior roles average $270K, with a 27% jump to Staff and a further 21% jump to Principal. These figures rest on only one or two postings per level, so treat them as indicative rather than definitive. Understanding how salaries shift across levels reveals Snap’s investment in leadership — wider spreads may reflect flexibility, but may also signal unclear role scope.

**Recommendation:**  
Snap should clarify expectations and compensation structure at each seniority level. Transparency on salary bands and role responsibilities will reduce candidate uncertainty and build trust during negotiations.

**Prediction:**  
Without clearer benchmarks, senior candidates may second-guess offer value or role clarity. As Snap scales its AI/ML efforts, structured compensation communication will be essential to attract and close top-tier leadership.
