In [5]:
# Cell 1: imports
from jobspy import scrape_jobs
import pandas as pd
from datetime import datetime


In [8]:
# Cell 2: small helper so we don't repeat code

def scrape_role_state(
    role_name: str,
    state_name: str,
    results: int = 400,
    site_names=("indeed",),
) -> "pd.DataFrame":
    """
    Scrape job postings for one role in one state and label every row.

    The search is delegated to jobspy's ``scrape_jobs``. By default only
    Indeed is queried (the value this helper has always used); pass
    ``site_names`` to search other boards supported by jobspy.

    Parameters
    ----------
    role_name : str
        Search term, e.g. "data scientist" or "product manager".
    state_name : str
        Location string handed to the scraper, e.g. "California",
        "New York", "Texas".
    results : int, default 400
        Forwarded to ``scrape_jobs`` as ``results_wanted``.
    site_names : str or iterable of str, default ("indeed",)
        Job board name(s) forwarded to ``scrape_jobs`` as ``site_name``.
        A single string is treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        Whatever ``scrape_jobs`` returned, with two extra columns:
        ``state`` (``state_name`` as given) and ``role``
        (``role_name`` title-cased, e.g. "Data Scientist").
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(site_names, str):
        site_names = [site_names]
    sites = list(site_names)

    print(f"\nStarting {role_name} in {state_name} scrape...")
    print(f"Time: {datetime.now()}")

    jobs = scrape_jobs(
        site_name=sites,
        search_term=role_name,
        location=state_name,
        results_wanted=results,
        description_format="html",
        verbose=1
    )

    # Tag each row so the per-state / per-role frames stay identifiable
    # after they are concatenated later in the notebook.
    jobs["state"] = state_name
    jobs["role"] = role_name.title()

    print(f"\n{state_name} - {role_name}: {len(jobs)} jobs scraped")
    print(f"Completed at: {datetime.now()}")
    print("-" * 60)

    return jobs

In [9]:
# Cell 3: scrape California data-scientist postings and save them as CSV

ds_ca = scrape_role_state(role_name="data scientist", state_name="California", results=400)
ds_ca.to_csv(path_or_buf="../data/california_data_scientist_jobs.csv", index=False)


Starting data scientist in California scrape...
Time: 2025-11-19 23:22:10.874870

California - data scientist: 400 jobs scraped
Completed at: 2025-11-19 23:22:14.688863
------------------------------------------------------------


In [10]:
# Cell 4: scrape New York data-scientist postings and save them as CSV

ds_ny = scrape_role_state(role_name="data scientist", state_name="New York", results=400)
ds_ny.to_csv(path_or_buf="../data/newyork_data_scientist_jobs.csv", index=False)


Starting data scientist in New York scrape...
Time: 2025-11-19 23:23:25.542597

New York - data scientist: 400 jobs scraped
Completed at: 2025-11-19 23:23:29.029088
------------------------------------------------------------


In [11]:
# Cell 5: scrape Texas data-scientist postings and save them as CSV

ds_tx = scrape_role_state(role_name="data scientist", state_name="Texas", results=400)
ds_tx.to_csv(path_or_buf="../data/texas_data_scientist_jobs.csv", index=False)


Starting data scientist in Texas scrape...
Time: 2025-11-19 23:23:36.059236

Texas - data scientist: 400 jobs scraped
Completed at: 2025-11-19 23:23:38.818030
------------------------------------------------------------


In [12]:
# Cell 6: scrape California product-manager postings and save them as CSV

pm_ca = scrape_role_state(role_name="product manager", state_name="California", results=400)
pm_ca.to_csv(path_or_buf="../data/california_product_manager_jobs.csv", index=False)


Starting product manager in California scrape...
Time: 2025-11-19 23:26:25.450137

California - product manager: 400 jobs scraped
Completed at: 2025-11-19 23:26:34.503222
------------------------------------------------------------


In [13]:
# Cell 7: scrape New York product-manager postings and save them as CSV

pm_ny = scrape_role_state(role_name="product manager", state_name="New York", results=400)
pm_ny.to_csv(path_or_buf="../data/newyork_product_manager_jobs.csv", index=False)


Starting product manager in New York scrape...
Time: 2025-11-19 23:26:41.710690

New York - product manager: 400 jobs scraped
Completed at: 2025-11-19 23:26:44.236380
------------------------------------------------------------


In [14]:
# Cell 8: scrape Texas product-manager postings and save them as CSV

pm_tx = scrape_role_state(role_name="product manager", state_name="Texas", results=400)
pm_tx.to_csv(path_or_buf="../data/texas_product_manager_jobs.csv", index=False)


Starting product manager in Texas scrape...
Time: 2025-11-19 23:26:44.270980

Texas - product manager: 400 jobs scraped
Completed at: 2025-11-19 23:26:47.795853
------------------------------------------------------------


In [15]:
# Cell 9: stack the six role/state frames and write one combined CSV
# (optional, but convenient for downstream analysis)

print("\nCombining all DS & PM data...")

# ignore_index=True gives the stacked frame a fresh 0..n-1 index instead of
# repeating each source frame's own row labels.
all_jobs = pd.concat(objs=[ds_ca, ds_ny, ds_tx, pm_ca, pm_ny, pm_tx], ignore_index=True)

all_jobs.to_csv(path_or_buf="../data/all_states_DS_PM_jobs.csv", index=False)

print(f"Total combined jobs: {all_jobs.shape[0]}")
print("Saved to ../data/all_states_DS_PM_jobs.csv")
print("=" * 60)


Combining all DS & PM data...
Total combined jobs: 2400
Saved to ../data/all_states_DS_PM_jobs.csv
