In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from jobspy import scrape_jobs

In [2]:
# Scrape Indeed for "business analyst" postings searched with location
# "California", tag the rows with the state, and persist the raw result as CSV.
print("Starting California Business Analyst scrape...")
print(f"Time: {datetime.now()}")

ca_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="business analyst",
    location="California",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# The label records which search produced the row; it is a constant, not a
# value parsed from each posting's own location field.
ca_jobs['state'] = 'California'

# DataFrame.to_csv does not create missing parent directories. Make sure the
# output folder exists first so a fresh checkout does not fail with OSError
# only after the scrape has already been performed.
ca_csv_path = Path('../data/raw/california_business_analyst_jobs.csv')
ca_csv_path.parent.mkdir(parents=True, exist_ok=True)
ca_jobs.to_csv(ca_csv_path, index=False)

print(f"\n California: {len(ca_jobs)} business analyst jobs scraped")
print(f"Jobs by site:\n{ca_jobs['site'].value_counts()}")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting California Business Analyst scrape...
Time: 2025-11-30 12:10:34.130546

 California: 500 business analyst jobs scraped
Jobs by site:
site
indeed    500
Name: count, dtype: int64
Completed at: 2025-11-30 12:10:38.750905
------------------------------------------------------------


In [3]:
# Scrape Indeed for "business analyst" postings searched with location
# "New York", tag the rows with the state, and persist the raw result as CSV.
print("Starting New York Business Analyst scrape...")
print(f"Time: {datetime.now()}")

ny_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="business analyst",
    location="New York",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# The label records which search produced the row; it is a constant, not a
# value parsed from each posting's own location field.
ny_jobs['state'] = 'New York'

# DataFrame.to_csv does not create missing parent directories. Make sure the
# output folder exists first so a fresh checkout does not fail with OSError
# only after the scrape has already been performed.
ny_csv_path = Path('../data/raw/newyork_business_analyst_jobs.csv')
ny_csv_path.parent.mkdir(parents=True, exist_ok=True)
ny_jobs.to_csv(ny_csv_path, index=False)

print(f"\n New York: {len(ny_jobs)} business analyst jobs scraped")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting New York Business Analyst scrape...
Time: 2025-11-30 12:10:41.123272

 New York: 500 business analyst jobs scraped
Completed at: 2025-11-30 12:10:49.493667
------------------------------------------------------------


In [4]:
# Scrape Indeed for "business analyst" postings searched with location
# "Texas", tag the rows with the state, and persist the raw result as CSV.
print("Starting Texas Business Analyst scrape...")
print(f"Time: {datetime.now()}")

tx_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="business analyst",
    location="Texas",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# The label records which search produced the row; it is a constant, not a
# value parsed from each posting's own location field.
tx_jobs['state'] = 'Texas'

# DataFrame.to_csv does not create missing parent directories. Make sure the
# output folder exists first so a fresh checkout does not fail with OSError
# only after the scrape has already been performed.
tx_csv_path = Path('../data/raw/texas_business_analyst_jobs.csv')
tx_csv_path.parent.mkdir(parents=True, exist_ok=True)
tx_jobs.to_csv(tx_csv_path, index=False)

print(f"\n Texas: {len(tx_jobs)} business analyst jobs scraped")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting Texas Business Analyst scrape...
Time: 2025-11-30 12:10:53.379180

 Texas: 500 business analyst jobs scraped
Completed at: 2025-11-30 12:10:57.729552
------------------------------------------------------------


In [5]:
# Reload each state's raw CSV from disk (rather than reusing the in-memory
# scrape results), stack the three frames into one, and save the combined
# frame next to the per-state files.
print("Combining all state data for Business Analyst positions...")

ca_jobs = pd.read_csv('../data/raw/california_business_analyst_jobs.csv')
ny_jobs = pd.read_csv('../data/raw/newyork_business_analyst_jobs.csv')
tx_jobs = pd.read_csv('../data/raw/texas_business_analyst_jobs.csv')

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each state's own row numbers.
state_frames = [ca_jobs, ny_jobs, tx_jobs]
all_jobs = pd.concat(state_frames, ignore_index=True)
all_jobs.to_csv('../data/raw/all_states_business_analyst_jobs.csv', index=False)

# Assemble the report first and emit it with a single print; joining with
# newlines yields the same stdout as printing each line separately.
summary_lines = [
    f"\n Combined data saved!",
    f"Total Business Analyst jobs: {len(all_jobs)}",
    f"  - California: {len(ca_jobs)}",
    f"  - New York: {len(ny_jobs)}",
    f"  - Texas: {len(tx_jobs)}",
    "\n" + "=" * 60,
]
print("\n".join(summary_lines))

Combining all state data for Business Analyst positions...

 Combined data saved!
Total Business Analyst jobs: 1500
  - California: 500
  - New York: 500
  - Texas: 500



In [6]:
# Export the per-state frames and the combined frame to one Excel workbook:
# a sheet per state plus an "All States" sheet that also carries the state label.
print("Creating Excel file with separate sheets for Business Analyst jobs...")

# Columns written to every sheet. Selecting with this list raises KeyError if
# any of them is absent from a frame.
columns_to_keep = [
    'title', 'company', 'location', 'min_amount', 'max_amount', 'currency',
    'interval', 'salary_source', 'date_posted', 'job_type', 'is_remote',
    'job_url', 'description'
]

# Availability probe for the Excel engine requested below; the imported name
# itself is not used directly in this cell.
try:
    import openpyxl
except ImportError:
    import importlib
    import subprocess
    import sys
    print("Installing openpyxl...")
    # NOTE(review): --break-system-packages overrides the "externally managed
    # environment" protection and is only understood by recent pip releases;
    # prefer installing openpyxl into the kernel's environment ahead of time.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openpyxl", "--break-system-packages"])
    # The package was written to disk after the interpreter started, so the
    # import system's finder caches may not know about it yet.
    importlib.invalidate_caches()
    # Re-import so a failed or misdirected install surfaces here as a clear
    # ImportError instead of later inside pd.ExcelWriter.
    import openpyxl
    print(" openpyxl installed!")

with pd.ExcelWriter('../data/raw/business_analyst_jobs_by_state.xlsx', engine='openpyxl') as writer:
    ca_jobs[columns_to_keep].to_excel(writer, sheet_name='California', index=False)
    ny_jobs[columns_to_keep].to_excel(writer, sheet_name='New York', index=False)
    tx_jobs[columns_to_keep].to_excel(writer, sheet_name='Texas', index=False)
    # Only the combined sheet needs the 'state' column, since the per-state
    # sheets are already separated by name.
    all_with_state = all_jobs[columns_to_keep + ['state']].copy()
    all_with_state.to_excel(writer, sheet_name='All States', index=False)

print("\n Created Excel file: business_analyst_jobs_by_state.xlsx")
print(f"   - California: {len(ca_jobs)} jobs")
print(f"   - New York: {len(ny_jobs)} jobs")
print(f"   - Texas: {len(tx_jobs)} jobs")
print(f"   - All States: {len(all_jobs)} jobs")

Creating Excel file with separate sheets for Business Analyst jobs...

 Created Excel file: business_analyst_jobs_by_state.xlsx
   - California: 500 jobs
   - New York: 500 jobs
   - Texas: 500 jobs
   - All States: 1500 jobs
