In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from jobspy import scrape_jobs

In [2]:
# Scrape Indeed for California "data analyst" postings and save the raw
# results to CSV so later cells can reload them from disk.
print("Starting California Data Analyst scrape...")
print(f"Time: {datetime.now()}")

# Create the output folder up front: on a fresh checkout ../data/raw may not
# exist, and to_csv does not create missing parent directories. Doing this
# before the scrape means a bad path fails immediately, not after the request.
ca_csv_path = Path('../data/raw/california_data_analyst_jobs.csv')
ca_csv_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): enforce_annual_salary presumably normalises pay to yearly
# amounts -- confirm against the jobspy documentation.
ca_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="data analyst",
    location="California",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# Tag every row with the state it was searched under; the combined export
# in a later cell selects this column.
ca_jobs['state'] = 'California'
ca_jobs.to_csv(ca_csv_path, index=False)
print(f"\n California: {len(ca_jobs)} data analyst jobs scraped")
print(f"Jobs by site:\n{ca_jobs['site'].value_counts()}")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting California Data Analyst scrape...
Time: 2025-11-30 12:00:23.344339

 California: 500 data analyst jobs scraped
Jobs by site:
site
indeed    500
Name: count, dtype: int64
Completed at: 2025-11-30 12:00:27.410555
------------------------------------------------------------


In [3]:
# Scrape Indeed for New York "data analyst" postings and save the raw
# results to CSV so later cells can reload them from disk.
print("Starting New York Data Analyst scrape...")
print(f"Time: {datetime.now()}")

# Create the output folder up front: on a fresh checkout ../data/raw may not
# exist, and to_csv does not create missing parent directories. Doing this
# before the scrape means a bad path fails immediately, not after the request.
ny_csv_path = Path('../data/raw/newyork_data_analyst_jobs.csv')
ny_csv_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): enforce_annual_salary presumably normalises pay to yearly
# amounts -- confirm against the jobspy documentation.
ny_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="data analyst",
    location="New York",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# Tag every row with the state it was searched under; the combined export
# in a later cell selects this column.
ny_jobs['state'] = 'New York'
ny_jobs.to_csv(ny_csv_path, index=False)
print(f"\n New York: {len(ny_jobs)} data analyst jobs scraped")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting New York Data Analyst scrape...
Time: 2025-11-30 12:00:45.582546

 New York: 500 data analyst jobs scraped
Completed at: 2025-11-30 12:00:48.818092
------------------------------------------------------------


In [4]:
# Scrape Indeed for Texas "data analyst" postings and save the raw
# results to CSV so later cells can reload them from disk.
print("Starting Texas Data Analyst scrape...")
print(f"Time: {datetime.now()}")

# Create the output folder up front: on a fresh checkout ../data/raw may not
# exist, and to_csv does not create missing parent directories. Doing this
# before the scrape means a bad path fails immediately, not after the request.
tx_csv_path = Path('../data/raw/texas_data_analyst_jobs.csv')
tx_csv_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): enforce_annual_salary presumably normalises pay to yearly
# amounts -- confirm against the jobspy documentation.
tx_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="data analyst",
    location="Texas",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# Tag every row with the state it was searched under; the combined export
# in a later cell selects this column.
tx_jobs['state'] = 'Texas'
tx_jobs.to_csv(tx_csv_path, index=False)
print(f"\n Texas: {len(tx_jobs)} data analyst jobs scraped")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting Texas Data Analyst scrape...
Time: 2025-11-30 12:00:56.549839

 Texas: 500 data analyst jobs scraped
Completed at: 2025-11-30 12:01:00.363047
------------------------------------------------------------


In [5]:
# Reload the three per-state CSVs, stack them into one frame, and write the
# combined dataset next to the per-state files.
print("Combining all state data for Data Analyst positions...")

# Reading from disk rebinds ca_jobs / ny_jobs / tx_jobs to the saved CSV
# contents (in California, New York, Texas order).
state_frames = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/california_data_analyst_jobs.csv',
        '../data/raw/newyork_data_analyst_jobs.csv',
        '../data/raw/texas_data_analyst_jobs.csv',
    )
]
ca_jobs, ny_jobs, tx_jobs = state_frames

# ignore_index gives the combined frame a fresh 0..n-1 index.
all_jobs = pd.concat(state_frames, ignore_index=True)
all_jobs.to_csv('../data/raw/all_states_data_analyst_jobs.csv', index=False)

print(f"\n Combined data saved!")
combined_report = [
    f"Total Data Analyst jobs: {len(all_jobs)}",
    f"  - California: {len(ca_jobs)}",
    f"  - New York: {len(ny_jobs)}",
    f"  - Texas: {len(tx_jobs)}",
    "\n" + "=" * 60,
]
print("\n".join(combined_report))

Combining all state data for Data Analyst positions...

 Combined data saved!
Total Data Analyst jobs: 1500
  - California: 500
  - New York: 500
  - Texas: 500



In [6]:
# Export the per-state frames and the combined frame to one workbook,
# one sheet per state plus an "All States" sheet.
print("Creating Excel file with separate sheets for Data Analyst jobs...")

# Columns written to every sheet; the "All States" sheet additionally gets
# the 'state' column so rows stay attributable after stacking.
columns_to_keep = [
    'title', 'company', 'location', 'min_amount', 'max_amount', 'currency',
    'interval', 'salary_source', 'date_posted', 'job_type', 'is_remote',
    'job_url', 'description'
]

# The writer below is created with engine='openpyxl', so the package must be
# importable. If it is missing, install it into this kernel's interpreter.
# NOTE(review): --break-system-packages overrides pip's externally-managed
# environment guard and is rejected by older pip releases; prefer installing
# openpyxl into the environment ahead of time.
try:
    import openpyxl
except ImportError:
    import importlib
    import subprocess
    import sys
    print("Installing openpyxl...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openpyxl", "--break-system-packages"])
    # The package appeared on disk while this process was running: clear the
    # import system's finder caches, then import again so a broken install
    # fails here rather than later inside ExcelWriter.
    importlib.invalidate_caches()
    import openpyxl
    print(" openpyxl installed!")

with pd.ExcelWriter('../data/raw/data_analyst_jobs_by_state.xlsx', engine='openpyxl') as writer:
    # Sheets are written in this order: the three states, then the combined view.
    ca_jobs[columns_to_keep].to_excel(writer, sheet_name='California', index=False)
    ny_jobs[columns_to_keep].to_excel(writer, sheet_name='New York', index=False)
    tx_jobs[columns_to_keep].to_excel(writer, sheet_name='Texas', index=False)
    all_with_state = all_jobs[columns_to_keep + ['state']].copy()
    all_with_state.to_excel(writer, sheet_name='All States', index=False)

print("\n Created Excel file: data_analyst_jobs_by_state.xlsx")
print("   - California: {} jobs".format(len(ca_jobs)))
print("   - New York: {} jobs".format(len(ny_jobs)))
print("   - Texas: {} jobs".format(len(tx_jobs)))
print("   - All States: {} jobs".format(len(all_jobs)))

Creating Excel file with separate sheets for Data Analyst jobs...

 Created Excel file: data_analyst_jobs_by_state.xlsx
   - California: 500 jobs
   - New York: 500 jobs
   - Texas: 500 jobs
   - All States: 1500 jobs


In [7]:
# Build a per-state summary table (plus a TOTAL row from the combined frame),
# print it, and save it as CSV.
print("Creating summary report for Data Analyst positions...")

# Each summary row pairs a display label with the frame it is computed from;
# list order is the row order of the report.
report_rows = [
    ('California', ca_jobs),
    ('New York', ny_jobs),
    ('Texas', tx_jobs),
    ('TOTAL', all_jobs),
]

summary = {
    'State': [label for label, jobs in report_rows],
    'Total Jobs': [len(jobs) for label, jobs in report_rows],
    # A posting counts as "with salary" when min_amount is not missing.
    'Jobs with Salary': [jobs['min_amount'].notna().sum() for label, jobs in report_rows],
    'Avg Min Salary': [jobs['min_amount'].mean() for label, jobs in report_rows],
    'Avg Max Salary': [jobs['max_amount'].mean() for label, jobs in report_rows],
    'Remote Jobs': [jobs['is_remote'].sum() for label, jobs in report_rows],
}

summary_df = pd.DataFrame(summary)


def _as_dollars(amount):
    """Render an average as whole dollars, or "N/A" when it is missing."""
    if pd.notna(amount):
        return f"${amount:,.0f}"
    return "N/A"


# The salary columns become display strings here, so the saved CSV holds
# the formatted text rather than raw numbers.
for col in ['Avg Min Salary', 'Avg Max Salary']:
    summary_df[col] = summary_df[col].apply(_as_dollars)

divider = "=" * 80
print("\n" + divider)
print("DATA ANALYST JOB MARKET SUMMARY")
print(divider)
print(summary_df.to_string(index=False))
print(divider)

summary_df.to_csv('../data/raw/data_analyst_summary_report.csv', index=False)
print("\n Summary saved to: data_analyst_summary_report.csv")

Creating summary report for Data Analyst positions...

DATA ANALYST JOB MARKET SUMMARY
     State  Total Jobs  Jobs with Salary Avg Min Salary Avg Max Salary  Remote Jobs
California         500               419        $90,376       $132,188          135
  New York         500               423        $88,381       $121,645          115
     Texas         500               209        $84,414       $119,450          109
     TOTAL        1500              1051        $88,387       $125,412          359

 Summary saved to: data_analyst_summary_report.csv
