In [2]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from jobspy import scrape_jobs

In [3]:
# California: scrape Indeed, tag the rows with the state, and save the raw CSV.
print("Starting California Machine Learning Engineer scrape...")
print(f"Time: {datetime.now()}")

# Ask jobspy for 500 Indeed results matching the search term in California.
# The exact meaning of each keyword is defined by jobspy's scrape_jobs.
ca_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="machine learning engineer",
    location="California",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# Label every row with its state so the origin is still known once the
# per-state frames are concatenated.
ca_jobs['state'] = 'California'

# to_csv does not create missing parent folders; create the output directory
# first so a fresh checkout does not fail right after the scrape finishes.
ca_csv_path = Path('../data/raw/california_machine_learning_engineer_jobs.csv')
ca_csv_path.parent.mkdir(parents=True, exist_ok=True)
ca_jobs.to_csv(ca_csv_path, index=False)

print(f"\n California: {len(ca_jobs)} machine learning engineer jobs scraped")
print(f"Jobs by site:\n{ca_jobs['site'].value_counts()}")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting California Machine Learning Engineer scrape...
Time: 2025-11-30 21:45:51.492701

 California: 500 machine learning engineer jobs scraped
Jobs by site:
site
indeed    500
Name: count, dtype: int64
Completed at: 2025-11-30 21:45:55.291098
------------------------------------------------------------


In [4]:
# New York: scrape Indeed, tag the rows with the state, and save the raw CSV.
print("Starting New York Machine Learning Engineer scrape...")
print(f"Time: {datetime.now()}")

# Ask jobspy for 500 Indeed results matching the search term in New York.
# The exact meaning of each keyword is defined by jobspy's scrape_jobs.
ny_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="machine learning engineer",
    location="New York",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# Label every row with its state so the origin is still known once the
# per-state frames are concatenated.
ny_jobs['state'] = 'New York'

# to_csv does not create missing parent folders; create the output directory
# first so a fresh checkout does not fail right after the scrape finishes.
ny_csv_path = Path('../data/raw/newyork_machine_learning_engineer_jobs.csv')
ny_csv_path.parent.mkdir(parents=True, exist_ok=True)
ny_jobs.to_csv(ny_csv_path, index=False)

print(f"\n New York: {len(ny_jobs)} machine learning engineer jobs scraped")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting New York Machine Learning Engineer scrape...
Time: 2025-11-30 21:46:07.020379

 New York: 500 machine learning engineer jobs scraped
Completed at: 2025-11-30 21:46:10.635179
------------------------------------------------------------


In [5]:
# Texas: scrape Indeed, tag the rows with the state, and save the raw CSV.
print("Starting Texas Machine Learning Engineer scrape...")
print(f"Time: {datetime.now()}")

# Ask jobspy for 500 Indeed results matching the search term in Texas.
# The exact meaning of each keyword is defined by jobspy's scrape_jobs.
tx_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="machine learning engineer",
    location="Texas",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# Label every row with its state so the origin is still known once the
# per-state frames are concatenated.
tx_jobs['state'] = 'Texas'

# to_csv does not create missing parent folders; create the output directory
# first so a fresh checkout does not fail right after the scrape finishes.
tx_csv_path = Path('../data/raw/texas_machine_learning_engineer_jobs.csv')
tx_csv_path.parent.mkdir(parents=True, exist_ok=True)
tx_jobs.to_csv(tx_csv_path, index=False)

print(f"\n Texas: {len(tx_jobs)} machine learning engineer jobs scraped")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting Texas Machine Learning Engineer scrape...
Time: 2025-11-30 21:46:22.391101

 Texas: 500 machine learning engineer jobs scraped
Completed at: 2025-11-30 21:46:27.079518
------------------------------------------------------------


In [6]:
# Reload the three per-state CSVs from disk and merge them into one file.
print("Combining all state data for Machine Learning Engineer positions...")

# Read the saved raw files in a fixed order: California, New York, Texas.
ca_jobs, ny_jobs, tx_jobs = map(pd.read_csv, (
    '../data/raw/california_machine_learning_engineer_jobs.csv',
    '../data/raw/newyork_machine_learning_engineer_jobs.csv',
    '../data/raw/texas_machine_learning_engineer_jobs.csv',
))

# Stack the frames under a fresh 0..n-1 index and persist the combined table.
all_jobs = pd.concat([ca_jobs, ny_jobs, tx_jobs], ignore_index=True)
all_jobs.to_csv('../data/raw/all_states_machine_learning_engineer_jobs.csv', index=False)

print("\n Combined data saved!")
print(f"Total Machine Learning Engineer jobs: {len(all_jobs)}")
for state_label, state_jobs in (
    ("California", ca_jobs),
    ("New York", ny_jobs),
    ("Texas", tx_jobs),
):
    print(f"  - {state_label}: {len(state_jobs)}")
print("\n" + "=" * 60)

Combining all state data for Machine Learning Engineer positions...

 Combined data saved!
Total Machine Learning Engineer jobs: 1500
  - California: 500
  - New York: 500
  - Texas: 500



In [7]:
# Export the per-state frames and the combined frame to one Excel workbook,
# one sheet per state plus an "All States" sheet.
print("Creating Excel file with separate sheets for Machine Learning Engineer jobs...")

# Columns written to every sheet; the combined sheet additionally carries 'state'.
columns_to_keep = [
    'title', 'company', 'location', 'min_amount', 'max_amount', 'currency',
    'interval', 'salary_source', 'date_posted', 'job_type', 'is_remote', 
    'job_url', 'description'
]

# The writer below uses engine='openpyxl', so the package must be importable.
# The import is only an availability probe; the name is not used directly.
try:
    import openpyxl
except ImportError:
    import importlib
    import subprocess
    import sys
    print("Installing openpyxl...")
    pip_install = [sys.executable, "-m", "pip", "install", "openpyxl"]
    try:
        # Try a normal install first: it works in virtualenvs/conda and on pip
        # releases that predate the --break-system-packages option.
        subprocess.check_call(pip_install)
    except subprocess.CalledProcessError:
        # Only when the plain install is refused (e.g. an externally managed
        # interpreter, PEP 668) retry with the override the original used.
        subprocess.check_call(pip_install + ["--break-system-packages"])
    # Make the running kernel notice the freshly installed package.
    importlib.invalidate_caches()
    print(" openpyxl installed!")

with pd.ExcelWriter('../data/raw/machine_learning_engineer_jobs_by_state.xlsx', engine='openpyxl') as writer:
    ca_jobs[columns_to_keep].to_excel(writer, sheet_name='California', index=False)
    ny_jobs[columns_to_keep].to_excel(writer, sheet_name='New York', index=False)
    tx_jobs[columns_to_keep].to_excel(writer, sheet_name='Texas', index=False)
    # The combined sheet keeps the 'state' column so rows remain attributable.
    all_with_state = all_jobs[columns_to_keep + ['state']].copy()
    all_with_state.to_excel(writer, sheet_name='All States', index=False)

print("\n Created Excel file: machine_learning_engineer_jobs_by_state.xlsx")
print("   - California: {} jobs".format(len(ca_jobs)))
print("   - New York: {} jobs".format(len(ny_jobs)))
print("   - Texas: {} jobs".format(len(tx_jobs)))
print("   - All States: {} jobs".format(len(all_jobs)))

Creating Excel file with separate sheets for Machine Learning Engineer jobs...

 Created Excel file: machine_learning_engineer_jobs_by_state.xlsx
   - California: 500 jobs
   - New York: 500 jobs
   - Texas: 500 jobs
   - All States: 1500 jobs


In [8]:
# Build, print, and save a per-state summary table (counts, mean salaries, remote jobs).
print("Creating summary report for Machine Learning Engineer positions...")

# Frames listed in the same order as the 'State' labels; the last is the combined set.
report_frames = (ca_jobs, ny_jobs, tx_jobs, all_jobs)

summary = {
    'State': ['California', 'New York', 'Texas', 'TOTAL'],
    'Total Jobs': [len(frame) for frame in report_frames],
    # A posting counts as "with salary" when its min_amount is present.
    'Jobs with Salary': [frame['min_amount'].notna().sum() for frame in report_frames],
    'Avg Min Salary': [frame['min_amount'].mean() for frame in report_frames],
    'Avg Max Salary': [frame['max_amount'].mean() for frame in report_frames],
    'Remote Jobs': [frame['is_remote'].sum() for frame in report_frames],
}

summary_df = pd.DataFrame(summary)


def _as_dollars(value):
    """Format a number as whole dollars with thousands separators, or "N/A" if missing."""
    if pd.notna(value):
        return f"${value:,.0f}"
    return "N/A"


# Replace the numeric averages with display strings (this is also what gets saved).
for col in ['Avg Min Salary', 'Avg Max Salary']:
    summary_df[col] = summary_df[col].apply(_as_dollars)

banner = "=" * 80
print("\n" + banner)
print("MACHINE LEARNING ENGINEER JOB MARKET SUMMARY")
print(banner)
print(summary_df.to_string(index=False))
print(banner)

summary_df.to_csv('../data/raw/machine_learning_engineer_summary_report.csv', index=False)
print("\n Summary saved to: machine_learning_engineer_summary_report.csv")

Creating summary report for Machine Learning Engineer positions...

MACHINE LEARNING ENGINEER JOB MARKET SUMMARY
     State  Total Jobs  Jobs with Salary Avg Min Salary Avg Max Salary  Remote Jobs
California         500               406       $142,737       $236,520           69
  New York         500               478       $162,544       $237,302          136
     Texas         500               236       $131,967       $212,678           77
     TOTAL        1500              1120       $148,921       $231,830          282

 Summary saved to: machine_learning_engineer_summary_report.csv
