In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from jobspy import scrape_jobs

In [2]:
# Scrape Indeed for "database administrator" postings in California, tag each
# row with its state, and persist the raw result as a CSV for later cells.
print("Starting California Database Administrator scrape...")
print(f"Time: {datetime.now()}")

# Live network call: the number of rows returned can differ between runs and
# may be far below `results_wanted`.
ca_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="database administrator",
    location="California",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# Label the rows so they stay attributable after the states are concatenated.
ca_jobs['state'] = 'California'

# Create the output folder on demand so a fresh checkout (where ../data/raw may
# not exist yet) does not lose the scrape to an OSError from to_csv.
ca_csv_path = Path('../data/raw/california_database_administrator_jobs.csv')
ca_csv_path.parent.mkdir(parents=True, exist_ok=True)
ca_jobs.to_csv(ca_csv_path, index=False)

print(f"\n California: {len(ca_jobs)} database administrator jobs scraped")
# Only show the per-site breakdown when the column is present; a scrape that
# comes back without a 'site' column (presumably when nothing was found) would
# otherwise raise KeyError here, after the CSV was already written.
if 'site' in ca_jobs.columns:
    print(f"Jobs by site:\n{ca_jobs['site'].value_counts()}")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting California Database Administrator scrape...
Time: 2025-11-30 21:12:41.774458

 California: 105 database administrator jobs scraped
Jobs by site:
site
indeed    105
Name: count, dtype: int64
Completed at: 2025-11-30 21:12:43.466812
------------------------------------------------------------


In [3]:
# Scrape Indeed for "database administrator" postings in New York, tag each
# row with its state, and persist the raw result as a CSV for later cells.
print("Starting New York Database Administrator scrape...")
print(f"Time: {datetime.now()}")

# Live network call: the number of rows returned can differ between runs and
# may be far below `results_wanted`.
ny_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="database administrator",
    location="New York",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# Label the rows so they stay attributable after the states are concatenated.
ny_jobs['state'] = 'New York'

# Create the output folder on demand so a fresh checkout (where ../data/raw may
# not exist yet) does not lose the scrape to an OSError from to_csv.
ny_csv_path = Path('../data/raw/newyork_database_administrator_jobs.csv')
ny_csv_path.parent.mkdir(parents=True, exist_ok=True)
ny_jobs.to_csv(ny_csv_path, index=False)

print(f"\n New York: {len(ny_jobs)} database administrator jobs scraped")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting New York Database Administrator scrape...
Time: 2025-11-30 21:16:23.431660

 New York: 44 database administrator jobs scraped
Completed at: 2025-11-30 21:16:24.214661
------------------------------------------------------------


In [4]:
# Scrape Indeed for "database administrator" postings in Texas, tag each row
# with its state, and persist the raw result as a CSV for later cells.
print("Starting Texas Database Administrator scrape...")
print(f"Time: {datetime.now()}")

# Live network call: the number of rows returned can differ between runs and
# may be far below `results_wanted`.
tx_jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="database administrator",
    location="Texas",
    results_wanted=500,
    country_indeed='USA',
    enforce_annual_salary=True,
    description_format="html",
    verbose=1
)

# Label the rows so they stay attributable after the states are concatenated.
tx_jobs['state'] = 'Texas'

# Create the output folder on demand so a fresh checkout (where ../data/raw may
# not exist yet) does not lose the scrape to an OSError from to_csv.
tx_csv_path = Path('../data/raw/texas_database_administrator_jobs.csv')
tx_csv_path.parent.mkdir(parents=True, exist_ok=True)
tx_jobs.to_csv(tx_csv_path, index=False)

print(f"\n Texas: {len(tx_jobs)} database administrator jobs scraped")
print(f"Completed at: {datetime.now()}")
print("-" * 60)

Starting Texas Database Administrator scrape...
Time: 2025-11-30 21:16:47.256205

 Texas: 154 database administrator jobs scraped
Completed at: 2025-11-30 21:16:48.754814
------------------------------------------------------------


In [5]:
# Rebuild the per-state frames from the CSVs on disk and stack them into one
# combined dataset, which is saved next to the per-state files.
print("Combining all state data for Database Administrator positions...")

# Display label -> raw CSV for that state, in the order the rows are stacked.
state_csv_paths = {
    'California': '../data/raw/california_database_administrator_jobs.csv',
    'New York': '../data/raw/newyork_database_administrator_jobs.csv',
    'Texas': '../data/raw/texas_database_administrator_jobs.csv',
}

# Reading from disk (rather than reusing the in-memory scrape results) replaces
# ca_jobs / ny_jobs / tx_jobs with the CSV round-tripped versions.
ca_jobs, ny_jobs, tx_jobs = [
    pd.read_csv(csv_path) for csv_path in state_csv_paths.values()
]
state_frames = [ca_jobs, ny_jobs, tx_jobs]

# ignore_index gives the combined frame a fresh 0..n-1 index.
all_jobs = pd.concat(state_frames, ignore_index=True)
all_jobs.to_csv('../data/raw/all_states_database_administrator_jobs.csv', index=False)

print("\n Combined data saved!")
print(f"Total Database Administrator jobs: {len(all_jobs)}")
for state_label, state_frame in zip(state_csv_paths, state_frames):
    print(f"  - {state_label}: {len(state_frame)}")
print("\n" + "=" * 60)

Combining all state data for Database Administrator positions...

 Combined data saved!
Total Database Administrator jobs: 303
  - California: 105
  - New York: 44
  - Texas: 154



In [6]:
# Export the job listings to one Excel workbook: a sheet per state plus an
# "All States" sheet that additionally carries the `state` column.
print("Creating Excel file with separate sheets for Database Administrator jobs...")

# Columns written to every sheet, in this order.
columns_to_keep = [
    'title', 'company', 'location', 'min_amount', 'max_amount', 'currency',
    'interval', 'salary_source', 'date_posted', 'job_type', 'is_remote', 
    'job_url', 'description'
]

# The ExcelWriter below is created with engine='openpyxl', so make sure the
# package is importable before starting the export.
try:
    import openpyxl
except ImportError:
    import importlib
    import subprocess
    import sys
    print("Installing openpyxl...")
    # NOTE(review): --break-system-packages installs into an externally managed
    # Python. Prefer installing openpyxl into the kernel's environment ahead of
    # time (e.g. `%pip install openpyxl`) so this fallback never runs.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openpyxl", "--break-system-packages"])
    # A package installed while the kernel is running may be hidden by the
    # import system's cached directory listings; refresh them and import again
    # so a failed install surfaces here rather than inside the writer.
    importlib.invalidate_caches()
    import openpyxl  # noqa: F401
    print(" openpyxl installed!")

# reindex(columns=...) selects the same columns as df[columns_to_keep] when they
# all exist, but yields an empty column instead of raising KeyError when one is
# absent (e.g. a state whose CSV holds nothing but the `state` column).
all_with_state = all_jobs.reindex(columns=columns_to_keep + ['state'])
sheets = {
    'California': ca_jobs.reindex(columns=columns_to_keep),
    'New York': ny_jobs.reindex(columns=columns_to_keep),
    'Texas': tx_jobs.reindex(columns=columns_to_keep),
    'All States': all_with_state,
}

with pd.ExcelWriter('../data/raw/database_administrator_jobs_by_state.xlsx', engine='openpyxl') as writer:
    for sheet_name, sheet_frame in sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_name, index=False)

print("\n Created Excel file: database_administrator_jobs_by_state.xlsx")
print("   - California: {} jobs".format(len(ca_jobs)))
print("   - New York: {} jobs".format(len(ny_jobs)))
print("   - Texas: {} jobs".format(len(tx_jobs)))
print("   - All States: {} jobs".format(len(all_jobs)))

Creating Excel file with separate sheets for Database Administrator jobs...

 Created Excel file: database_administrator_jobs_by_state.xlsx
   - California: 105 jobs
   - New York: 44 jobs
   - Texas: 154 jobs
   - All States: 303 jobs


In [7]:
# Build a small per-state summary table (plus a TOTAL row), print it, and save
# it as a CSV alongside the raw data.
print("Creating summary report for Database Administrator positions...")

# Rows of the report, top to bottom; the last frame is the combined dataset.
row_labels = ['California', 'New York', 'Texas', 'TOTAL']
row_frames = [ca_jobs, ny_jobs, tx_jobs, all_jobs]

# Each metric is evaluated once per frame, in row order.
summary = {
    'State': row_labels,
    'Total Jobs': [len(frame) for frame in row_frames],
    # A posting counts as "with salary" when min_amount is not missing.
    'Jobs with Salary': [frame['min_amount'].notna().sum() for frame in row_frames],
    'Avg Min Salary': [frame['min_amount'].mean() for frame in row_frames],
    'Avg Max Salary': [frame['max_amount'].mean() for frame in row_frames],
    'Remote Jobs': [frame['is_remote'].sum() for frame in row_frames],
}

summary_df = pd.DataFrame(summary)


def _as_dollars(amount):
    """Render a numeric average as whole dollars, or "N/A" when it is missing."""
    if pd.isna(amount):
        return "N/A"
    return f"${amount:,.0f}"


# The averages are turned into display strings here, so both the printed table
# and the saved CSV contain formatted text rather than numbers.
for salary_col in ('Avg Min Salary', 'Avg Max Salary'):
    summary_df[salary_col] = summary_df[salary_col].apply(_as_dollars)

banner = "=" * 80
print("\n" + banner)
print("DATABASE ADMINISTRATOR JOB MARKET SUMMARY")
print(banner)
print(summary_df.to_string(index=False))
print(banner)

summary_df.to_csv('../data/raw/database_administrator_summary_report.csv', index=False)
print("\n Summary saved to: database_administrator_summary_report.csv")

Creating summary report for Database Administrator positions...

DATABASE ADMINISTRATOR JOB MARKET SUMMARY
     State  Total Jobs  Jobs with Salary Avg Min Salary Avg Max Salary  Remote Jobs
California         105                69       $110,858       $158,297           24
  New York          44                35       $103,184       $142,372            3
     Texas         154                61       $107,007       $143,953           27
     TOTAL         303               165       $107,807       $149,616           54

 Summary saved to: database_administrator_summary_report.csv
