# Data Engineering Playground

This notebook is for experimenting with data processing, transformation, and analysis for the job automation system.

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's "backend" directory importable from this notebook.
# The location is resolved relative to the kernel's current working directory
# (two levels up, then "backend").
# NOTE(review): this assumes the kernel is started from the notebook's own
# folder — confirm, otherwise the wrong directory is added.
backend_dir = str(Path.cwd().parent.parent / "backend")

# Guard the insert so that re-running this cell does not pile up duplicate
# entries on sys.path.
if backend_dir not in sys.path:
    sys.path.insert(0, backend_dir)

print("Data engineering environment ready!")

## Job Data Analysis

Analyze job posting data to understand patterns and improve automation. The cells below use a small hard-coded sample as a placeholder until real job data is loaded.

In [None]:
# Job postings are meant to come from Supabase or local files eventually
# (e.g. to study job requirements, salary ranges, company patterns).
#
# Until that loader exists, build a tiny stand-in frame: one record per
# posting, with the salary bounds and a remote-work flag.
job_data = pd.DataFrame([
    {
        'title': 'Software Engineer',
        'company': 'TechCorp',
        'salary_min': 80000,
        'salary_max': 120000,
        'remote': True,
    },
    {
        'title': 'Data Scientist',
        'company': 'DataInc',
        'salary_min': 90000,
        'salary_max': 140000,
        'remote': True,
    },
    {
        'title': 'Product Manager',
        'company': 'ProductCo',
        'salary_min': 100000,
        'salary_max': 150000,
        'remote': False,
    },
])

# Show the first rows as a quick sanity check of the sample.
print("Sample job data:")
print(job_data.head())

In [None]:
# Count how many postings carry each job title.
title_counts = job_data['title'].value_counts()
print("Job title distribution:")
print(title_counts)

# Draw the counts as a bar chart on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
title_counts.plot.bar(ax=ax)
ax.set_title('Job Title Distribution')
# Tilt the title labels so longer names stay readable.
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## Company Analysis

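Analyze company patterns and characteristics, starting with the midpoint salary (the average of the minimum and maximum) offered by each company.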

In [None]:
# Midpoint of each posting's salary range: (salary_min + salary_max) / 2.
# Note: this adds a 'salary_avg' column to job_data itself, so every cell
# that runs afterwards sees the extra column.
job_data['salary_avg'] = job_data['salary_min'].add(job_data['salary_max']).div(2)

# One bar per company showing its midpoint salary.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=job_data, x='company', y='salary_avg', ax=ax)
ax.set_title('Average Salary by Company')
# Tilt the company labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## Data Quality Assessment

Assess data quality and identify issues.

In [None]:
# Number of null entries in each column.
missing_per_column = job_data.isna().sum()
print("Missing data summary:")
print(missing_per_column)

# The dtype pandas assigned to each column.
column_dtypes = job_data.dtypes
print("\nData types:")
print(column_dtypes)

# Summary statistics as produced by DataFrame.describe().
summary_stats = job_data.describe()
print("\nBasic statistics:")
print(summary_stats)

## Export Processed Data

Save the cleaned and processed data to `processed_jobs.csv`, in a `processed` directory beside the current working directory, for use in the application.

In [None]:
# Destination: a "processed" directory that sits next to the current working
# directory. It is created on first use; an existing directory is left alone.
output_dir = Path.cwd().parent.joinpath("processed")
output_dir.mkdir(exist_ok=True)

# Write the frame without its index column and report where it went.
csv_path = output_dir / "processed_jobs.csv"
job_data.to_csv(csv_path, index=False)
print(f"Processed data saved to {csv_path}")