In [None]:
# Synthetic income dataset generation, regression training, and Seaborn visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Step 1: Generate synthetic dataset
#
# Builds a table of (City, State, Occupation, Income) rows where income is an
# occupation base salary plus additive city and state adjustments plus
# Gaussian noise, then writes the table to "synthetic_income.csv".
np.random.seed(42)  # fixed seed so the generated dataset is reproducible

# `cities` and `states` are index-aligned: states[i] is the state of cities[i].
cities = ['Seattle', 'Chicago', 'Houston', 'Denver', 'Boston', 'Atlanta']
states = ['WA', 'IL', 'TX', 'CO', 'MA', 'GA']
occupations = ['Software Engineer', 'Nurse', 'Electrician', 'Marketing Manager', 'Data Analyst', 'Teacher']

# Lookup that keeps every generated row geographically consistent.
city_to_state = dict(zip(cities, states))

NUM_SAMPLES = 1000       # number of rows to generate
INCOME_NOISE_STD = 5000  # standard deviation of the per-row income noise

# Base salaries by occupation
occupation_base = {
    'Software Engineer': 95000,
    'Nurse': 70000,
    'Electrician': 60000,
    'Marketing Manager': 80000,
    'Data Analyst': 75000,
    'Teacher': 55000
}

# City adjustments (added to the occupation base salary)
city_adjustment = {
    'Seattle': 8000,
    'Chicago': -2000,
    'Houston': 1000,
    'Denver': 3000,
    'Boston': 7000,
    'Atlanta': 2000
}

# State adjustments (added on top of the city adjustment)
state_adjustment = {
    'WA': 1500,
    'IL': -1000,
    'TX': 500,
    'CO': 1000,
    'MA': 2000,
    'GA': 800
}

# Generate samples
samples = []
for _ in range(NUM_SAMPLES):
    city = np.random.choice(cities)
    # Derive the state from the city rather than drawing it independently;
    # an independent draw pairs cities with the wrong state (e.g. Seattle/TX)
    # in most rows.
    # NOTE: State is therefore fully determined by City, so a model fitted on
    # both columns cannot separate the city effect from the state effect.
    state = city_to_state[city]
    occupation = np.random.choice(occupations)

    base = occupation_base[occupation]
    city_adj = city_adjustment[city]
    state_adj = state_adjustment[state]
    noise = np.random.normal(0, INCOME_NOISE_STD)  # random variation

    income = base + city_adj + state_adj + noise
    samples.append([city, state, occupation, round(income, 2)])

df = pd.DataFrame(samples, columns=['City', 'State', 'Occupation', 'Income'])

# Save dataset to CSV (index=False keeps the row index out of the file)
df.to_csv("synthetic_income.csv", index=False)