<a href="https://colab.research.google.com/github/immin0241/school_projects/blob/master/3_1_ai/wip_ai_jobs_wage_expectation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
import numpy as np
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the job dataset and keep only the rows whose company is located in South Korea.
df = pd.read_csv('/content/ai_job_dataset.csv')
is_korean_company = df['company_location'] == 'South Korea'
korea_df = df.loc[is_korean_company].copy()

# Column used as the regression target and the columns used as model inputs.
target = 'salary_usd'
features = ['required_skills', 'experience_level', 'years_experience']

X, y = korea_df[features], korea_df[target]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"훈련 데이터 수: {len(X_train)}개, 테스트 데이터 수: {len(X_test)}개")

def split_skills(skills):
    """Split a comma-separated skills string into stripped, non-empty tokens.

    A named function (rather than a lambda) keeps the fitted vectorizer
    picklable, and dropping empty tokens prevents a trailing or doubled comma
    from producing a spurious '' feature.
    """
    stripped = (part.strip() for part in skills.split(','))
    return [token for token in stripped if token]


# Binary bag-of-skills encoding: each distinct skill becomes a 0/1 column
# (at most 300 columns are kept).
text_transformer = CountVectorizer(
    tokenizer=split_skills,
    # token_pattern is ignored when a tokenizer is supplied; setting it to None
    # silences the UserWarning scikit-learn would otherwise emit on fit.
    token_pattern=None,
    max_features=300,
    binary=True,
)

# Categories not seen during fit are encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

numeric_transformer = StandardScaler()

# 'required_skills' is passed as a plain string (not a list) so that
# CountVectorizer receives a 1-D sequence of documents.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'required_skills'),
        ('cat', categorical_transformer, ['experience_level']),
        ('num', numeric_transformer, ['years_experience']),
    ],
    remainder='passthrough',
    # Always return a dense array: the vectorizer and one-hot encoder produce
    # sparse output, and the downstream Keras model is fed this matrix directly.
    sparse_threshold=0,
)


# Feed-forward regressor: three ReLU hidden layers (128 -> 64 -> 32), with
# dropout after the first two, and a single linear output unit.
model = Sequential()
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Mean absolute error is used both as the training loss and as the reported metric.
model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=['mae'],
)


# Chain the preprocessing and the Keras model so raw feature frames can be
# passed straight to fit/predict.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', model)])

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# The 'regressor__' prefix routes these keyword arguments to the Keras model's fit().
# NOTE(review): y_train is passed in raw target units; consider scaling the
# target if convergence turns out to be slow.
pipeline.fit(X_train, y_train,
             regressor__epochs=200,
             regressor__batch_size=16,
             regressor__validation_split=0.2,
             regressor__callbacks=[early_stopping],
             regressor__verbose=1)

# Pipeline.fit returns the pipeline itself, not the Keras training history, so
# read the History object that Keras attaches to the fitted model instead.
history = pipeline.named_steps['regressor'].history

# The Keras model is evaluated directly (outside the pipeline), so the held-out
# features must first go through the already-fitted preprocessing step.
fitted_preprocessor = pipeline.named_steps['preprocessor']
X_test_processed = fitted_preprocessor.transform(X_test)

# evaluate() returns the loss followed by the compiled metric.
loss, mae = model.evaluate(X_test_processed, y_test, verbose=0)
print(f"\n테스트 데이터에 대한 최종 Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"-> 모델의 연봉 예측치가 실제값과 평균적으로 ${mae:,.2f} 정도 차이남을 의미합니다.")


# Predict through the full pipeline (preprocessing + model) on the raw test features.
predictions = pipeline.predict(X_test)

# Side-by-side table of actual vs. predicted values and their signed difference.
actual_salaries = y_test.to_numpy().ravel()
predicted_salaries = predictions.ravel()
results_df = pd.DataFrame({
    'Actual Salary': actual_salaries,
    'Predicted Salary': predicted_salaries,
})
results_df['Difference'] = results_df['Actual Salary'].sub(results_df['Predicted Salary'])

print("\n--- 실제 연봉과 예측 연봉 비교 (상위 5개) ---")
print(results_df.head())

훈련 데이터 수: 577개, 테스트 데이터 수: 145개




Epoch 1/200
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 85783.3359 - mae: 85783.3359 - val_loss: 88676.5312 - val_mae: 88676.5312
Epoch 2/200
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 84557.7891 - mae: 84557.7891 - val_loss: 88666.9219 - val_mae: 88666.9219
Epoch 3/200
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 87376.7109 - mae: 87376.7109 - val_loss: 88603.8203 - val_mae: 88603.8203
Epoch 4/200
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 86380.7109 - mae: 86380.7109 - val_loss: 88319.9375 - val_mae: 88319.9375
Epoch 5/200
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 83586.6484 - mae: 83586.6484 - val_loss: 87443.3672 - val_mae: 87443.3672
Epoch 6/200
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 84531.4375 - mae: 84531.4375 - val_loss: 85347.2734 - val_mae: 85347

