In [3]:
# ====================
# Imports & Setup
# ====================
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from sklearn.base import clone
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
import joblib

# Allow up to 200 columns in printed frames.
pd.options.display.max_columns = 200

# ====================
# Load Data
# ====================
df = pd.read_csv("future_job_trend_with_types.csv")  # change to match your file name
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line after the summary.
df.info()

   Year            Job_Type  Tech_Index  AI_Adoption  Automation_Level  \
0  2020   Financial Analyst       62.98        63.54             81.38   
1  2028  Software Developer       60.33        63.49             56.74   
2  2030          Accountant       84.37        52.80             73.33   
3  2025     Project Manager       62.98        94.48             51.81   
4  2026    Digital Marketer       68.57        40.98             31.27   

   Education_Index  Remote_Work_Index  Market_Demand  Job_Trend_Score  
0            66.29              48.40          63.96            68.67  
1            60.11              70.86          55.00            72.80  
2            45.52              34.65          63.29            53.32  
3            72.84              38.08          74.08            63.49  
4            75.71              33.47          78.06            65.43  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 9 columns):
 #   Column        

In [4]:
# ====================
# Feature & Target
# ====================
# Model inputs, in the column order used for training.
features = [
    'Year', 'Job_Type', 'Tech_Index', 'AI_Adoption', 'Automation_Level',
    'Education_Index', 'Remote_Work_Index', 'Market_Demand',
]
target = 'Job_Trend_Score'

# Work on copies so later steps never touch the columns of `df` itself.
X = df.loc[:, features].copy()
y = df.loc[:, target].copy()

# Split the feature names into categorical and numeric groups; every feature
# that is not categorical is treated as numeric (original order preserved).
cat_cols = ['Job_Type']
num_cols = [col for col in features if col not in cat_cols]

In [6]:
# ====================
# Preprocessing
# ====================
# Numeric columns are standardised; the categorical column is one-hot encoded.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising at predict time.
num_pipe = Pipeline(steps=[('scaler', StandardScaler())])
cat_pipe = Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols),
    ]
)


In [7]:
# ====================
# Train-Test Split
# ====================
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [8]:
# ====================
# Random Forest Regressor + Hyperparameter Tuning
# ====================
# Preprocessing and model live in one pipeline so that, during cross-validation,
# the scaler/encoder are fitted on each training fold only.
reg = Pipeline([
    ('prep', preprocessor),
    ('rf', RandomForestRegressor(random_state=42))
])

# Search space; keys use the "<step name>__<parameter>" pipeline syntax.
param_grid = {
    'rf__n_estimators': [200, 300, 500],
    'rf__max_depth': [10, 20, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

# Randomly sample 15 of the 81 possible combinations, scored by 5-fold CV R2.
search = RandomizedSearchCV(
    reg, param_distributions=param_grid,
    n_iter=15, cv=5, scoring='r2', n_jobs=-1, random_state=42
)

search.fit(X_train, y_train)
best_reg = search.best_estimator_

y_pred = best_reg.predict(X_test)

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\n=== Random Forest Regression (Best Model) ===")
print("RMSE:", rf_rmse)
print("R2:", r2_score(y_test, y_pred))
print("Best Params:", search.best_params_)



=== Random Forest Regression (Best Model) ===
RMSE: 7.291864863697873
R2: 0.5907557438618005
Best Params: {'rf__n_estimators': 300, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1, 'rf__max_depth': 20}




In [9]:
# ====================
# Gradient Boosting Regressor (comparison)
# ====================
# Fixed-hyperparameter baseline to compare against the tuned Random Forest.
# Note: Pipeline does not clone its steps, so fitting `gbr` fits the shared
# `preprocessor` object itself (on X_train).
gbr = Pipeline([
    ('prep', preprocessor),
    ('gbr', GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, random_state=42))
])

gbr.fit(X_train, y_train)
gbr_pred = gbr.predict(X_test)

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
gbr_rmse = np.sqrt(mean_squared_error(y_test, gbr_pred))

print("\n=== Gradient Boosting Regression ===")
print("RMSE:", gbr_rmse)
print("R2:", r2_score(y_test, gbr_pred))



=== Gradient Boosting Regression ===
RMSE: 5.98559283526369
R2: 0.7242473392393989




In [10]:
# ====================
# Clustering (job grouping)
# ====================
# Fit a *clone* of the preprocessor on the full data set. `preprocessor` is the
# very object used as the 'prep' step of the already-fitted `gbr` pipeline
# (Pipeline does not clone its steps), so refitting it here on all rows would
# silently change the scaler/encoder inside that trained model before it is saved.
cluster_prep = clone(preprocessor)
X_full = cluster_prep.fit_transform(X)

kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster'] = kmeans.fit_predict(X_full)

# Most frequent Job_Type within each cluster.
top_job_per_cluster = (
    df.groupby('cluster')['Job_Type']
      .agg(lambda x: x.value_counts().index[0])
      .to_frame('top_job')
)

print("\n=== Cluster Top Job_Type per Cluster ===")
print(top_job_per_cluster)





=== Cluster Top Job_Type per Cluster ===
                top_job
cluster                
0        Civil Engineer
1        Data Scientist
2        Civil Engineer


In [12]:
# ====================
# Save Models
# ====================
# Persist both fitted pipelines (preprocessing + regressor) next to the notebook.
models_to_save = {
    "rf_jobtrend_reg_best.joblib": best_reg,
    "gbr_jobtrend_reg.joblib": gbr,
}
for out_path, fitted_pipeline in models_to_save.items():
    joblib.dump(fitted_pipeline, out_path)

print("\nModels saved: rf_jobtrend_reg_best.joblib, gbr_jobtrend_reg.joblib")


Models saved: rf_jobtrend_reg_best.joblib, gbr_jobtrend_reg.joblib


In [None]:
# %% [markdown]
# # Future Job Trend Prediction
# Load the saved .joblib model and run a prediction on new data

# %%
import joblib
import pandas as pd

# Load the tuned Random Forest pipeline written by the "Save Models" step.
# (The path was previously split into ".joblib" plus a stray bare name, which
# pointed at a non-existent file and raised a NameError.)
model = joblib.load("rf_jobtrend_reg_best.joblib")

# Example of new data to predict on.
# The frame must contain every column the pipeline was trained on:
# ['Year','Job_Type','Tech_Index','AI_Adoption','Automation_Level',
#  'Education_Index','Remote_Work_Index','Market_Demand']
# Columns are selected by name, so the names matter.
# NOTE(review): "AI Engineer" does not appear in the training rows shown above;
# with handle_unknown='ignore' an unseen Job_Type is encoded as all zeros —
# confirm this value exists in the training data.
new_data = pd.DataFrame([{
    "Year": 2030,
    "Job_Type": "AI Engineer",
    "Tech_Index": 85,
    "AI_Adoption": 90,
    "Automation_Level": 70,
    "Education_Index": 88,
    "Remote_Work_Index": 80,
    "Market_Demand": 92
}])

# Predict
prediction = model.predict(new_data)

print("Hasil Prediksi Job_Trend_Score untuk data baru:", prediction[0])


Hasil Prediksi Job_Trend_Score untuk data baru: 80.49913333333335


In [None]:
# jajal.py
# Streamlit dashboard for the saved Random Forest pipeline.
# Save this cell as jajal.py and start it with `streamlit run jajal.py`; when
# executed inside a notebook kernel Streamlit only logs "No runtime found".
import streamlit as st
import pandas as pd
import joblib
import matplotlib.pyplot as plt

# The model file must be in the directory the app is started from.
MODEL_PATH = "rf_jobtrend_reg_best.joblib"

# Options used only when the loaded model does not expose its training categories.
FALLBACK_JOB_TYPES = [
    "Data Scientist", "Software Engineer", "AI Specialist",
    "Automation Engineer", "Business Analyst", "Other"
]


@st.cache_resource
def load_model(path):
    """Load the fitted pipeline once and reuse it across script reruns.

    Streamlit re-executes the whole script on every widget interaction;
    caching avoids re-reading the .joblib file each time.
    """
    return joblib.load(path)


def job_type_options(pipeline, fallback):
    """Return the Job_Type categories known to the pipeline's one-hot encoder.

    Offering only categories seen during training matters because the encoder
    was built with handle_unknown='ignore': an unseen job type is silently
    encoded as all zeros rather than rejected. If the pipeline does not have
    the expected 'prep' -> 'cat' -> 'ohe' layout, `fallback` is returned.
    """
    try:
        encoder = (
            pipeline.named_steps["prep"]
            .named_transformers_["cat"]
            .named_steps["ohe"]
        )
        return [str(category) for category in encoder.categories_[0]]
    except (AttributeError, KeyError, IndexError):
        return list(fallback)


# ====================
# Page setup (kept ahead of every other Streamlit call)
# ====================
st.set_page_config(page_title="📊 Dashboard Prediksi Tren Pekerjaan", layout="wide")
st.title("📊 Prediksi Tren Pekerjaan")

st.markdown("Isi parameter berikut untuk memprediksi **Job Trend Score**:")

# ====================
# Load Model
# ====================
model = load_model(MODEL_PATH)

# ====================
# Sidebar Input
# ====================
st.sidebar.header("Input Data")

year = st.sidebar.number_input("Tahun", min_value=2020, max_value=2050, value=2025, step=1)

job_type = st.sidebar.selectbox("Job Type", job_type_options(model, FALLBACK_JOB_TYPES))

tech_index = st.sidebar.slider("Tech Index", 0, 100, 50)
ai_adoption = st.sidebar.slider("AI Adoption", 0, 100, 40)
automation_level = st.sidebar.slider("Automation Level", 0, 100, 30)
education_index = st.sidebar.slider("Education Index", 0, 100, 60)
remote_work_index = st.sidebar.slider("Remote Work Index", 0, 100, 50)
market_demand = st.sidebar.slider("Market Demand", 0, 100, 70)

# ====================
# Build the input DataFrame (column names must match the training features)
# ====================
input_data = pd.DataFrame({
    "Year": [year],
    "Job_Type": [job_type],
    "Tech_Index": [tech_index],
    "AI_Adoption": [ai_adoption],
    "Automation_Level": [automation_level],
    "Education_Index": [education_index],
    "Remote_Work_Index": [remote_work_index],
    "Market_Demand": [market_demand]
})

st.subheader("📥 Data Input")
st.write(input_data)

# ====================
# Prediction
# ====================
if st.button("🔮 Prediksi Job Trend Score"):
    prediction = model.predict(input_data)[0]
    st.subheader("🔮 Hasil Prediksi")
    st.success(f"Job Trend Score diprediksi: **{prediction:.2f}**")

    # ====================
    # Visualisation
    # ====================
    st.subheader("📈 Visualisasi Market Demand vs Automation Level")
    fig, ax = plt.subplots()
    ax.scatter(market_demand, automation_level, color="blue", s=100, label="Input Data")
    ax.set_xlabel("Market Demand")
    ax.set_ylabel("Automation Level")
    ax.legend()
    st.pyplot(fig)
    # Release the figure; otherwise each rerun leaves another open figure in memory.
    plt.close(fig)


2025-09-02 09:00:16.166 
  command:

    streamlit run c:\ProgramData\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-09-02 09:00:16.168 No runtime found, using MemoryCacheStorageManager
2025-09-02 09:00:16.172 No runtime found, using MemoryCacheStorageManager
2025-09-02 09:00:16.176 No runtime found, using MemoryCacheStorageManager
2025-09-02 09:00:16.179 No runtime found, using MemoryCacheStorageManager
2025-09-02 09:00:16.187 No runtime found, using MemoryCacheStorageManager
2025-09-02 09:00:16.540 No runtime found, using MemoryCacheStorageManager


DeltaGenerator()

In [None]:
import os

# Debug aid: report the current working directory and its contents
# (presumably to check that the model file is visible to the app — confirm).
current_dir = os.getcwd()
dir_entries = os.listdir()
st.write("Working Directory:", current_dir)
st.write("Files in folder:", dir_entries)


Defaulting to user installation because normal site-packages is not writeable
Collecting pyzmq
  Using cached pyzmq-27.0.2-cp312-abi3-win_amd64.whl.metadata (6.0 kB)
Using cached pyzmq-27.0.2-cp312-abi3-win_amd64.whl (619 kB)
Installing collected packages: pyzmq
  Attempting uninstall: pyzmq
    Found existing installation: pyzmq 27.0.2
    Uninstalling pyzmq-27.0.2:
      Successfully uninstalled pyzmq-27.0.2
Successfully installed pyzmq-27.0.2


  You can safely remove it manually.
