# Predikcija plata u ekonomski razvijenim državama

**Autori:** Katarina Perović E2 131/2024, Milica Petrović E2 124/2024, Ana Radovanović E2 158/2024

**Predmet:** SIAP 

## 1. Uvoz biblioteka

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys  
!{sys.executable} -m pip install seaborn

import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from IPython.display import display
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from skopt import BayesSearchCV

## 2. Učitavanje, osnovna analiza i vizualizacija podataka

In [None]:
# Load the salary dataset and take a first look at it.
df = pd.read_csv("Salary.csv")

# A notebook cell only renders its last bare expression, so the preview has to
# go through display() to show up in the output at all.
display(df.head())

# info() prints its report itself, no display() needed.
df.info()

# Summary statistics of the numeric columns (rendered as the cell result).
df[["Age", "Years of Experience", "Salary"]].describe()


In [None]:
# Collapse the spelling variants of the education labels so that every level
# is stored under a single name.
education_aliases = {
    "Bachelor's Degree": "Bachelor's",
    "Master's Degree": "Master's",
    "phD": "PhD",
}
df["Education Level"] = df["Education Level"].replace(education_aliases)

# The list order defines the category order; the plots below reuse it for
# their x-axis ordering.
edu_order = ["High School", "Bachelor's", "Master's", "PhD"]

# Store the column as an ordered categorical restricted to these four levels.
df["Education Level"] = pd.Categorical(
    df["Education Level"], categories=edu_order, ordered=True
)

# Histograms of the numeric columns, one subplot per column.
num_cols = ["Age", "Years of Experience", "Salary"]
hist_options = {"bins": 30, "figsize": (12, 6), "edgecolor": "black"}
df[num_cols].hist(**hist_options)
plt.suptitle("Distribucije numeričkih varijabli")
plt.show()


# Pie chart of the gender distribution; every slice is labelled with its
# percentage and its absolute count.
gender_counts = df["Gender"].value_counts()

color_map = {
    "Male": "#A9D6E5",
    "Female": "#FFADAD",
    "Other": "#527539"
}

# Genders that are missing from the map fall back to light grey.
colors = [color_map.get(g, "#D3D3D3") for g in gender_counts.index]

# Pull the largest slice slightly out of the pie. idxmax() is evaluated once
# instead of once per slice.
largest_gender = gender_counts.idxmax()
explode = [0.1 if g == largest_gender else 0 for g in gender_counts.index]

# autopct only receives the slice percentage, so the count has to be rebuilt
# from it. The percentage is a float and the product can land just below the
# true integer (e.g. 2999.9998); rounding instead of truncating keeps the
# displayed count correct.
total_count = int(gender_counts.sum())

plt.figure(figsize=(6, 6))
plt.pie(
    gender_counts,
    labels=gender_counts.index,
    colors=colors,
    autopct=lambda p: f'{p:.1f}%  ({int(round(p * total_count / 100))})',
    explode=explode
)
plt.axis("equal")
plt.title("Raspodela pola u podacima")
plt.show()

# Three bar charts in one row: head count per education level, mean salary per
# education level and mean salary per gender.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))


def _label_bars(ax, fmt):
    """Write the height of every bar of ``ax``, rendered by ``fmt``, above the bar."""
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(
            fmt(height),
            (bar.get_x() + bar.get_width() / 2., height),
            ha='center', va='bottom'
        )


# Left panel: number of rows per education level.
sns.countplot(
    data=df, x="Education Level", hue="Education Level",
    order=edu_order, legend=False,
    palette="muted", ax=axes[0]
)
axes[0].set(
    title="Distribucija obrazovanja",
    xlabel="Education Level",
    ylabel="Broj ljudi",
)
_label_bars(axes[0], lambda height: f'{int(height)}')

# Middle panel: mean salary per education level.
sns.barplot(
    data=df, x="Education Level", y="Salary", hue="Education Level",
    estimator="mean", order=edu_order, legend=False,
    palette="muted", ax=axes[1]
)
axes[1].set(
    title="Prosečna plata po obrazovanju",
    xlabel="Education Level",
    ylabel="Prosečna plata",
)
_label_bars(axes[1], lambda height: f'{height:.2f}')

# Right panel: mean salary per gender.
sns.barplot(
    data=df, x="Gender", y="Salary", hue="Gender",
    estimator="mean", legend=False,
    palette="muted", ax=axes[2]
)
axes[2].set(
    title="Prosečna plata po polu",
    xlabel="Pol",
    ylabel="Prosečna plata",
)
_label_bars(axes[2], lambda height: f'{height:.2f}')

plt.tight_layout()
plt.show()

# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
plt.figure(figsize=(6, 4))
numeric_part = df[["Age", "Years of Experience", "Salary"]]
corr = numeric_part.corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Korelaciona matrica")
plt.show()

# Salary box plots, grouped by education level (left) and by gender (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (grouping column, explicit category order or None for seaborn's default, title)
box_specs = [
    ("Education Level", edu_order, "Raspon plata po obrazovanju"),
    ("Gender", None, "Raspon plata po polu"),
]

for ax, (group_col, group_order, title) in zip(axes, box_specs):
    sns.boxplot(
        data=df, x=group_col, y="Salary", hue=group_col,
        order=group_order, legend=False,
        palette="muted", ax=ax
    )
    ax.set_title(title)

plt.tight_layout()
plt.show()



## 3. Pretprocesiranje podataka

In [None]:
# Drop incomplete rows, the "Other" gender group and the Race/Country columns
# (the latter only if they are present).
df = (
    df.dropna()
      .loc[lambda frame: frame["Gender"] != "Other"]
      .drop(columns=["Race", "Country"], errors="ignore")
)

# Keep only salaries between the 1st and the 99th percentile (bounds included).
# NOTE(review): the percentiles are computed on the full data set, before the
# train/test split below.
print("Broj redova pre izbacivanja outliera za platu:", len(df))
lower, upper = df["Salary"].quantile([0.01, 0.99])
df = df[df["Salary"].between(lower, upper)]
print("Broj redova nakon izbacivanja outliera:", len(df))


# Keep only job titles that occur at least 50 times.
print("Broj redova pre izbacivanja retkih poslova:", len(df))
print("Ukupno različitih poslova pre:", df["Job Title"].nunique())

job_counts = df["Job Title"].value_counts()
frequent_jobs = job_counts.loc[job_counts.ge(50)].index
df = df[df["Job Title"].isin(frequent_jobs)]

print("Broj redova nakon izbacivanja retkih poslova:", len(df))
print("Ukupno različitih poslova posle:", df["Job Title"].nunique())


# The target is the salary; every other remaining column is a feature.
y = df["Salary"]
X = df.drop(columns="Salary")

# Hold out 20% of the rows as the test set, then 10% of the remainder as the
# validation set. Both splits use the same fixed seed.
split_seed = {"random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, **split_seed)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, **split_seed)

# log1p-transformed copies of the target for each of the three splits.
y_train_log, y_valid_log, y_test_log = (
    np.log1p(part) for part in (y_train, y_valid, y_test)
)


# Column groups, one per encoding strategy.
gender_col = ["Gender"]
edu_col = ["Education Level"]
job_col = ["Job Title"]

# OrdinalEncoder expects one category list per encoded column, hence the
# nested list for the single education column.
edu_order = [["High School", "Bachelor's", "Master's", "PhD"]]

# Both one-hot encoders share the same settings.
one_hot_options = dict(drop='first', sparse_output=False, handle_unknown='ignore')

encoders = [
    ('edu', OrdinalEncoder(categories=edu_order), edu_col),
    ('gender', OneHotEncoder(**one_hot_options), gender_col),
    ('job', OneHotEncoder(**one_hot_options), job_col),
]

# Columns that are not listed in any encoder are passed through unchanged.
preprocessor = ColumnTransformer(transformers=encoders, remainder='passthrough')

# Encode first, then standardise every resulting column.
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('scaler', StandardScaler()),
])

# Fit on the training split only; the validation and test splits are merely
# transformed with the already fitted pipeline.
X_train_scaled = pipeline.fit_transform(X_train)
X_valid_scaled, X_test_scaled = (
    pipeline.transform(part) for part in (X_valid, X_test)
)


## 4. KNN

In [None]:
# KNN: tune a k-nearest-neighbours regressor with a 5-fold cross-validated
# grid search on the training split, then report RMSE, R², MAE and MAPE on
# the validation and test splits.

param_grid = {
    'n_neighbors': [5, 10, 15, 20, 30],
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3]
}
knn = KNeighborsRegressor()
grid = GridSearchCV(knn, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train_scaled, y_train_log)

print("Najbolji parametri:", grid.best_params_)

best_knn = grid.best_estimator_


# The model was fitted on log1p(salary); expm1 maps its predictions back to
# the original salary scale before any metric is computed.
y_pred_valid_log = best_knn.predict(X_valid_scaled)
y_pred_valid = np.expm1(y_pred_valid_log)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
valid_r2 = r2_score(y_valid, y_pred_valid)

valid_mae = mean_absolute_error(y_valid, y_pred_valid)
valid_mape = mean_absolute_percentage_error(y_valid, y_pred_valid)
print(f"RMSE na validacionom skupu: {valid_rmse:.2f}")
print(f"R² na validacionom skupu: {valid_r2:.4f}")
print(f"MAE na validacionom skupu: {valid_mae:.2f}")
print(f"MAPE na validacionom skupu: {valid_mape:.2%}")


# Same evaluation on the test split.
y_pred_test_log = best_knn.predict(X_test_scaled)
y_pred_test = np.expm1(y_pred_test_log)

test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)

test_mae = mean_absolute_error(y_test, y_pred_test)
test_mape = mean_absolute_percentage_error(y_test, y_pred_test)
print(f"RMSE na test skupu: {test_rmse:.2f}")
print(f"R² na test skupu: {test_r2:.4f}")
print(f"MAE na test skupu: {test_mae:.2f}")
print(f"MAPE na test skupu: {test_mape:.2%}")



## 5. Decision Tree

In [None]:
# Decision Tree: tune a regression tree with a 5-fold cross-validated grid
# search on the training split, then report RMSE, R², MAE and MAPE on the
# validation and test splits.

dt = DecisionTreeRegressor(random_state=42)

param_grid_dt = {
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': [None, 'sqrt', 'log2']
}

grid_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring='r2', n_jobs=-1)
grid_dt.fit(X_train_scaled, y_train_log)

print("\nNajbolji parametri Decision Tree:", grid_dt.best_params_)
best_dt = grid_dt.best_estimator_

# The model was fitted on log1p(salary); expm1 maps its predictions back to
# the original salary scale before any metric is computed.
y_pred_valid_log = best_dt.predict(X_valid_scaled)
y_pred_valid = np.expm1(y_pred_valid_log)
valid_mae = mean_absolute_error(y_valid, y_pred_valid)
valid_mape = mean_absolute_percentage_error(y_valid, y_pred_valid)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
valid_r2 = r2_score(y_valid, y_pred_valid)
print(f"DT RMSE na validacionom skupu: {valid_rmse:.2f}")
print(f"DT R² na validacionom skupu: {valid_r2:.4f}")
print(f"MAE na validacionom skupu: {valid_mae:.2f}")
print(f"MAPE na validacionom skupu: {valid_mape:.2%}")

# Same evaluation on the test split.
y_pred_test_log = best_dt.predict(X_test_scaled)
y_pred_test = np.expm1(y_pred_test_log)
test_mae = mean_absolute_error(y_test, y_pred_test)
test_mape = mean_absolute_percentage_error(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)
print(f"DT RMSE na test skupu: {test_rmse:.2f}")
print(f"DT R² na test skupu: {test_r2:.4f}")
print(f"MAE na test skupu: {test_mae:.2f}")
print(f"MAPE na test skupu: {test_mape:.2%}")


## 6. Random forest

In [None]:

# Random Forest: tune a random-forest regressor with a 5-fold cross-validated
# grid search on the training split, then report RMSE, R², MAE and MAPE on
# the validation and test splits.

rf = RandomForestRegressor(random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='r2', n_jobs=-1)
grid_rf.fit(X_train_scaled, y_train_log)

print("Najbolji parametri RF:", grid_rf.best_params_)

best_rf = grid_rf.best_estimator_


# The model was fitted on log1p(salary); expm1 maps its predictions back to
# the original salary scale before any metric is computed.
y_pred_valid_log = best_rf.predict(X_valid_scaled)
y_pred_valid = np.expm1(y_pred_valid_log)

valid_mae = mean_absolute_error(y_valid, y_pred_valid)
valid_mape = mean_absolute_percentage_error(y_valid, y_pred_valid)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
valid_r2 = r2_score(y_valid, y_pred_valid)
print(f"RF RMSE na validacionom skupu: {valid_rmse:.2f}")
print(f"RF R² na validacionom skupu: {valid_r2:.4f}")
print(f"MAE na validacionom skupu: {valid_mae:.2f}")
print(f"MAPE na validacionom skupu: {valid_mape:.2%}")

# Same evaluation on the test split.
y_pred_test_log = best_rf.predict(X_test_scaled)
y_pred_test = np.expm1(y_pred_test_log)

test_mae = mean_absolute_error(y_test, y_pred_test)
test_mape = mean_absolute_percentage_error(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)
print(f"RF RMSE na test skupu: {test_rmse:.2f}")
print(f"RF R² na test skupu: {test_r2:.4f}")
print(f"MAE na test skupu: {test_mae:.2f}")
print(f"MAPE na test skupu: {test_mape:.2%}")


## 7. XGBoost

In [None]:
# XGBoost: tune a gradient-boosted regressor with a Bayesian hyper-parameter
# search (50 iterations, 5-fold CV) on the training split, then report RMSE,
# R², MAE and MAPE on the validation and test splits.


xgb = XGBRegressor(random_state=42, objective='reg:squarederror')


# Each entry is a (low, high[, prior]) range handed to BayesSearchCV.
search_spaces = {
    'n_estimators': (300, 1000),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.2, 'log-uniform'),
    'subsample': (0.6, 1.0, 'uniform'),
    'colsample_bytree': (0.6, 1.0, 'uniform'),
    'min_child_weight': (1, 10)
}

bayes_search = BayesSearchCV(
    estimator=xgb,
    search_spaces=search_spaces,
    n_iter=50,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
    verbose=1
)


bayes_search.fit(X_train_scaled, y_train_log)

print("Najbolji parametri XGB (Bayes):", bayes_search.best_params_)

best_xgb = bayes_search.best_estimator_

# The model was fitted on log1p(salary); expm1 maps its predictions back to
# the original salary scale before any metric is computed.
y_pred_valid_log = best_xgb.predict(X_valid_scaled)
y_pred_valid = np.expm1(y_pred_valid_log)

valid_mae = mean_absolute_error(y_valid, y_pred_valid)
valid_mape = mean_absolute_percentage_error(y_valid, y_pred_valid)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
valid_r2 = r2_score(y_valid, y_pred_valid)
print(f"XGB RMSE na validacionom skupu: {valid_rmse:.2f}")
print(f"XGB R² na validacionom skupu: {valid_r2:.4f}")
print(f"MAE na validacionom skupu: {valid_mae:.2f}")
print(f"MAPE na validacionom skupu: {valid_mape:.2%}")

# Same evaluation on the test split.
y_pred_test_log = best_xgb.predict(X_test_scaled)
y_pred_test = np.expm1(y_pred_test_log)

test_mae = mean_absolute_error(y_test, y_pred_test)
test_mape = mean_absolute_percentage_error(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)
print(f"XGB RMSE na test skupu: {test_rmse:.2f}")
print(f"XGB R² na test skupu: {test_r2:.4f}")
print(f"MAE na test skupu: {test_mae:.2f}")
print(f"MAPE na test skupu: {test_mape:.2%}")


## 8. Linearna regresija

In [None]:
# Linear regression: fit an ordinary least-squares model on the training
# split (no hyper-parameters to tune), then report RMSE, R², MAE and MAPE on
# the validation and test splits.

lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train_log)

# The model was fitted on log1p(salary); expm1 maps its predictions back to
# the original salary scale before any metric is computed.
y_pred_valid_log = lin_reg.predict(X_valid_scaled)
y_pred_valid = np.expm1(y_pred_valid_log)

valid_mae = mean_absolute_error(y_valid, y_pred_valid)
valid_mape = mean_absolute_percentage_error(y_valid, y_pred_valid)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
valid_r2 = r2_score(y_valid, y_pred_valid)
print("\n--- Linearna regresija ---")
print(f"RMSE na validacionom skupu: {valid_rmse:.2f}")
print(f"R² na validacionom skupu: {valid_r2:.4f}")
print(f"MAE na validacionom skupu: {valid_mae:.2f}")
print(f"MAPE na validacionom skupu: {valid_mape:.2%}")

# Same evaluation on the test split.
y_pred_test_log = lin_reg.predict(X_test_scaled)
y_pred_test = np.expm1(y_pred_test_log)

test_mae = mean_absolute_error(y_test, y_pred_test)
test_mape = mean_absolute_percentage_error(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)
print(f"RMSE na test skupu: {test_rmse:.2f}")
print(f"R² na test skupu: {test_r2:.4f}")
print(f"MAE na test skupu: {test_mae:.2f}")
print(f"MAPE na test skupu: {test_mape:.2%}")



# Ridge: tune the regularisation strength alpha with a 5-fold cross-validated
# grid search on the training split, then report RMSE, R², MAE and MAPE on
# the validation and test splits.
ridge = Ridge(random_state=42)

param_grid_ridge = {
    'alpha': [0.01, 0.1, 1, 10, 100, 500, 1000]
}

grid_ridge = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='r2', n_jobs=-1)
grid_ridge.fit(X_train_scaled, y_train_log)

print("Najbolji parametri Ridge:", grid_ridge.best_params_)
best_ridge = grid_ridge.best_estimator_

# The model was fitted on log1p(salary); expm1 maps its predictions back to
# the original salary scale before any metric is computed.
y_pred_valid_log = best_ridge.predict(X_valid_scaled)
y_pred_valid = np.expm1(y_pred_valid_log)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
valid_r2 = r2_score(y_valid, y_pred_valid)
# MAE and MAPE are reported as well, matching the other model cells.
valid_mae = mean_absolute_error(y_valid, y_pred_valid)
valid_mape = mean_absolute_percentage_error(y_valid, y_pred_valid)
print(f"Ridge RMSE na validacionom skupu: {valid_rmse:.2f}")
print(f"Ridge R² na validacionom skupu: {valid_r2:.4f}")
print(f"Ridge MAE na validacionom skupu: {valid_mae:.2f}")
print(f"Ridge MAPE na validacionom skupu: {valid_mape:.2%}")

# Same evaluation on the test split.
y_pred_test_log = best_ridge.predict(X_test_scaled)
y_pred_test = np.expm1(y_pred_test_log)

test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)
test_mae = mean_absolute_error(y_test, y_pred_test)
test_mape = mean_absolute_percentage_error(y_test, y_pred_test)
print(f"Ridge RMSE na test skupu: {test_rmse:.2f}")
print(f"Ridge R² na test skupu: {test_r2:.4f}")
print(f"Ridge MAE na test skupu: {test_mae:.2f}")
print(f"Ridge MAPE na test skupu: {test_mape:.2%}")


# Lasso: tune the regularisation strength alpha with a 5-fold cross-validated
# grid search on the training split, then report RMSE, R², MAE and MAPE on
# the validation and test splits.
lasso = Lasso(random_state=42, max_iter=10000)

param_grid_lasso = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
}

grid_lasso = GridSearchCV(lasso, param_grid_lasso, cv=5, scoring='r2', n_jobs=-1)
grid_lasso.fit(X_train_scaled, y_train_log)

print("\nNajbolji parametri Lasso:", grid_lasso.best_params_)
best_lasso = grid_lasso.best_estimator_

# The model was fitted on log1p(salary); expm1 maps its predictions back to
# the original salary scale before any metric is computed.
y_pred_valid_log = best_lasso.predict(X_valid_scaled)
y_pred_valid = np.expm1(y_pred_valid_log)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
valid_r2 = r2_score(y_valid, y_pred_valid)
# MAE and MAPE are reported as well, matching the other model cells.
valid_mae = mean_absolute_error(y_valid, y_pred_valid)
valid_mape = mean_absolute_percentage_error(y_valid, y_pred_valid)
print(f"Lasso RMSE na validacionom skupu: {valid_rmse:.2f}")
print(f"Lasso R² na validacionom skupu: {valid_r2:.4f}")
print(f"Lasso MAE na validacionom skupu: {valid_mae:.2f}")
print(f"Lasso MAPE na validacionom skupu: {valid_mape:.2%}")

# Same evaluation on the test split.
y_pred_test_log = best_lasso.predict(X_test_scaled)
y_pred_test = np.expm1(y_pred_test_log)

test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)
test_mae = mean_absolute_error(y_test, y_pred_test)
test_mape = mean_absolute_percentage_error(y_test, y_pred_test)
print(f"Lasso RMSE na test skupu: {test_rmse:.2f}")
print(f"Lasso R² na test skupu: {test_r2:.4f}")
print(f"Lasso MAE na test skupu: {test_mae:.2f}")
print(f"Lasso MAPE na test skupu: {test_mape:.2%}")

