# Predikcija plata u ekonomski razvijenim državama

**Autori:** Katarina Perović E2 131/2024, Milica Petrović E2 124/2024, Ana Radovanović E2 158/2024

**Predmet:** SIAP 

## 1. Uvoz biblioteka

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys  
from IPython.display import display
!{sys.executable} -m pip install seaborn

import seaborn as sns


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

## 2. Učitavanje, osnovna analiza i vizualizacija podataka

In [None]:
# Load the salary dataset and show a quick overview of its contents.
df = pd.read_csv("Salary.csv")

# A notebook cell renders only the value of its last expression, so a bare
# `df.head()` in the middle of the cell is silently discarded. Passing each
# table to display() makes every one of them show up.
display(df.head())
df.info()  # info() prints its summary itself
display(df.describe())

In [None]:
# Collapse the alternative spellings of the education labels onto one
# canonical spelling each, then store the column as an ordered categorical.
edu_aliases = {
    "Bachelor's Degree": "Bachelor's",
    "Master's Degree": "Master's",
    "phD": "PhD",
}

# Ordering used by the categorical dtype and by the plots that follow.
edu_order = ["High School", "Bachelor's", "Master's", "PhD"]

normalised_education = df["Education Level"].replace(edu_aliases)
df["Education Level"] = pd.Categorical(
    normalised_education,
    categories=edu_order,
    ordered=True,
)

# Histograms of the three numeric columns, drawn as one figure.
num_cols = ["Age", "Years of Experience", "Salary"]

hist_options = {"bins": 30, "figsize": (12, 6), "edgecolor": "black"}
numeric_part = df[num_cols]
numeric_part.hist(**hist_options)

plt.suptitle("Distribucije numeričkih varijabli")
plt.show()

# Three side-by-side charts: head-count per education level, mean salary per
# education level, and mean salary per gender.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
ax_count, ax_edu_mean, ax_gender_mean = axes

# Options common to every chart. Each call sets hue to the same column as x,
# and legend=False suppresses the legend that hue would otherwise add.
shared_style = {"data": df, "palette": "muted", "legend": False}
mean_style = {**shared_style, "y": "Salary", "estimator": "mean"}

sns.countplot(
    x="Education Level", hue="Education Level",
    order=edu_order, ax=ax_count, **shared_style
)
ax_count.set(
    title="Distribucija obrazovanja",
    xlabel="Education Level",
    ylabel="Broj ljudi",
)

sns.barplot(
    x="Education Level", hue="Education Level",
    order=edu_order, ax=ax_edu_mean, **mean_style
)
ax_edu_mean.set(
    title="Prosečna plata po obrazovanju",
    xlabel="Education Level",
    ylabel="Prosečna plata",
)

sns.barplot(
    x="Gender", hue="Gender",
    ax=ax_gender_mean, **mean_style
)
ax_gender_mean.set(
    title="Prosečna plata po polu",
    xlabel="Pol",
    ylabel="Prosečna plata",
)

plt.tight_layout()
plt.show()

# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
corr_columns = ["Age", "Years of Experience", "Salary"]
corr = df[corr_columns].corr()

plt.figure(figsize=(6, 4))
heatmap_options = {"annot": True, "cmap": "coolwarm", "fmt": ".2f"}
sns.heatmap(corr, **heatmap_options)
plt.title("Korelaciona matrica")
plt.show()

# Salary spread per education level (left) and per gender (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: grouping column, panel title, and any extra boxplot
# arguments (only the education panel fixes the category order).
box_specs = [
    ("Education Level", "Raspon plata po obrazovanju", {"order": edu_order}),
    ("Gender", "Raspon plata po polu", {}),
]

for panel, (group_col, panel_title, extra_args) in zip(axes, box_specs):
    sns.boxplot(
        x=group_col, y="Salary", hue=group_col,
        data=df, legend=False, palette="muted",
        ax=panel, **extra_args
    )
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()


## 3. Pretprocesiranje podataka

In [None]:

# Preprocessing: clean the frame, encode the categorical features, split the
# data into train/validation/test parts and standardise the feature matrices.

# Discard the unused columns BEFORE dropna(): otherwise a missing value in a
# column that is thrown away anyway would still remove an otherwise complete
# row. errors="ignore" keeps the cell re-runnable once the columns are gone.
df = df.drop(columns=["Race", "Country"], errors="ignore")
df = df.dropna()

# Keep every row whose gender is not "Other".
df = df[df["Gender"] != "Other"]

# Gender -> integer class labels; the fitted encoder is kept in le_gender.
le_gender = LabelEncoder()
df["Gender"] = le_gender.fit_transform(df["Gender"])

# Education -> position of the label in edu_order (0 for "High School" up to
# 3 for "PhD").
# NOTE(review): any non-null label that is not listed in edu_order ends up
# with code -1 here, so the labels are presumably expected to be normalised
# before this cell runs - confirm.
edu_order = ["High School", "Bachelor's", "Master's", "PhD"]
df["Education Level"] = pd.Categorical(df["Education Level"], categories=edu_order, ordered=True)
df["Education Level"] = df["Education Level"].cat.codes

# One-hot encode the job title; drop_first=True omits one indicator column.
df = pd.get_dummies(df, columns=["Job Title"], drop_first=True)

# Features and target.
X = df.drop(columns=["Salary"])
y = df["Salary"]

# Hold out 20% for testing, then take 10% of the remaining training part for
# validation (roughly 72% / 8% / 20% of the rows overall). The fixed
# random_state makes both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# The scaler is fitted on the training part only; the validation and test
# parts are transformed with the statistics learned from the training part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

