In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from keras.optimizers.legacy import Adam

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Objective
This study endeavors to construct a predictive model for estimating the salaries of Data Science positions worldwide, denominated in US dollars (USD).
In this project, we explore a dataset stored in a CSV file that describes data science job roles from around the world. The dataset comprises 3,755 records, each detailing various attributes of an employment position. These attributes serve as predictors of the annual salary (in USD) for each job listed.

## Read Data From Source

In [None]:
# Load the salaries CSV into a DataFrame; the path is relative to the current working directory.
data = pd.read_csv('./SalarayDataSet/ds_salaries.csv')

## Data Features
The dataset contains the following features:

1. **work_year**: The year the salary was paid.

2. **experience_level**: The experience level in the job during the year.

3. **employment_type**: The type of employment for the role.

4. **job_title**: The role worked in during the year.

5. **salary**: The total gross salary amount paid.

6. **salary_currency**: The currency of the salary paid as an ISO 4217 currency code.

7. **salary_in_usd**: The salary in USD.

8. **employee_residence**: Employee's primary country of residence during the work year, as an ISO 3166 country code.

9. **remote_ratio**: The overall amount of work done remotely.

10. **company_location**: The country of the employer's main office or contracting branch.

11. **company_size**: The median number of people that worked for the company during the year.

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected.
data.head()

# Exploratory Data Analysis (EDA) - Salary Dataset

## Data Overview
The dataset contains 3755 entries and 11 columns. The summary below lists each column together with its non-null count and data type:

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes).
data.info()

## Verification Of Non-Empty
Before encoding and processing the data for the model, we verify that there are no NaN cells.
As shown below, all fields are filled: the query for rows containing a missing value returns an empty table.

In [None]:
# Select every row that has at least one missing value; an empty result means there are no NaNs.
data[data.isna().any(axis=1)]

## Distribution of Salary (in USD)


In [None]:
sns.set_style("whitegrid")

# One axes is all this chart needs. The previous 1x2 subplot grid on a
# 20-inch-wide canvas drew the histogram in the left half and left the
# right half of the figure blank.
fig, ax = plt.subplots(figsize=(10, 5))

# Histogram of the USD salary with a kernel-density overlay.
sns.histplot(data=data, x='salary_in_usd', bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Salary (in USD)')
ax.set_xlabel('Salary (in USD)')
ax.set_ylabel('Count')

plt.show()

## Mean Salary by Position

In [None]:
# Average USD salary for each job title.
mean_salary_by_position = data.groupby('job_title')['salary_in_usd'].mean()

# Bar chart of those averages, highest-paying titles first.
fig, ax = plt.subplots(figsize=(20, 5))
ranked_means = mean_salary_by_position.sort_values(ascending=False)
ranked_means.plot.bar(ax=ax)
ax.set_title('Mean Salary by Position')
ax.set_ylabel('Mean Salary (in USD)')
ax.set_xlabel('Position')
plt.show()

## Distribution of Experience Level

In [None]:
# Draw on a single full-size axes. The previous code selected cell 1 of a
# 2x2 subplot grid, so the chart filled only a quarter of the figure.
fig, ax = plt.subplots(figsize=(10, 5))

# Order the bars from the most to the least frequent experience level.
level_order = data['experience_level'].value_counts().index
sns.countplot(data=data, x='experience_level', order=level_order, ax=ax)
ax.set_title('Distribution of Experience Level')
ax.set_xlabel('Experience Level')
ax.set_ylabel('Count')
plt.show()

## Distribution of Salary in USD by Work Year

In [None]:
# Scatter of USD salary against the calendar year the salary was paid.
fig, ax = plt.subplots()
sns.scatterplot(data=data, x='work_year', y='salary_in_usd', alpha=0.6, legend=False, ax=ax)

# The x variable is `work_year` (a calendar year), not years of experience,
# so the title and axis label now say so.
ax.set_title('Salary vs Work Year')
ax.set_xlabel('Work Year')
ax.set_ylabel('Salary (in USD)')

# Pin the ticks to the years present in the data to avoid fractional-year ticks.
ax.set_xticks(sorted(data['work_year'].unique()))
plt.show()

# Data Normalizing and Preprocessing
In order to normalize the data for the training, validation and testing of the model, we identify numeric and categorical features in the dataset and apply appropriate preprocessing transformations.

## Categorical features

In [None]:
# Keep only the object-dtype columns, i.e. the candidates for categorical encoding.
objects = data.select_dtypes(include=['object'])
objects.head(5)

## Normalizing Salary Columns
This section normalizes the 'salary' and 'salary_in_usd' columns by dividing their values by 100,000. Normalization is performed to scale the values within a similar range, which can help improve the convergence of machine learning algorithms during training.

In [None]:
cols_to_normalize = ['salary', 'salary_in_usd']
SALARY_SCALE = 100000  # divisor applied to both salary columns

# Scale in place exactly once. The flag lives on the DataFrame itself, so
# re-running this cell does not divide the already-scaled columns again,
# while a freshly reloaded `data` (new object, empty attrs) is scaled anew.
if not data.attrs.get('salary_scaled', False):
    data[cols_to_normalize] = data[cols_to_normalize] / SALARY_SCALE
    data.attrs['salary_scaled'] = True

# A short preview is enough to confirm the new value range.
data[cols_to_normalize].head()

## Preprocessing and Splitting

In this section, we preprocess the dataset and split it into training and validation sets.

### Feature-Target Separation
We separate the feature matrix `data_X` from the target variable `y` in the dataset.

In [None]:
# Target: the USD salary column (already divided by 100,000 in the scaling cell).
y = data['salary_in_usd']

# Features: every other column.
# NOTE(review): 'salary' and 'salary_currency' remain in the feature matrix.
# For salaries paid in USD, 'salary' presumably equals the target, which would
# leak it into the model -- confirm and consider dropping both columns. Doing so
# changes the number of encoded features, so the CNN input shape must follow.
data_X = data.drop(columns=['salary_in_usd'])

### Numeric and Categorical Feature Processing
We identify numeric and categorical features in the dataset and apply appropriate preprocessing transformations.

In [None]:
# Partition the feature columns by dtype.
categorical_features = data_X.select_dtypes(include=['object'])
numeric_features = data_X.select_dtypes(include=[np.number])

# One-hot encode categoricals into a dense array; categories not seen while
# fitting are ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Standardize the numeric columns.
numeric_transformer = StandardScaler()

# Route each column group to its transformer; numeric output comes first.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features.columns),
    ('cat', categorical_transformer, categorical_features.columns),
])

### Dataset Splitting
We split the dataset into training and validation sets using a specified test size and random state.

In [None]:
# Split the row labels 70/30 with a fixed seed, then slice features and target
# with the same labels so the two stay aligned.
indices = data.index
train_indices, valid_indices = train_test_split(
    indices,
    test_size=0.3,
    random_state=42,
)

x_train = data_X.loc[train_indices]
y_train = y.loc[train_indices]

x_valid = data_X.loc[valid_indices]
y_valid = y.loc[valid_indices]


### Applying Transformers 
We apply the preprocessing transformers to the training and validation feature sets.

In [None]:
X_train_preprocessed = preprocessor.fit_transform(x_train)
X_valid_preprocessed = preprocessor.transform(x_valid)

# Convolutional Neural Network (CNN) Modeling

This section outlines the implementation of a Convolutional Neural Network (CNN) model.

## CNN Model Definition
The CNN model is defined using the Keras Sequential API. The model consists of convolutional layers with ReLU activation functions, max-pooling layers for downsampling, and dense layers for regression.

In [None]:
# Take the sequence length from the preprocessed training matrix instead of
# hard-coding 248. The column count depends on how many distinct categories the
# one-hot encoder saw in the training split, so a different split, seed or
# dataset version would otherwise cause a shape mismatch at fit time.
n_features = X_train_preprocessed.shape[1]

cnn_model = Sequential([
    # Each sample is treated as a length-n_features sequence with one channel.
    Conv1D(32, 3, activation='relu', input_shape=(n_features, 1)),
    MaxPooling1D(2),

    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(2),

    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(2),

    # Collapse the (steps, channels) feature map into a vector for the dense head.
    Flatten(),

    Dense(64, activation='relu'),

    # Single linear unit: the regression output (scaled salary).
    Dense(1)
])

## Model Compilation
The CNN model is compiled using the Adam optimizer, mean squared error (MSE) as the loss function, and mean absolute error (MAE) and mean squared error (MSE) as the metrics.

In [None]:
# Adam with a small step size; MSE is the training loss, and MAE / MSE are
# additionally tracked as metrics so they appear in the training history.
adam_optimizer = Adam(learning_rate=1e-4)
cnn_model.compile(
    optimizer=adam_optimizer,
    loss='mean_squared_error',
    metrics=['mae', 'mse'],
)

# Print the layer-by-layer architecture and parameter counts.
cnn_model.summary()

## Model Training
The CNN model is trained on the preprocessed training data. The `fit` method is used to train the model with the specified number of epochs, batch size, and verbosity mode.

In [None]:
# Train for 400 epochs in mini-batches of 32, scoring the validation split
# after every epoch; the returned history feeds the learning-curve plots.
cnn_model_history = cnn_model.fit(
    x=X_train_preprocessed,
    y=y_train,
    validation_data=(X_valid_preprocessed, y_valid),
    batch_size=32,
    epochs=400,
    verbose=2,
)

## Prediction 
After training, the model generates predictions for the validation set. These predictions are evaluated with the MSE and MAE metrics in the Results section below.

In [None]:
# Run the trained network on the preprocessed validation features.
y_pred = cnn_model.predict(X_valid_preprocessed)
# Echo the predictions; they are on the target's scale (salary / 100,000).
# NOTE(review): the model ends in Dense(1), so this is presumably an (n_samples, 1) array -- confirm.
y_pred

# Results

## Model's Prediction Performance
When the data points closely align with the regression line, it indicates that the predicted values are close to the actual values, suggesting that the model's predictions are accurate. Conversely, if the data points are scattered far from the regression line, it suggests a larger discrepancy between the predicted and actual values, indicating poorer performance of the model.

In [None]:
# The network's final layer is Dense(1), so predictions arrive with a trailing
# singleton axis; flatten explicitly rather than relying on seaborn to do it.
y_pred_flat = np.ravel(y_pred)

# Scatter of actual vs predicted values with a fitted regression line in red.
ax = sns.regplot(x=y_valid, y=y_pred_flat, line_kws={"color": "red"})
ax.set_title('Actual vs Predicted Salary')
# Both axes are in the scaled target units (USD divided by 100,000),
# which the labels now state so the tick values can be interpreted.
ax.set_xlabel('Actual Salary (in 100,000 USD)')
ax.set_ylabel('Predicted Salary (in 100,000 USD)')
plt.show()

In [None]:
# Learning curves: MSE on the left, MAE on the right. Each title also reports
# the final validation-set score computed from the predictions.
fig, axs = plt.subplots(1, 2, figsize=(18, 6))

mse = mean_squared_error(y_valid, y_pred)
axs[0].plot(cnn_model_history.history['mse'], label='Training MSE')
axs[0].plot(cnn_model_history.history['val_mse'], label='Validation MSE')
axs[0].set_title(f'Mean Squared Error (MSE) - {mse:.4f}')
axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('MSE')
axs[0].grid(True)
axs[0].legend()  # previously called twice; once is enough

mae = mean_absolute_error(y_valid, y_pred)
axs[1].plot(cnn_model_history.history['mae'], label='Training MAE')
axs[1].plot(cnn_model_history.history['val_mae'], label='Validation MAE')
axs[1].set_title(f'Mean Absolute Error (MAE) - {mae:.4f}')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('MAE')  # was 'Loss', but this panel plots the MAE metric
axs[1].grid(True)
axs[1].legend()

# Let matplotlib fit the axes and their labels inside the canvas. The previous
# subplots_adjust(top=0.4, bottom=0.0) squeezed both panels into the bottom 40%
# of the figure and left no room for the x-axis labels.
fig.tight_layout()
plt.show()