In [None]:
# import
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder

# **--------------------------PREPARE THE DATA--------------------------**

1. **Load df**

In [None]:
import io
df_admin = pd.read_excel('/content/Cleaned_NZ_Admin_JOBS.xlsx')
df_banking = pd.read_excel('/content/Cleaned_NZ_Banking_JOBS.xlsx')
df_ceo = pd.read_excel('/content/Cleaned_NZ_CEO_JOBS.xlsx')
# Dataset is now stored in a Pandas Dataframe

2. **Add domain for regression**

In [None]:
df_admin['Domain'] = 'Admin'
df_banking['Domain'] = 'Banking'
df_ceo['Domain'] = 'CEO'

In [None]:
# Combine three dataframes
df_all = pd.concat([df_admin, df_banking, df_ceo], ignore_index=True, sort=False)
df_all = df_all[["Job", "Company", "Region", "City", "Lowest Salary", "Higest Salary", "Posted Date (Days Ago)", "Domain"]]
df_all = df_all.dropna()
df_all.head(5)

Unnamed: 0,Job,Company,Region,City,Lowest Salary,Higest Salary,Posted Date (Days Ago),Domain
0,Administrator,Private Advertiser,Bay of Plenty,Tauranga,44469,49720,0,Admin
1,Receptionist,Avenues Orthodontics,Bay of Plenty,Tauranga,45928,46805,0,Admin
2,Prosecutions Support Officer,New Zealand Police,Auckland,NO DATA,38776,44341,4,Admin
3,Early Childhood Centre Administrator,Kew Pacific Island Early Learning Centre,Southland,Invercargill,54903,54961,0,Admin
4,Business Support Administrator,Private Advertiser,Canterbury,Christchurch,50095,50788,4,Admin


3. **Set y_df to lower_salary (regression goal)**

In [None]:
y_df = df_all['Lowest Salary']
y_df_encoded = LabelEncoder().fit_transform(y_df.values)

4. **Encode x_label** 

In [None]:
x_df = df_all.apply(LabelEncoder().fit_transform)
x_df.head(5)

Unnamed: 0,Job,Company,Region,City,Lowest Salary,Higest Salary,Posted Date (Days Ago),Domain
0,49,411,1,34,1169,984,0,0
1,647,47,1,34,1368,584,0,0
2,635,361,0,10,492,359,4,0
3,304,282,9,8,2435,2304,0,0
4,132,411,2,2,1887,1161,4,0


5. **Perform train-test split**

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=0)

# **--------------------------EVALUATION FUNCTIONS---------------------**

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model):
    pred = cross_val_score(model, x_df, y_df, cv=5, scoring='r2')
    return pred

def print_evaluate(true, predicted):  
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square

# **-----------------------------REGRESSION WITH XGBOOST-----------------------------**

### **Define a XGBOOST regressor**

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

xgb_model = xgb.XGBRegressor()

params = {
    "learning_rate": uniform(0.01, 0.3), 
    "max_depth": randint(2, 6),
    "n_estimators": randint(100, 1000), 
    "subsample": uniform(0.6, 1)
}

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=12, n_iter=200, cv=3, verbose=1, n_jobs=10, return_train_score=False)

search.fit(x_df, y_df_encoded)

search.cv_results_

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   11.8s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  1.7min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  3.6min
[Parallel(n_jobs=10)]: Done 600 out of 600 | elapsed:  5.4min finished




{'mean_fit_time': array([8.55018028e+00, 8.74795914e-02, 1.02939129e-01, 4.97353872e-02,
        8.16282423e+00, 2.90916761e-02, 2.45021184e-02, 1.55784276e+01,
        1.81858540e-02, 2.83873081e-02, 1.01936402e+01, 2.56142616e-02,
        2.78911591e-02, 2.78515021e-02, 1.84773604e-02, 2.34005020e+01,
        4.15508223e+00, 1.46121874e+01, 9.56305170e+00, 1.49386447e+01,
        1.21495393e+01, 2.22650369e-02, 1.46865845e-02, 1.26350859e+01,
        2.85478433e-02, 1.97610855e-02, 1.05671899e+01, 2.83073584e-02,
        6.61967580e+00, 2.65688896e-02, 2.73825328e-02, 3.08005810e-02,
        2.72929668e-02, 2.78191566e-02, 1.55101293e+01, 1.75764403e+01,
        2.70187855e-02, 2.79378096e-02, 6.30962809e+00, 1.95638084e+01,
        2.65362556e+01, 6.33363040e+00, 2.70428658e-02, 7.52412478e+00,
        1.29882324e+01, 8.55886102e+00, 2.29095052e+01, 2.67393589e-02,
        2.72049904e-02, 2.48877207e-02, 2.58673032e-02, 2.42065589e-02,
        2.46316592e-02, 2.71044572e-02, 2.73285

**Print the top 5 results**

In [None]:
import numpy as np
results = search.cv_results_
for i in range(1, 5):
  candidates = np.flatnonzero(results['rank_test_score'] == i)
  for candidate in candidates:
      print("Model with rank: {0}".format(i))
      print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            results['mean_test_score'][candidate],
            results['std_test_score'][candidate]))
      print("Parameters: {0}".format(results['params'][candidate]))

Model with rank: 1
Mean validation score: 0.987 (std: 0.014)
Parameters: {'learning_rate': 0.0989510082720107, 'max_depth': 4, 'n_estimators': 333, 'subsample': 0.6471626631365215}
Model with rank: 2
Mean validation score: 0.987 (std: 0.014)
Parameters: {'learning_rate': 0.30447626640809106, 'max_depth': 4, 'n_estimators': 718, 'subsample': 0.626287019886786}
Model with rank: 3
Mean validation score: 0.987 (std: 0.014)
Parameters: {'learning_rate': 0.16697382123348756, 'max_depth': 4, 'n_estimators': 586, 'subsample': 0.6482187498255704}
Model with rank: 4
Mean validation score: 0.987 (std: 0.014)
Parameters: {'learning_rate': 0.2484051354086793, 'max_depth': 4, 'n_estimators': 538, 'subsample': 0.6515952373879789}


### **Print the best params**

In [None]:
params_best = search.best_params_
params_best

{'learning_rate': 0.0989510082720107,
 'max_depth': 4,
 'n_estimators': 333,
 'subsample': 0.6471626631365215}

### **Load the best params and print the results**

In [None]:
params = {'learning_rate': 0.0989510082720107,
          'max_depth': 4,
          'n_estimators': 333,
          'subsample': 0.6471626631365215}

xgb_model = xgb.XGBRegressor(**params)
xgb_model.fit(X_train, y_train)

text_pred = xgb_model.predict(X_test)
train_pred = xgb_model.predict(X_train)


print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, text_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 180.9494020464025
MSE: 290654.3473780277
RMSE: 539.1236846754441
R2 Square 0.9999151164550641
Train set evaluation:
_____________________________________
MAE: 146.89486534387223
MSE: 46529.69595010651
RMSE: 215.70743137431893
R2 Square 0.9999864986342818
