## 1. Import Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from category_encoders import CountEncoder
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,  roc_auc_score

import os
import joblib

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

random_state = 4012

## 2. Load Data

In [3]:
X_train = pd.read_csv("../Processed Data Files/X_train.csv")
y_train = pd.read_csv("../Processed Data Files/y_train.csv")

## 3. Model

In [8]:
# Combine them!

scaler = StandardScaler()

onehot = OneHotEncoder(
    categories = [
            ['full time', 'contract', 'part time','flexi','other','unspecified'], # employment_type
            ['entry level', 'middle level', 'senior level', 'unspecified'], # required_experience
            ['high school or vocational degree', 'undergraduate', 'graduate', 'unspecified'], # required_education
    ],
    handle_unknown = 'ignore',  # <- Ignore unknown values (i.e. don't create a column for them)
)

freq_encoder = CountEncoder()

binary_columns = ['telecommuting', 'has_company_logo','has_questions', 'have_company_profile', 'have_requirements', 'have_benefits', 'have_category', 'high_salary']
numerical_columns = ['flesch_score_bin_ft','fk_grade_level_bin_ft', 'text_len']
onehot_columns = ['employment_type', 'required_experience','required_education']
freq_columns = ['location_country']

preprocessor = ColumnTransformer([
    # (nickname, transformer to apply, columns to apply to)
    ('binary', 'passthrough', binary_columns),  # <- 'passthrough' says to keep them but don't apply anything
    ('numerical', scaler, numerical_columns),   
    ('onehot', onehot, onehot_columns),
    ('frequency',freq_encoder, freq_columns),
],  remainder='passthrough')

model = Pipeline(steps=[
    ('preprocess', preprocessor),
    # ('resampling', SMOTEENN(random_state=random_state,enn=EditedNearestNeighbours(sampling_strategy='majority'))),
    ('train', CatBoostClassifier(random_state=random_state)),
])

params = dict(
    train__n_estimators = [100, 200, 300],
    train__learning_rate = [0.1, 0.3, 0.5],
    train__max_depth = [4, 6, 8],
    train__l2_leaf_reg = [1, 5, 10],
    train__border_count = [32, 64, 128],
    train__loss_function = ['Logloss', 'CrossEntropy'],
    train__auto_class_weights = ['Balanced']
)


def train(x_train, y_train, model, params):
    
    gridsearchcv = GridSearchCV(model, params, cv=3, scoring='f1_weighted', verbose=True, n_jobs=-1)
    #gridsearchcv = RandomizedSearchCV(model, params, cv=5, scoring='f1_weighted', verbose=3, n_jobs=-1)
    gridsearchcv.fit(x_train, y_train.values)
    best_model = model.set_params(**gridsearchcv.best_params_).fit(x_train, y_train.values)
    
    return best_model


best_model = train(X_train, y_train, model, params)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
0:	learn: 0.6208371	total: 189ms	remaining: 56.5s
1:	learn: 0.5586558	total: 228ms	remaining: 33.9s
2:	learn: 0.5116417	total: 265ms	remaining: 26.2s
3:	learn: 0.4695343	total: 302ms	remaining: 22.3s
4:	learn: 0.4310500	total: 340ms	remaining: 20.1s
5:	learn: 0.4024712	total: 382ms	remaining: 18.7s
6:	learn: 0.3822851	total: 418ms	remaining: 17.5s
7:	learn: 0.3658437	total: 461ms	remaining: 16.8s
8:	learn: 0.3531032	total: 502ms	remaining: 16.2s
9:	learn: 0.3439690	total: 543ms	remaining: 15.7s
10:	learn: 0.3324248	total: 600ms	remaining: 15.8s
11:	learn: 0.3238978	total: 640ms	remaining: 15.4s
12:	learn: 0.3131353	total: 683ms	remaining: 15.1s
13:	learn: 0.3073593	total: 721ms	remaining: 14.7s
14:	learn: 0.3008360	total: 761ms	remaining: 14.5s
15:	learn: 0.2916470	total: 805ms	remaining: 14.3s
16:	learn: 0.2856854	total: 845ms	remaining: 14.1s
17:	learn: 0.2790665	total: 886ms	remaining: 13.9s
18:	learn: 0.2739217	total: 926

In [9]:
base_path = "../Models/" # path where the file is going to be saved in 

model_name = 'catboost'

model_path = os.path.join(base_path, f'{model_name}.joblib')

joblib.dump(best_model, model_path)

['../Models/catboost.joblib']