## 1. Import Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from category_encoders import CountEncoder
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
%pip install catboost
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,  roc_auc_score

import os
import joblib

import warnings
# Seed handed to the stochastic parts of the notebook so runs are repeatable.
random_state = 4012

# Suppress every warning and allow up to 500 columns when a frame is displayed.
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 500

^C
Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'catboost'

Collecting catboost
  Downloading catboost-1.2.2-cp311-cp311-win_amd64.whl (101.0 MB)
     ---------------------------------------- 0.0/101.0 MB ? eta -:--:--
     ---------------------------------------- 0.4/101.0 MB 7.6 MB/s eta 0:00:14
      -------------------------------------- 1.3/101.0 MB 13.8 MB/s eta 0:00:08
     - ------------------------------------- 2.8/101.0 MB 22.0 MB/s eta 0:00:05
     -- ------------------------------------ 5.8/101.0 MB 28.7 MB/s eta 0:00:04
     -- ------------------------------------ 7.6/101.0 MB 30.4 MB/s eta 0:00:04
     --- ---------------------------------- 10.6/101.0 MB 40.9 MB/s eta 0:00:03
     ----- -------------------------------- 14.8/101.0 MB 54.7 MB/s eta 0:00:02
     ------ ------------------------------- 16.4/101.0 MB 50.4 MB/s eta 0:00:02
     ------- ------------------------------ 20.1/101.0 MB 59.5 MB/s eta 0:00:02
     -------- ----------------------------- 22.6/101.0 MB 59.5 MB/s eta 0:00:02
     -------- ---------------------------


[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## 2. Load Data

In [None]:
# Read the training features and the training labels, in that order, from the
# processed-data folder.
X_train, y_train = (
    pd.read_csv(csv_file)
    for csv_file in (
        "../Processed Data Files/X_train.csv",
        "../Processed Data Files/y_train.csv",
    )
)

## 3. Model

In [1]:
# --- Feature groups: each list receives its own preprocessing below ----------
binary_columns = [
    'telecommuting', 'has_company_logo', 'has_questions', 'have_company_profile',
    'have_requirements', 'have_benefits', 'have_category', 'high_salary',
]
numerical_columns = ['flesch_score_bin_ft', 'fk_grade_level_bin_ft', 'text_len']
onehot_columns = ['employment_type', 'required_experience', 'required_education']
freq_columns = ['location_country']

# --- One transformer per feature group ---------------------------------------
scaler = StandardScaler()
freq_encoder = CountEncoder()
onehot = OneHotEncoder(
    # The category lists are positional: entry i describes onehot_columns[i].
    categories=[
        ['full time', 'contract', 'part time', 'flexi', 'other', 'unspecified'],  # employment_type
        ['entry level', 'middle level', 'senior level', 'unspecified'],  # required_experience
        ['high school or vocational degree', 'undergraduate', 'graduate', 'unspecified'],  # required_education
    ],
    # Values outside the lists above are ignored, i.e. they get no indicator column.
    handle_unknown='ignore',
)

# --- Column-wise preprocessing ------------------------------------------------
# Each entry is (nickname, transformer, columns). 'passthrough' keeps columns
# unchanged; remainder='passthrough' does the same for columns not listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', 'passthrough', binary_columns),
        ('numerical', scaler, numerical_columns),
        ('onehot', onehot, onehot_columns),
        ('frequency', freq_encoder, freq_columns),
    ],
    remainder='passthrough',
)

# --- Full pipeline: preprocessing followed by the CatBoost classifier ---------
# No resampling step is part of this pipeline.
model = Pipeline([
    ('preprocess', preprocessor),
    ('train', CatBoostClassifier(random_state=random_state)),
])

# Hyper-parameter grid for the search. The `train__` prefix addresses the
# pipeline step named 'train' (the CatBoost classifier).
params = dict(
    train__n_estimators = [100, 200, 300],
    train__learning_rate = [0.1, 0.3, 0.5],
    train__max_depth = [4, 6, 8],
    train__l2_leaf_reg = [1, 5, 10],
    # border_count is the number of splits used for numerical features and must
    # be a whole number (CatBoost documents the range 1..65535); fractional
    # values are not valid settings.
    train__border_count = [32, 64, 128],
    # NOTE(review): confirm that 'CrossEntropy' can be combined with
    # auto_class_weights; if CatBoost rejects it, those candidates only burn fits.
    train__loss_function = ['Logloss', 'CrossEntropy'],
    train__auto_class_weights = ['Balanced'],
)


def train(x_train, y_train, model, params, cv=5, scoring='f1_weighted'):
    """Grid-search `params` for `model` and return the refitted best estimator.

    Parameters
    ----------
    x_train : training features, passed unchanged to ``GridSearchCV.fit``.
    y_train : training labels. A single-column 2-D input (for example a
        one-column DataFrame) is flattened to 1-D; other inputs are converted
        with ``np.asarray`` and passed on as they are.
    model : estimator or pipeline to tune. It is not fitted or modified here;
        the search works on clones of it.
    params : parameter grid understood by ``GridSearchCV``.
    cv : number of cross-validation folds (default 5).
    scoring : metric used to rank candidates (default ``'f1_weighted'``).

    Returns
    -------
    The best candidate, already refitted on all of the training data by the
    search itself (``refit=True``).
    """
    targets = np.asarray(y_train)
    # Estimators and scorers expect 1-D labels; a one-column frame yields (n, 1).
    if targets.ndim == 2 and targets.shape[1] == 1:
        targets = targets.ravel()

    gridsearchcv = GridSearchCV(
        model,
        params,
        cv=cv,
        scoring=scoring,
        refit=True,   # the search refits the winner, so no extra fit is needed
        verbose=3,
        n_jobs=-1,
    )
    gridsearchcv.fit(x_train, targets)

    return gridsearchcv.best_estimator_


# Run the hyper-parameter search on the training data and keep the model it returns.
best_model = train(x_train=X_train, y_train=y_train, model=model, params=params)


NameError: name 'StandardScaler' is not defined

In [19]:
# Serialize the fitted model to <base_path>/<model_name>.joblib.
base_path = "../Models/" # path where the file is going to be saved in 

model_name = 'catboost'

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(base_path, exist_ok=True)

model_path = os.path.join(base_path, f'{model_name}.joblib')

joblib.dump(best_model, model_path)

['../Models/decision_tree.joblib']