# Libraries

In [None]:
!pip install catboost
!pip install vecstack

In [None]:
from catboost import CatBoostRegressor
import pandas as pd
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.model_selection import KFold
from vecstack import StackingTransformer
import os
from xgboost import XGBRegressor

# Data

In [None]:
from google.colab import drive
# Mount Google Drive; every data/model path used below lives under /content/drive.
drive.mount('/content/drive')

In [None]:
# Free-text columns. They are cleaned/embedded where an embedding file exists
# and are then deleted from both the train and test frames further down.
text_features = [
    "title",
    "retraining_condition",
    "responsibilities",
    "requirements_required_certificates",
    "requirements_qualifications",
    "additional_info",
    "career_perspective",
    "education_requirements_speciality",
    "id_hiring_organization",
    "job_benefits_other_benefits",
    "job_location_additional_address_info",
    "job_location_address",
]

## Add text embeddings

In [None]:
from google.colab import drive

# Make sure Drive is mounted, then list the stored SVD text-embedding files.
drive.mount('/content/drive')
TEXT_EMBEDINGS = "/content/drive/MyDrive/payroll/"
svd_embeddings_dir = f"{TEXT_EMBEDINGS}svd_text_embeddings"
embedding_names = os.listdir(svd_embeddings_dir)
embedding_names

In [None]:
from tqdm import tqdm
import pickle

# One unpickled lookup object per embedding file, kept in the same order as
# embedding_names so that position k in both lists refers to the same file.
embedding_mappers = []

for embedding_file in tqdm(embedding_names):
    embedding_path = f"{TEXT_EMBEDINGS}svd_text_embeddings/{embedding_file}"
    with open(embedding_path, "rb") as fin:
        embedding_mappers.append(pickle.load(fin))
    

In [None]:
def clean_sentence(sentence):
    """Lower-case *sentence* and replace every non-alphabetic character with a space.

    Replacement is one-for-one: runs of digits, punctuation or whitespace
    become runs of spaces and are not collapsed.
    """
    lowered = sentence.lower()
    return "".join(ch if ch.isalpha() else " " for ch in lowered)

In [None]:
# Load the encoded training features; the first CSV column becomes the index.
X = pd.read_csv("/content/drive/MyDrive/payroll/ready_data/X_train_encoded_2.csv", index_col=0)

In [None]:
import gc
import numpy as np

# Replace each embedded text column of the training frame by dense columns
# "<feature>_0" ... "<feature>_7" looked up from the matching mapper.
embedding_dim = 8  # width of the stored SVD vectors

for file_name, mapper in zip(embedding_names, embedding_mappers):
    # File names are "<feature>" plus a 4-character suffix (e.g. ".pkl").
    feature = file_name[:-4]
    X[feature] = X[feature].apply(clean_sentence)

    columns = [f"{feature}_{k}" for k in range(embedding_dim)]

    data = []
    for el in tqdm(X[feature]):
        if el not in mapper:
            # Text without a stored embedding falls back to an all-zero vector.
            data.append(np.zeros(embedding_dim))
        else:
            data.append(np.expand_dims(mapper[el], axis=0))

    data = np.vstack(data)
    # Reuse X's own index: X was read with index_col=0, so its labels come from
    # the CSV and need not be 0..n-1. A default RangeIndex here would make the
    # index-based combination below misalign rows (or fill them with NaN).
    embeddings = pd.DataFrame(data, columns=columns, index=X.index)
    X = pd.concat([X, embeddings], axis=1)
    del embeddings
    gc.collect()

In [None]:
# Remove the raw text columns from the training frame, one column at a time.
for text_column in text_features:
    X.drop(columns=[text_column], inplace=True)

gc.collect()

In [None]:
# Load the encoded test features; the first CSV column becomes the index.
test = pd.read_csv("/content/drive/MyDrive/payroll/ready_data/X_test_encoded_2.csv", index_col=0)

In [None]:
test

In [None]:
import gc
import numpy as np

# Replace each embedded text column of the test frame by dense columns
# "<feature>_0" ... "<feature>_7" looked up from the matching mapper.
embedding_dim = 8  # width of the stored SVD vectors

for file_name, mapper in zip(embedding_names, embedding_mappers):
    # File names are "<feature>" plus a 4-character suffix (e.g. ".pkl").
    feature = file_name[:-4]
    test[feature] = test[feature].apply(clean_sentence)

    columns = [f"{feature}_{k}" for k in range(embedding_dim)]

    data = []
    for el in tqdm(test[feature]):
        if el not in mapper:
            # Text without a stored embedding falls back to an all-zero vector.
            data.append(np.zeros(embedding_dim))
        else:
            data.append(np.expand_dims(mapper[el], axis=0))

    data = np.vstack(data)
    # Reuse test's own index: it was read with index_col=0, so its labels come
    # from the CSV and need not be 0..n-1. A default RangeIndex here would make
    # the index-based combination below misalign rows (or fill them with NaN).
    embeddings = pd.DataFrame(data, columns=columns, index=test.index)
    test = pd.concat([test, embeddings], axis=1)
    del embeddings
    gc.collect()

In [None]:
# Remove the raw text columns from the test frame, one column at a time.
for text_column in text_features:
    test.drop(columns=[text_column], inplace=True)

gc.collect()

In [None]:
# Sanity check: list any training column that is still stored as dtype "object".
for column_name, column_dtype in X.dtypes.items():
    if column_dtype == "object":
        print(column_name)

In [None]:
# Load the training targets; the first CSV column becomes the index.
y = pd.read_csv("/content/drive/MyDrive/payroll/ready_data/y_train.csv", index_col=0)

# Delete nan columns

In [None]:
y.isnull().sum()

In [None]:
# Drop the two std_* aggregate columns from the training frame.
for std_column in ("std_profession_mean_salary", "std_year_quarter_mean_salary"):
    del X[std_column]

In [None]:
# Drop the two std_* aggregate columns from the test frame.
for std_column in ("std_profession_mean_salary", "std_year_quarter_mean_salary"):
    del test[std_column]

## Choose the best features by adding a random feature

In [None]:
import os
# Directory with one feature-importance CSV per fold (read in the next cell).
IMPORTANCES_PATH = "/content/drive/MyDrive/payroll/feature_importances/catboost/"

# File names only; os.listdir gives no ordering guarantee.
folds = os.listdir(IMPORTANCES_PATH)

In [None]:
# One DataFrame per fold file found in IMPORTANCES_PATH.
importances = [
    pd.read_csv(os.path.join(IMPORTANCES_PATH, fold_file))
    for fold_file in folds
]
importances

In [None]:
# Average the importance of every feature over all loaded folds.
#
# * Rows are matched by "Feature Id", not by row position: the fold files are
#   not guaranteed to list features in the same order.
# * The mean uses however many folds were actually loaded instead of a
#   hard-coded count, and no input frame is modified, so the cell can be
#   re-run safely.
# * The result is sorted from most to least important because the next step
#   keeps every feature that appears before the "random" probe.
mean_importance = (
    pd.concat(importances, ignore_index=True)
    .groupby("Feature Id", as_index=False)["Importances"]
    .mean()
    .sort_values("Importances", ascending=False)
    .reset_index(drop=True)
)

In [None]:
mean_importance

In [None]:
# Keep every feature listed before the "random" probe; if the probe is absent,
# keep them all.
ranked_features = list(mean_importance["Feature Id"])
if "random" in ranked_features:
    valid_features = ranked_features[:ranked_features.index("random")]
else:
    valid_features = ranked_features

In [None]:
valid_features

In [None]:
# Restrict both frames to the selected features, in the same column order.
X, test = X[valid_features], test[valid_features]

## Add the predictions of the CatBoost models for `base_salary_max` and `base_salary_min` to X

### Max

In [None]:
# Saved out-of-fold prediction files of the "max" CatBoost models.
max_catboost_preds = '/content/drive/MyDrive/payroll/stacked_preds/max_catboost/'
max_models = os.listdir(max_catboost_preds)

# Split every file name around "_fold_" so the part after it starts with the
# fold number.
max_names = []
for prediction_file in max_models:
    max_names.append(prediction_file.split("_fold_"))
max_names

In [None]:
# Placeholder column; it is filled fold by fold in the next cell.
X["max_catboost"] = np.zeros(len(X))

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Rebuild the 6-fold split (same seed) and write each saved out-of-fold
# prediction file into the rows of its hold-out fold.
kf = KFold(n_splits=6, random_state=42, shuffle=True)

print(kf)
# Positional column index, so the write below is one direct .iloc assignment on
# X. The chained form X[col].iloc[rows] = ... may write to a temporary object
# and never updates X when pandas Copy-on-Write is active.
max_column_position = X.columns.get_loc("max_catboost")
for i, (train_index, test_index) in enumerate(kf.split(X)):
    for split_name in max_names:
        # split_name[1] is the part after "_fold_"; its first character is the
        # fold number.
        if split_name[1][0] == str(i):
            preds = np.load(max_catboost_preds + "_fold_".join(split_name))
            X.iloc[test_index, max_column_position] = preds

X["max_catboost"]

### Min

In [None]:
# Saved out-of-fold prediction files of the "min" CatBoost models.
min_catboost_preds = '/content/drive/MyDrive/payroll/stacked_preds/min_catboost/'
min_models = os.listdir(min_catboost_preds)

# Split every file name around "_fold_" so the part after it starts with the
# fold number.
min_names = []
for prediction_file in min_models:
    min_names.append(prediction_file.split("_fold_"))
min_names

In [None]:
# Placeholder column; it is filled fold by fold in the next cell.
X["min_catboost"] = np.zeros(len(X))

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Rebuild the 6-fold split (same seed) and write each saved out-of-fold
# prediction file into the rows of its hold-out fold.
kf = KFold(n_splits=6, random_state=42, shuffle=True)

print(kf)
# Positional column index, so the write below is one direct .iloc assignment on
# X. The chained form X[col].iloc[rows] = ... may write to a temporary object
# and never updates X when pandas Copy-on-Write is active.
min_column_position = X.columns.get_loc("min_catboost")
for i, (train_index, test_index) in enumerate(kf.split(X)):
    for split_name in min_names:
        # split_name[1] is the part after "_fold_"; its first character is the
        # fold number.
        if split_name[1][0] == str(i):
            preds = np.load(min_catboost_preds + "_fold_".join(split_name))
            X.iloc[test_index, min_column_position] = preds

X["min_catboost"]

In [None]:
X

## Add the best CatBoost predictions to the test set

### Max

In [None]:
# Directory holding the saved "max" CatBoost model files; max_models is rebound
# here from the prediction-file listing to the model-file listing.
max_path = '/content/drive/MyDrive/payroll/stacked_models/max_catboost/'
max_models = os.listdir(max_path)

In [None]:
max_models

In [None]:
# Constructor arguments passed to every estimator before its saved model file
# is loaded into it.
max_estimator_params = dict(
    iterations=35000,
    learning_rate=0.0012,
    depth=11,
    l2_leaf_reg=4,
    loss_function='MAE',
    eval_metric="MAE",
    verbose=1000,
    random_strength=2,
    task_type="GPU",
)

# One array of test predictions per saved "max" model file.
all_preds = []
for model_file in tqdm(max_models):
    estimator = CatBoostRegressor(**max_estimator_params)
    estimator.load_model(max_path + model_file)

    # exp(.) - 1 undoes a log(x + 1) transform
    # (presumably applied to the target when these models were trained - confirm).
    all_preds.append(np.exp(estimator.predict(test)) - 1)


In [None]:
all_preds

In [None]:
# Average the test predictions of all loaded "max" models. np.mean builds a new
# array, so all_preds[0] is not modified in place (re-running the cell is safe),
# and the divisor always equals the number of models actually loaded.
prediction = np.mean(all_preds, axis=0)
test["max_catboost"] = prediction

### Min

In [None]:
# Directory holding the saved "min" CatBoost model files; min_models is rebound
# here from the prediction-file listing to the model-file listing.
min_path = '/content/drive/MyDrive/payroll/stacked_models/min_catboost/'
min_models = os.listdir(min_path)

In [None]:
min_models

In [None]:
# Constructor arguments passed to every estimator before its saved model file
# is loaded into it.
min_estimator_params = dict(
    iterations=35000,
    learning_rate=0.0012,
    depth=11,
    l2_leaf_reg=4,
    loss_function='MAE',
    eval_metric="MAE",
    verbose=1000,
    random_strength=2,
    task_type="GPU",
)

# One array of test predictions per saved "min" model file.
# NOTE(review): at this point `test` already carries the "max_catboost" column
# added above - confirm the min models are meant to see it.
all_preds = []
for model_file in tqdm(min_models):
    estimator = CatBoostRegressor(**min_estimator_params)
    estimator.load_model(min_path + model_file)

    # exp(.) - 1 undoes a log(x + 1) transform
    # (presumably applied to the target when these models were trained - confirm).
    all_preds.append(np.exp(estimator.predict(test)) - 1)


In [None]:
all_preds

In [None]:
# Average the test predictions of all loaded "min" models. np.mean builds a new
# array, so all_preds[0] is not modified in place (re-running the cell is safe),
# and the divisor always equals the number of models actually loaded.
prediction = np.mean(all_preds, axis=0)
test["min_catboost"] = prediction

In [None]:
test

In [None]:
gc.collect()

## Train a Meta-model (CatBoost)

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Train one CatBoost meta-model per fold on the log(1 + mean_salary) target and
# store, for every fold, its hold-out predictions and the fitted model.
kf = KFold(n_splits=6, random_state=42, shuffle=True)

print(kf)

preds_dir = '/content/drive/MyDrive/payroll/stacked_preds/min_max_catboost_23'
models_dir = "/content/drive/MyDrive/payroll/stacked_models/min_max_catboost_23"
# Create the output folders up front so a missing directory cannot fail the
# run only after a full fold has been trained.
os.makedirs(preds_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# enumerate supplies the real fold number; a counter that is never incremented
# would label every saved file as fold 0.
for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = CatBoostRegressor(
                            iterations=15000,
                            learning_rate=0.00065,
                            depth=11,
                            l2_leaf_reg=4,
                            grow_policy="SymmetricTree",
                            loss_function='MAE',
                            eval_metric="MAE",
                            verbose=1000,
                            random_strength=2,
                            random_state=23,
                            task_type="GPU",
    )

    # Fit in log space; the hold-out fold doubles as the evaluation set that
    # use_best_model relies on.
    model.fit(
        X_train,
        np.log(y_train["mean_salary"] + 1),
        eval_set=(X_test, np.log(y_test["mean_salary"] + 1)),
        use_best_model=True,
    )

    # Back to the original salary scale before scoring.
    preds = np.exp(model.predict(X_test)) - 1
    score = mean_absolute_error(y_test["mean_salary"], preds)
    print(score)

    with open(f'{preds_dir}/min_max_catboost_23_fold_{i}_{score}.npy', 'wb') as f:
        np.save(f, preds)
    # No `format` argument is passed, so the model is written in CatBoost's
    # default format; the ".pkl" suffix is only part of the file name.
    model.save_model(f"{models_dir}/min_max_catboost_23_fold_{i}_{score}.pkl")
