# Libraries

In [None]:
!pip install catboost
!pip install vecstack

In [None]:
from catboost import CatBoostRegressor
import pandas as pd
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.model_selection import KFold
from vecstack import StackingTransformer
import os

# Data

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; every data
# path used in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Raw text columns. They are removed from the feature frames once the
# embedding columns have been attached.
text_features = [
    "title",
    "retraining_condition",
    "responsibilities",
    "requirements_required_certificates",
    "requirements_qualifications",
    "additional_info",
    "career_perspective",
    "education_requirements_speciality",
    "id_hiring_organization",
    "job_benefits_other_benefits",
    "job_location_additional_address_info",
    "job_location_address",
]

In [None]:
# Mount Google Drive (repeated here so this cell can be run on its own).
from google.colab import drive

drive.mount('/content/drive')

# Project root on Drive. The trailing slash is significant: other cells build
# paths from this constant by plain string concatenation.
TEXT_EMBEDINGS = "/content/drive/MyDrive/payroll/"

# File names of the stored SVD text embeddings. Each name, minus its last four
# characters, is later used as the name of a text column.
embedding_names = os.listdir(os.path.join(TEXT_EMBEDINGS, "svd_text_embeddings"))
embedding_names

In [None]:
from tqdm import tqdm
import pickle

# Load one lookup object per embedding file, in the same order as
# `embedding_names`, so that position i in both lists refers to the same file.
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
embedding_mappers = []
svd_embeddings_dir = f"{TEXT_EMBEDINGS}svd_text_embeddings/"

for filename in tqdm(embedding_names):
    with open(svd_embeddings_dir + filename, "rb") as fin:
        embedding_mappers.append(pickle.load(fin))
    

In [None]:
def clean_sentence(sentence):
    """Lower-case *sentence* and replace every non-alphabetic character with a space.

    The length of the lower-cased text is preserved: each character is either
    kept (when ``str.isalpha`` is true for it) or swapped for a single space.
    """
    lowered = sentence.lower()
    return ''.join(char if char.isalpha() else ' ' for char in lowered)

In [None]:
# Load the encoded training features; the first CSV column becomes the index.
X = pd.read_csv("/content/drive/MyDrive/payroll/ready_data/X_train_encoded_2.csv", index_col=0)

In [None]:
# Show all column names of the training frame as a plain list.
list(X.columns)

In [None]:
import gc
import numpy as np

# Number of components in every stored embedding vector.
EMBEDDING_DIM = 8

# For each embedding file: clean the matching text column of X, look every
# cleaned sentence up in that file's mapper and append the resulting vector
# as EMBEDDING_DIM new numeric columns named "<feature>_<k>".
for file_name, mapper in zip(embedding_names, embedding_mappers):
    # The file name without its 4-character suffix is the text column name.
    feature = file_name[:-4]
    X[feature] = X[feature].apply(clean_sentence)

    columns = [f"{feature}_{k}" for k in range(EMBEDDING_DIM)]

    # Sentences that are missing from the mapper get an all-zero vector.
    missing_vector = np.zeros(EMBEDDING_DIM, dtype=int)

    data = []
    for el in tqdm(X[feature]):
        if el in mapper:
            data.append(np.expand_dims(mapper[el], axis=0))
        else:
            data.append(missing_vector)

    data = np.vstack(data)

    # DataFrame.join aligns on the index. X is indexed by the first CSV column,
    # so the embeddings must carry that same index; a default 0..n-1 index
    # would misalign rows (or produce NaN) whenever X's index is anything else.
    # NOTE(review): assumes X.index has no duplicate labels - confirm.
    embeddings = pd.DataFrame(data=data, columns=columns, index=X.index)
    X = X.join(embeddings)

    # Free the temporary frame before the next feature is processed.
    del embeddings
    gc.collect()

In [None]:
# Remove the raw text columns from the training frame one at a time.
for text_column in text_features:
    X.pop(text_column)

gc.collect()

In [None]:
# Load the encoded test features; the first CSV column becomes the index.
test = pd.read_csv("/content/drive/MyDrive/payroll/ready_data/X_test_encoded_2.csv", index_col=0)

In [None]:
# Display the test frame as the cell output.
test

In [None]:
import gc
import numpy as np

# Number of components in every stored embedding vector.
EMBEDDING_DIM = 8

# For each embedding file: clean the matching text column of `test`, look
# every cleaned sentence up in that file's mapper and append the resulting
# vector as EMBEDDING_DIM new numeric columns named "<feature>_<k>".
for file_name, mapper in zip(embedding_names, embedding_mappers):
    # The file name without its 4-character suffix is the text column name.
    feature = file_name[:-4]
    test[feature] = test[feature].apply(clean_sentence)

    columns = [f"{feature}_{k}" for k in range(EMBEDDING_DIM)]

    # Sentences that are missing from the mapper get an all-zero vector.
    missing_vector = np.zeros(EMBEDDING_DIM, dtype=int)

    data = []
    for el in tqdm(test[feature]):
        if el in mapper:
            data.append(np.expand_dims(mapper[el], axis=0))
        else:
            data.append(missing_vector)

    data = np.vstack(data)

    # DataFrame.join aligns on the index. `test` is indexed by the first CSV
    # column, so the embeddings must carry that same index; a default 0..n-1
    # index would misalign rows (or produce NaN) for any other index.
    # NOTE(review): assumes test.index has no duplicate labels - confirm.
    embeddings = pd.DataFrame(data=data, columns=columns, index=test.index)
    test = test.join(embeddings)

    # Free the temporary frame before the next feature is processed.
    del embeddings
    gc.collect()

In [None]:
# Remove the raw text columns from the test frame one at a time.
for text_column in text_features:
    test.pop(text_column)

gc.collect()

In [None]:
# Load the training targets; the first CSV column becomes the index.
y = pd.read_csv("/content/drive/MyDrive/payroll/ready_data/y_train.csv", index_col=0)

# Delete NaN features

In [None]:
# List every column of X that is still stored with the generic object dtype.
for column_name, column_dtype in X.dtypes.items():
    if column_dtype == "object":
        print(column_name)

In [None]:
# Drop the two std_* aggregate columns from the training frame.
# "base_salary_max" and "base_salary_min" are deliberately kept.
for dropped_column in ("std_profession_mean_salary", "std_year_quarter_mean_salary"):
    del X[dropped_column]

In [None]:
# Drop the two std_* aggregate columns from the test frame.
# "base_salary_max" is deliberately kept.
for dropped_column in ("std_profession_mean_salary", "std_year_quarter_mean_salary"):
    del test[dropped_column]

In [None]:
# Run a garbage-collection pass after the column deletions above.
gc.collect()

# Feature importance with CatBoost

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters of the CatBoost regressor used for feature ranking.
# MAE is both the optimised loss and the reported metric; training runs on GPU
# and logs progress every 1000 iterations.
catboost_params = dict(
    iterations=20000,
    learning_rate=0.01,
    depth=11,
    l2_leaf_reg=4,
    loss_function='MAE',
    eval_metric="MAE",
    verbose=1000,
    random_strength=2,
    random_state=42,
    task_type="GPU",
)

# The variable is called `forest`, but the model is a CatBoostRegressor.
forest = CatBoostRegressor(**catboost_params)

In [None]:
# Add a pure-noise column to serve as a baseline: features that end up ranked
# below it are later discarded. A seeded generator is used so the noise (and
# therefore the resulting cut-off) is reproducible between runs, consistent
# with the fixed seed 42 used for the model and the fold split.
X['random'] = np.random.default_rng(42).random(X.shape[0])

In [None]:
# Six shuffled folds with a fixed seed, so the split is the same on every run.
kf = KFold(
    n_splits=6,
    shuffle=True,
    random_state=42,
)
kf.get_n_splits(X)

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error

In [None]:
# Index of the single fold that is trained by this cell; all other folds
# produced by the splitter are skipped.
fold_to_run = 5

for i, (train_index, test_index) in enumerate(kf.split(X)):
    if i != fold_to_run:
        continue

    # Positional split of features and targets.
    # NOTE(review): assumes X and y hold the same rows in the same order - confirm.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # The model is fitted on log(salary + 1); the held-out part of the fold is
    # the evaluation set used to pick the best iteration.
    train_target = np.log(y_train["mean_salary"] + 1)
    holdout_target = np.log(y_test["mean_salary"] + 1)
    forest.fit(X_train, train_target, eval_set=(X_test, holdout_target), use_best_model=True)

    # Undo the log transform before reporting MAE on the original scale.
    holdout_prediction = np.exp(forest.predict(X_test)) - 1
    print(mean_absolute_error(y_test["mean_salary"], holdout_prediction))

    # Persist this fold's feature-importance table.
    importances = forest.get_feature_importance(prettified=True)
    importances.to_csv(f"/content/drive/MyDrive/payroll/feature_importances/catboost/fold_{i}.csv", index=False)

In [None]:
import os
# Directory the per-fold feature-importance CSV files were written to.
# The trailing slash matters: file names are appended by string concatenation.
IMPORTANCES_PATH = "/content/drive/MyDrive/payroll/feature_importances/catboost/"

# Every entry in the directory is treated as a fold file.
folds = os.listdir(IMPORTANCES_PATH)

In [None]:
# Read every stored fold table into its own DataFrame, in `folds` order.
importances = [
    pd.read_csv(os.path.join(IMPORTANCES_PATH, fold_file))
    for fold_file in folds
]
importances

In [None]:
# Average the importances over all loaded folds, feature by feature.
#
# Each fold table is ordered by that fold's own importances, so the same row
# position can hold different features in different folds. The values are
# therefore grouped by "Feature Id" rather than added row by row. Taking the
# mean also divides by the real number of folds and leaves the per-fold tables
# in `importances` unmodified.
stacked_importances = pd.concat(importances, ignore_index=True)

# Result: one row per feature with columns "Feature Id" and "Importances",
# sorted from most to least important. The later cut-off at the "random"
# feature relies on this descending order.
mean_importance = (
    stacked_importances
    .groupby("Feature Id", sort=False)["Importances"]
    .mean()
    .sort_values(ascending=False, kind="mergesort")
    .reset_index()
)

In [None]:
# Display the averaged importance table as the cell output.
mean_importance

In [None]:
# Keep the features listed before the first "random" entry of the importance
# table; if "random" is absent, every feature is kept.
ranked_features = list(mean_importance["Feature Id"])
if "random" in ranked_features:
    cutoff = ranked_features.index("random")
else:
    cutoff = len(ranked_features)
valid_features = ranked_features[:cutoff]

In [None]:
# Display the selected feature names as the cell output.
valid_features