# Libraries

In [None]:
!pip install vecstack
!pip install lightgbm
!pip install catboost
!pip install autoxgb
!pip install optuna
!pip install category_encoders
!pip install transformers

In [None]:
# 1. Imputing
from sklearn.impute import SimpleImputer, KNNImputer

# 2. Scaling
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer

# Categorical Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import CatBoostEncoder, CountEncoder

# Extracting embeddings from text
from transformers import AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import TfidfVectorizer

# Clustering
from sklearn.cluster import KMeans


# Feature Selection
from sklearn.feature_selection import SelectFromModel, RFE

# Dimensionality reduction
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA, NMF

# Ensembling
from sklearn.ensemble import StackingRegressor, VotingRegressor
from vecstack import StackingTransformer

# CV
from sklearn.model_selection import KFold

# Models
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from autoxgb import AutoXGB
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor


# Nets
import torch
from torch import nn
from torch import functional as F

# Utils
from sklearn.pipeline import Pipeline, FeatureUnion
import gc

# Essentials
import numpy as np
import pandas as pd

# Time
from dateutil import parser

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from tqdm import tqdm

# Data

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime so files under
# /content/drive become readable.
drive.mount("/content/drive")

# Base directory of the mounted drive; the string literal must be closed,
# otherwise the whole cell fails with a SyntaxError.
DATA = "/content/drive/MyDrive/"

In [None]:
# Load the cleaned dataset; the first CSV column is used as the row index.
clean_csv_path = "/content/drive/MyDrive/payroll/data/clean_whole_data.csv"
whole_data_clean = pd.read_csv(clean_csv_path, index_col=0)

## Fill

In [None]:
# Replace missing industry labels with the placeholder category "Any".
industry = whole_data_clean["industry"]
whole_data_clean["industry"] = industry.fillna(value="Any")

# Classes of features

In [None]:
# Column-name groups used to organise the dataset's features.

# Identifier columns.
id_features = ["id", "id_hiring_organization"]

# Salary columns (the prediction targets).
target_features = ["base_salary_max", "base_salary_min", "mean_salary"]

# Numeric columns.
continous_features = [
    "experience_requirements",
    "job_location_geo_latitude",
    "job_location_geo_longitude",
    "premium_size",
    "retraining_grant_value",
    "work_places",
]

# Flag-like columns.
# NOTE(review): a later cell redefines this list with lower-case licence
# suffixes ("driver_licence_a"); confirm which spelling the data uses.
binary_features = [
    "accommodation_capability",
    "caring_workers",
    "disabled",
    "dms",
    *(f"driver_licence_{letter}" for letter in "ABCDE"),
    "is_uzbekistan_recruitment",
    "large_families",
    "minor_workers",
    "need_medcard",
    "workers_with_disabled_children",
    "vouchers_health_institutions",
    "single_parent",
    "retraining_grant",
    "requirements_id_priority_category",
    "released_persons",
    "payment_sports_activities",
    "payment_meals",
]

# Categorical columns.
# Every entry ends with a comma: without it Python concatenates adjacent
# string literals, which previously merged "oblast" and "region" into the
# single bogus name "oblastregion".
categorical_features = [
    "accommodation_housing",
    "drive_licences",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "federal_district",
    "incentive_compensation_transport_compensation",
    "industry",
    "inner_info_contact_source",
    "inner_info_source_type",
    "inner_info_status",
    "job_benefits",
    "metro_station",
    "work_hours",
    "source",
    "social_protecteds_social_protected",
    "oblast",
    "region",
    "profession",
    "premium_type",
    "organization",
]

# Free-text columns. Each column is listed exactly once
# ("additional_info" used to appear twice).
text_features = [
    "additional_info",
    "career_perspective",
    "job_benefits_other_benefits",
    "education_requirements_speciality",
    "job_location_additional_address_info",
    "job_location_address",
    "title",
    "retraining_condition",
    "responsibilities",
    "requirements_required_certificates",
    "requirements_qualifications",
]

# Date/time columns.
time_features = [
    "date_creation",
    "date_posted",
    "date_time_posted",
    "time_posted",
]

# Feature Engineering

In [None]:
# Preview the first rows of the loaded dataset.
whole_data_clean.head()

## Targets

In [None]:
# Plot the distribution of the lower salary bound, restricted to non-null
# values below 200000. `sns.distplot` is deprecated in seaborn, so the
# equivalent histogram + KDE is drawn with `histplot`.
known_salary_min = whole_data_clean["base_salary_min"]
plotted_salary_min = known_salary_min[known_salary_min.notna() & (known_salary_min < 200000)]
sns.histplot(plotted_salary_min, kde=True, stat="density", kde_kws={"cut": 3})

## Time features

In [None]:
# Only "date_posted" is kept; the other raw date/time columns are dropped.
del whole_data_clean["date_creation"]
del whole_data_clean["date_time_posted"]
del whole_data_clean["time_posted"]

# POSIX timestamp of each posting date. The column keeps the name
# "date_creation_timestamp" because later cells refer to it, even though the
# value is derived from "date_posted".
timestamp = [
    parser.parse(val).timestamp()
    for val in tqdm(whole_data_clean["date_posted"].to_numpy())
]
whole_data_clean["date_creation_timestamp"] = timestamp

# Two-digit month -> season id, with December grouped with January/February.
# (The contents of the two lookup tables used to be swapped.)
season_dict = {
    "12": 1, "01": 1, "02": 1,
    "03": 2, "04": 2, "05": 2,
    "06": 3, "07": 3, "08": 3,
    "09": 4, "10": 4, "11": 4,
}

# Two-digit month -> calendar quarter.
quarter_dict = {
    "01": 1, "02": 1, "03": 1,
    "04": 2, "05": 2, "06": 2,
    "07": 3, "08": 3, "09": 3,
    "10": 4, "11": 4, "12": 4,
}

years = []
months = []
days = []
seasons = []
quarters = []

# "date_posted" is split on "-" into year / month / day string parts.
for date in tqdm(whole_data_clean["date_posted"]):
    date = date.split("-")
    years.append(date[0])
    months.append(date[1])
    seasons.append(season_dict[date[1]])
    quarters.append(quarter_dict[date[1]])
    days.append(date[2])

# Plain lists are assigned positionally, so they line up with the frame rows.
whole_data_clean["year"] = years
whole_data_clean["month"] = months
whole_data_clean["day"] = days
whole_data_clean["season"] = seasons
whole_data_clean["quarter"] = quarters

# Build the combined key from the frame's own columns: a fresh pd.Series has a
# 0..n-1 index and would be index-aligned on assignment, which misplaces rows
# (or yields NaN) whenever the frame index is not a default RangeIndex.
whole_data_clean["year_quarter"] = (
    whole_data_clean["year"].astype("string")
    + "_"
    + whole_data_clean["quarter"].astype("string")
)

In [None]:
# Drop the temporary per-row lists and ask the collector to reclaim memory.
del timestamp, years, months, days, seasons, quarters
gc.collect()

## Continuous_features

### work_places * retraining_grant_value

In [None]:
# Interaction feature: number of work places times the retraining grant value.
work_places = whole_data_clean["work_places"]
grant_value = whole_data_clean["retraining_grant_value"]
whole_data_clean["work_places * retraining_grant_value"] = work_places * grant_value

### upper_year_premium_size

In [None]:
# Rows whose premium size is strictly positive.
has_positive_premium = whole_data_clean["premium_size"] > 0
wo_0 = whole_data_clean.loc[has_positive_premium]

In [None]:
# Multiply positive premiums below 400 by 1000.
# NOTE(review): presumably these values were entered in thousands — confirm.
# A single masked column update replaces the previous pattern of editing a
# filtered copy and writing whole rows back, which triggered
# SettingWithCopyWarning and rewrote every column of the selected rows.
# `wo_0` is not refreshed here; no later visible cell reads it.
premium_size = whole_data_clean["premium_size"]
is_small_positive = (premium_size > 0) & (premium_size < 400)
whole_data_clean["premium_size"] = premium_size.mask(is_small_positive, premium_size * 1000)

### year_premium_size

In [None]:
# Start the annualised premium at 0 for every row; rows with a recognised
# premium type are filled in by the following cells.
whole_data_clean["year_premium_size"] = 0

In [None]:
# List the distinct premium types present in the data.
whole_data_clean["premium_type"].unique()

In [None]:
# Monthly premium ("Ежемесячная премия"): yearly amount = premium_size * 12.
# Only the target column is updated, instead of modifying a filtered copy and
# writing whole rows back into the frame.
is_monthly = whole_data_clean["premium_type"] == "Ежемесячная премия"
whole_data_clean["year_premium_size"] = whole_data_clean["year_premium_size"].mask(
    is_monthly, whole_data_clean["premium_size"] * 12
)

In [None]:
# Inspect the rows whose premium type is the monthly one.
whole_data_clean[whole_data_clean["premium_type"] == "Ежемесячная премия"]

In [None]:
# Quarterly premium ("Ежеквартальная премия"): yearly amount = premium_size * 4.
# Only the target column is updated, instead of modifying a filtered copy and
# writing whole rows back into the frame.
is_quarterly = whole_data_clean["premium_type"] == "Ежеквартальная премия"
whole_data_clean["year_premium_size"] = whole_data_clean["year_premium_size"].mask(
    is_quarterly, whole_data_clean["premium_size"] * 4
)

In [None]:
# Annual premium ("Ежегодная премия"): yearly amount = premium_size * 1.
# Only the target column is updated, instead of modifying a filtered copy and
# writing whole rows back into the frame.
is_yearly = whole_data_clean["premium_type"] == "Ежегодная премия"
whole_data_clean["year_premium_size"] = whole_data_clean["year_premium_size"].mask(
    is_yearly, whole_data_clean["premium_size"] * 1
)

In [None]:
# Frequency of the premium sizes that are still below 400.
whole_data_clean[whole_data_clean["premium_size"] < 400]["premium_size"].value_counts()

### work_places * year_premium_size

In [None]:
# Interaction feature: number of work places times the annualised premium.
places = whole_data_clean["work_places"]
yearly_premium = whole_data_clean["year_premium_size"]
whole_data_clean["work_places * year_premium_size"] = places * yearly_premium

### work_places * year_premium_size + work_places * retraining_grant_value

In [None]:
# Sum of the two work_places interaction features.
premium_part = whole_data_clean["work_places * year_premium_size"]
grant_part = whole_data_clean["work_places * retraining_grant_value"]
whole_data_clean["wpypz_wprgv"] = premium_part + grant_part

## Cat Features

### education_academic_degree + education_requirements_education_type



In [None]:
# Inspect the academic-degree column.
whole_data_clean["education_academic_degree"]

In [None]:
# List the distinct education-type values.
whole_data_clean["education_requirements_education_type"].unique()

In [None]:
# Combined categorical feature: "<academic degree>_<education type>".
academic_degree = whole_data_clean["education_academic_degree"]
education_type = whole_data_clean["education_requirements_education_type"]
whole_data_clean["education_academic_degree_education_requirements_education_type"] = (
    academic_degree + "_" + education_type
)
# Show the distinct combinations that were produced.
whole_data_clean["education_academic_degree_education_requirements_education_type"].unique()

### region + industry

In [None]:
# Scale the region code down by 1e11 and cast to int, keeping only the part
# of the code above that magnitude.
scaled_region = whole_data_clean["region"] / 100000000000
whole_data_clean["region"] = scaled_region
whole_data_clean["region"] = whole_data_clean["region"].astype("int")

In [None]:
# Combined categorical feature: "<region>_<industry>".
region_as_text = whole_data_clean["region"].astype("string")
whole_data_clean["region_industry"] = region_as_text + "_" + whole_data_clean["industry"]
# Show the distinct combinations that were produced.
whole_data_clean["region_industry"].unique()

### premium_type + work_hours

In [None]:
# Combined categorical feature: "<premium type>_<work hours>".
premium_kind = whole_data_clean["premium_type"]
hours = whole_data_clean["work_hours"]
whole_data_clean["premium_type_work_hours"] = premium_kind + "_" + hours
# Show the distinct combinations that were produced.
whole_data_clean["premium_type_work_hours"].unique()

### delete oblast

In [None]:
# Remove the "oblast" column in place, then trigger garbage collection.
whole_data_clean.drop(columns=["oblast"], inplace=True)
gc.collect()

### inner_info_source_type + source

In [None]:
# Combined categorical feature: "<inner source type>_<source>".
source_type = whole_data_clean["inner_info_source_type"]
source = whole_data_clean["source"]
whole_data_clean["inner_info_source_type_source"] = source_type + "_" + source
# Show the distinct combinations that were produced.
whole_data_clean["inner_info_source_type_source"].unique()

### industry + profession

In [None]:
# Combined categorical feature: "<industry>_<profession code>"; the profession
# is cast to int before being rendered as text.
profession_text = whole_data_clean["profession"].astype("int").astype("string")
whole_data_clean["industry_profession"] = whole_data_clean["industry"] + "_" + profession_text
# Show the distinct combinations that were produced.
whole_data_clean["industry_profession"].unique()

### region + profession

In [None]:
# Combined categorical feature: "<region>_<profession code>"; the profession
# is cast to int before being rendered as text.
region_text = whole_data_clean["region"].astype("string")
profession_code_text = whole_data_clean["profession"].astype("int").astype("string")
whole_data_clean["region_profession"] = region_text + "_" + profession_code_text
# Show the distinct combinations that were produced.
whole_data_clean["region_profession"].unique()

## Target Encoding 




In [None]:
# Rows with a known mean_salary form the training set; rows where it is
# missing form the test set.
has_target = whole_data_clean["mean_salary"].notna()
train = whole_data_clean[has_target]
test = whole_data_clean[~has_target]

In [None]:
# Count the rows that still have a missing industry.
whole_data_clean["industry"].isnull().sum()

In [None]:
# Print the standard deviation of mean_salary for every work_hours value.
for work_hours_value in train["work_hours"].unique():
    matching_rows = train["work_hours"] == work_hours_value
    salary_std = train.loc[matching_rows, "mean_salary"].std()
    print(work_hours_value, salary_std)

In [None]:
# Per-category mean of mean_salary, computed on train and left-joined onto
# both train and test as "mean_<column>_mean_salary".
# NOTE(review): the statistic includes each training row's own target —
# consider an out-of-fold scheme if leakage matters.
values = [
    "year_quarter",
    "accommodation_housing",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "region",
    "industry",
    "profession",
    "work_hours",
    "job_benefits",
    "source",
]
for value in tqdm(values):
    stat_column = f"mean_{value}_mean_salary"
    group_stat = train.groupby([value])["mean_salary"].mean().reset_index(name=stat_column)
    train = train.merge(group_stat, how="left", on=[value])
    test = test.merge(group_stat, how="left", on=[value])


In [None]:
# Per-category standard deviation of mean_salary, computed on train and
# left-joined onto both train and test as "std_<column>_mean_salary".
values = [
    "accommodation_housing",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "region",
    "industry",
    "profession",
    "work_hours",
    "job_benefits",
    "source",
    "year_quarter",
]
for value in tqdm(values):
    stat_column = f"std_{value}_mean_salary"
    group_stat = train.groupby([value])["mean_salary"].std().reset_index(name=stat_column)
    train = train.merge(group_stat, how="left", on=[value])
    test = test.merge(group_stat, how="left", on=[value])


In [None]:
# Per-category median of mean_salary, computed on train and left-joined onto
# both train and test as "median_<column>_mean_salary".
values = [
    "accommodation_housing",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "region",
    "industry",
    "profession",
    "work_hours",
    "job_benefits",
    "source",
    "year_quarter",
]
for value in tqdm(values):
    stat_column = f"median_{value}_mean_salary"
    group_stat = train.groupby([value])["mean_salary"].median().reset_index(name=stat_column)
    train = train.merge(group_stat, how="left", on=[value])
    test = test.merge(group_stat, how="left", on=[value])


## CatBoost encoding

In [None]:
# Feature groups redefined after feature engineering.

# Identifier columns.
id_features = ["id", "id_hiring_organization"]

# Salary columns (the prediction targets).
target_features = ["base_salary_max", "base_salary_min", "mean_salary"]

# Numeric columns, now including the engineered timestamp/premium features.
continous_features = [
    "experience_requirements",
    "job_location_geo_latitude",
    "job_location_geo_longitude",
    "premium_size",
    "retraining_grant_value",
    "work_places",
    "date_creation_timestamp",
    "work_places * retraining_grant_value",
    "year_premium_size",
    "work_places * year_premium_size",
    "wpypz_wprgv",
]

# Flag-like columns; licence suffixes are lower-case in this definition.
binary_features = [
    "accommodation_capability",
    "caring_workers",
    "disabled",
    "dms",
    *(f"driver_licence_{letter}" for letter in "abcde"),
    "is_uzbekistan_recruitment",
    "large_families",
    "minor_workers",
    "need_medcard",
    "workers_with_disabled_children",
    "vouchers_health_institutions",
    "single_parent",
    "retraining_grant",
    "requirements_id_priority_category",
    "released_persons",
    "payment_sports_activities",
    "payment_meals",
]

# Categorical columns: the raw ones first, followed by the engineered
# date parts and combined features.
raw_categorical_features = [
    "accommodation_housing",
    "drive_licences",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "federal_district",
    "incentive_compensation_transport_compensation",
    "industry",
    "inner_info_contact_source",
    "inner_info_source_type",
    "inner_info_status",
    "job_benefits",
    "metro_station",
    "work_hours",
    "source",
    "social_protecteds_social_protected",
    "region",
    "profession",
    "premium_type",
    "organization",
]
engineered_categorical_features = [
    "year",
    "month",
    "day",
    "season",
    "quarter",
    "education_academic_degree_education_requirements_education_type",
    "region_industry",
    "premium_type_work_hours",
    "inner_info_source_type_source",
    "industry_profession",
    "region_profession",
    "date_posted",
    "year_quarter",
]
categorical_features = raw_categorical_features + engineered_categorical_features

# Free-text columns. Each column is listed exactly once
# ("additional_info" used to appear twice).
text_features = [
    "additional_info",
    "career_perspective",
    "job_benefits_other_benefits",
    "education_requirements_speciality",
    "job_location_additional_address_info",
    "job_location_address",
    "title",
    "retraining_condition",
    "responsibilities",
    "requirements_required_certificates",
    "requirements_qualifications",
]

In [None]:
# Every column that will be category-encoded: categorical first, then binary.
all_cat_features = [*categorical_features, *binary_features]

In [None]:
# Separate the salary columns from the features; mean_salary is the target.
salary_columns = ["base_salary_min", "base_salary_max", "mean_salary"]
y_train = train["mean_salary"]
X_train = train.drop(columns=salary_columns)
X_test = test.drop(columns=salary_columns)

In [None]:
# Columns present in the training features but absent from the test features.
set(X_train.columns) - set(X_test.columns)

In [None]:
# Number of columns in the test and training feature frames.
len(X_test.columns), len(X_train.columns)

In [None]:
# Print every training column whose name contains "driver".
driver_columns = [name for name in X_train.columns if "driver" in name]
for feature in driver_columns:
    print(feature)

In [None]:
# Persist the prepared features and target. A later cell reads train.csv,
# test.csv and y_train.csv back from this folder, so all three are written
# here (previously only train.csv was saved).
X_train.to_csv("/content/drive/MyDrive/payroll/ready_data/train.csv")
X_test.to_csv("/content/drive/MyDrive/payroll/ready_data/test.csv")
# y_train is the "mean_salary" series, so the CSV gets a "mean_salary" column.
y_train.to_csv("/content/drive/MyDrive/payroll/ready_data/y_train.csv")

In [None]:
# The intermediate train/test frames are no longer needed; release them.
del train, test
gc.collect()

In [None]:
from category_encoders.cat_boost import CatBoostEncoder

# Fit a CatBoost encoder on the training features/target for every column in
# all_cat_features and encode the training set in the same call.
encoder_options = {"verbose": 1, "cols": all_cat_features}
cbe = CatBoostEncoder(**encoder_options)
X_train_encoded = cbe.fit_transform(X_train, y_train)

In [None]:
# Encode the test features with the encoder fitted on the training data.
X_test_encoded = cbe.transform(X_test)

In [None]:
# Display the encoded test frame.
X_test_encoded

In [None]:
# Save both encoded frames to the ready_data folder.
encoded_outputs = (
    (X_train_encoded, "/content/drive/MyDrive/payroll/ready_data/X_train_encoded.csv"),
    (X_test_encoded, "/content/drive/MyDrive/payroll/ready_data/X_test_encoded.csv"),
)
for encoded_frame, csv_path in encoded_outputs:
    encoded_frame.to_csv(csv_path)


# Second Round Feature Engineering

In [None]:
# Mount Google Drive again (this section can be run in a fresh session).
from google.colab import drive
drive.mount("/content/drive")
# Project root on the mounted drive; note the trailing slash.
ROOT = "/content/drive/MyDrive/payroll/"

In [None]:
import pandas as pd

# Reload the prepared features and target; the first CSV column is the index.
ready_dir = f"{ROOT}ready_data/"
X_train = pd.read_csv(f"{ready_dir}train.csv", index_col=0)
X_test = pd.read_csv(f"{ready_dir}test.csv", index_col=0)
y = pd.read_csv(f"{ready_dir}y_train.csv", index_col=0)

In [None]:
# Display the reloaded training features.
X_train

In [None]:
# Combined feature "<year>_<month>", added to both train and test.
for frame in (X_train, X_test):
    frame["year_month"] = frame["year"].astype("string") + "_" + frame["month"].astype("string")

In [None]:
# Combined feature "<year>_<season>", added to both train and test.
for frame in (X_train, X_test):
    frame["year_season"] = frame["year"].astype("string") + "_" + frame["season"].astype("string")

In [None]:
# Combined feature "<federal district>_<profession>", added to both frames.
for frame in (X_train, X_test):
    district_text = frame["federal_district"].astype("string")
    profession_text = frame["profession"].astype("string")
    frame["federal_district_profession"] = district_text + "_" + profession_text

In [None]:
# Combined feature "<education type>_<industry>", added to both frames.
for frame in (X_train, X_test):
    education_text = frame["education_requirements_education_type"].astype("string")
    industry_text = frame["industry"].astype("string")
    frame["education_requirements_education_type_industry"] = education_text + "_" + industry_text

In [None]:
# Combined feature "<education type>_<profession>", added to both frames.
for frame in (X_train, X_test):
    education_text = frame["education_requirements_education_type"].astype("string")
    profession_text = frame["profession"].astype("string")
    frame["education_requirements_education_type_profession"] = education_text + "_" + profession_text

## More Target Encoding

In [None]:
from tqdm import tqdm

In [None]:
# Temporarily attach the target so group statistics can be computed on it.
# The assignment aligns on the index, so X_train and y must share index labels.
X_train["mean_salary"] = y["mean_salary"]

In [None]:
# Per-category maximum of mean_salary, computed on train and left-joined onto
# both train and test as "max_<column>_mean_salary".
values = [
    "accommodation_housing",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "region",
    "industry",
    "profession",
    "work_hours",
    "job_benefits",
    "source",
    "year_quarter",
    "year_month",
    "year_season",
    "federal_district_profession",
    "education_requirements_education_type_industry",
    "education_requirements_education_type_profession",
]
for value in tqdm(values):
    stat_column = f"max_{value}_mean_salary"
    group_stat = X_train.groupby([value])["mean_salary"].max().reset_index(name=stat_column)
    X_train = X_train.merge(group_stat, how="left", on=[value])
    X_test = X_test.merge(group_stat, how="left", on=[value])


In [None]:
# Run a garbage-collection pass after the merges above.
import gc
gc.collect()

In [None]:
# Per-category minimum of mean_salary, computed on train and left-joined onto
# both train and test as "min_<column>_mean_salary".
values = [
    "accommodation_housing",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "region",
    "industry",
    "profession",
    "work_hours",
    "job_benefits",
    "source",
    "year_quarter",
    "year_month",
    "year_season",
    "federal_district_profession",
    "education_requirements_education_type_industry",
    "education_requirements_education_type_profession",
]
for value in tqdm(values):
    stat_column = f"min_{value}_mean_salary"
    group_stat = X_train.groupby([value])["mean_salary"].min().reset_index(name=stat_column)
    X_train = X_train.merge(group_stat, how="left", on=[value])
    X_test = X_test.merge(group_stat, how="left", on=[value])

In [None]:
# Print every training column whose name contains "max".
max_columns = [name for name in X_train.columns if "max" in name]
for column in max_columns:
    print(column)

In [None]:
# Amplitude (max - min) of mean_salary per category, derived from the
# previously merged max_/min_ columns on both train and test.
values = [
    "accommodation_housing",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "region",
    "industry",
    "profession",
    "work_hours",
    "job_benefits",
    "source",
    "year_quarter",
    "year_month",
    "year_season",
    "federal_district_profession",
    "education_requirements_education_type_industry",
    "education_requirements_education_type_profession",
]
for value in tqdm(values):
    max_column = f"max_{value}_mean_salary"
    min_column = f"min_{value}_mean_salary"
    amplitude_column = f"amplitude_{value}_mean_salary"
    X_train[amplitude_column] = X_train[max_column] - X_train[min_column]
    X_test[amplitude_column] = X_test[max_column] - X_test[min_column]


In [None]:
# Columns introduced in the second round, i.e. those that did not already get
# mean/std/median statistics in the first round. Kept as an ordered list
# (instead of a set difference) so that the loops below create the new columns
# in the same order on every run.
second_round_values = [
    "accommodation_housing",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "region",
    "industry",
    "profession",
    "work_hours",
    "job_benefits",
    "source",
    "year_quarter",
    "year_month",
    "year_season",
    "federal_district_profession",
    "education_requirements_education_type_industry",
    "education_requirements_education_type_profession",
]
first_round_values = {
    "accommodation_housing",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "region",
    "industry",
    "profession",
    "work_hours",
    "job_benefits",
    "source",
    "year_quarter",
}
new_values = [
    value for value in second_round_values if value not in first_round_values
]

In [None]:
# Display the columns that are new in the second round.
new_values

In [None]:
# Per-category mean of mean_salary for the second-round columns, joined onto
# train and test as "mean_<column>_mean_salary".
for value in tqdm(new_values):
    stat_column = f"mean_{value}_mean_salary"
    group_stat = X_train.groupby([value])["mean_salary"].mean().reset_index(name=stat_column)
    X_train = X_train.merge(group_stat, how="left", on=[value])
    X_test = X_test.merge(group_stat, how="left", on=[value])


In [None]:
# Per-category standard deviation of mean_salary for the second-round columns,
# joined onto train and test as "std_<column>_mean_salary".
for value in tqdm(new_values):
    stat_column = f"std_{value}_mean_salary"
    group_stat = X_train.groupby([value])["mean_salary"].std().reset_index(name=stat_column)
    X_train = X_train.merge(group_stat, how="left", on=[value])
    X_test = X_test.merge(group_stat, how="left", on=[value])


In [None]:
# Per-category median of mean_salary for the second-round columns, joined onto
# train and test as "median_<column>_mean_salary".
for value in tqdm(new_values):
    stat_column = f"median_{value}_mean_salary"
    group_stat = X_train.groupby([value])["mean_salary"].median().reset_index(name=stat_column)
    X_train = X_train.merge(group_stat, how="left", on=[value])
    X_test = X_test.merge(group_stat, how="left", on=[value])


In [None]:
# Display the training frame with the second-round features.
X_train

In [None]:
# Save the second-round train and test frames.
round_two_outputs = (
    (X_train, f"{ROOT}ready_data/train_2.csv"),
    (X_test, f"{ROOT}ready_data/test_2.csv"),
)
for round_two_frame, csv_path in round_two_outputs:
    round_two_frame.to_csv(csv_path)

## Second encoding

In [None]:
# Register the second-round columns for categorical encoding. Names already
# present are skipped, so re-running this cell does not append duplicates.
all_cat_features.extend(
    value for value in list(new_values) if value not in all_cat_features
)

In [None]:
# Remove the temporarily attached target from the features, then collect.
X_train.drop(columns=["mean_salary"], inplace=True)
gc.collect()

In [None]:
!pip install category-encoders
from category_encoders.cat_boost import CatBoostEncoder

# Fit a new CatBoost encoder on the second-round training features, using
# mean_salary as the target, and encode the training set in the same call.
encoder_options = {"verbose": 1, "cols": all_cat_features}
cbe = CatBoostEncoder(**encoder_options)
train_target = y["mean_salary"]
X_train_encoded = cbe.fit_transform(X_train, train_target)

In [None]:
# Encode the second-round test features with the encoder fitted on train.
X_test_encoded = cbe.transform(X_test)

In [None]:
# Save the second-round encoded frames. ROOT already ends with "/", so no
# extra separator is inserted (this avoids "payroll//ready_data" paths and
# matches how the other ready_data files are addressed).
X_train_encoded.to_csv(f"{ROOT}ready_data/X_train_encoded_2.csv")
X_test_encoded.to_csv(f"{ROOT}ready_data/X_test_encoded_2.csv")