# Libraries

In [None]:
!pip install vecstack
!pip install lightgbm
!pip install catboost
!pip install autoxgb
!pip install optuna
!pip install category_encoders
!pip install transformers

In [None]:
# 1. Imputing
from sklearn.impute import SimpleImputer, KNNImputer

# 2. Scaling
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer

# Categorical Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import CatBoostEncoder, CountEncoder

# Extracting embeddings from text
from transformers import AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import TfidfVectorizer

# Clustering
from sklearn.cluster import KMeans


# Feature Selection
from sklearn.feature_selection import SelectFromModel, RFE

# Dimensionality reduction
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA, NMF

# Ensembling
from sklearn.ensemble import StackingRegressor, VotingRegressor
from vecstack import StackingTransformer

# CV
from sklearn.model_selection import KFold

# Models
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from autoxgb import AutoXGB
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor


# Nets
import torch
from torch import nn
from torch import functional as F

# Utils
from sklearn.pipeline import Pipeline, FeatureUnion

# Essentials
import numpy as np
import pandas as pd

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from tqdm import tqdm

# Data

In [None]:
from google.colab import drive

# Folder on the mounted Google Drive that holds the competition CSV files.
DATA = "/content/drive/MyDrive/payroll/data"
# Make Google Drive visible to the Colab runtime under /content/drive.
drive.mount("/content/drive")

In [None]:
# Load the three CSV files from the DATA folder.
train = pd.read_csv(filepath_or_buffer=f"{DATA}/train.csv")
test = pd.read_csv(filepath_or_buffer=f"{DATA}/test.csv")
sample_sumbmission = pd.read_csv(
    filepath_or_buffer=f"{DATA}/sample_submission.csv"
)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the training data.
train.info()

In [None]:
# Date/time columns; print the shape of each one in train.
time_features = [
    "date_creation",
    "date_posted",
    "date_time_posted",
    "time_posted",
]
for category in time_features:
    print(f"{category} --- {train[category].shape}")

#Classes of features

In [None]:
# Column groups used to route features through different preprocessing.
# Every column appears in at most one group.

# Identifier columns.
id_features = [
    "id",
    "id_hiring_organization",
]

# Salary columns (prediction targets).
target_features = [
    "base_salary_max",
    "base_salary_min",
    "base_salary",
    "mean_salary",
]

# Numeric columns.
continous_features = [
    "experience_requirements",
    "job_location_geo_latitude",
    "job_location_geo_longitude",
    "premium_size",
    "retraining_grant_value",
    "work_places",
]

# 0/1 flag columns.
# NOTE(review): the drive_licences cell reads lower-case columns
# ("driver_licence_a" ... "driver_licence_e") while this list uses upper-case
# suffixes - confirm which spelling the CSV really has.
# NOTE(review): "requirements_id_priority_category" is filled with a text
# placeholder in the missing-value section, so it may not be binary - confirm.
# NOTE(review): "date_creation_mistake" is deleted from whole_data before
# saving, so this entry is stale for the cleaned data set.
binary_features = [
    "accommodation_capability",
    "caring_workers",
    "date_creation_mistake",
    "disabled",
    "dms",
    "driver_licence_A",
    "driver_licence_B",
    "driver_licence_C",
    "driver_licence_D",
    "driver_licence_E",
    "is_uzbekistan_recruitment",
    "large_families",
    "minor_workers",
    "need_medcard",
    "workers_with_disabled_children",
    "vouchers_health_institutions",
    "single_parent",
    "retraining_grant",
    "requirements_id_priority_category",
    "released_persons",
    "payment_sports_activities",
    "payment_meals",
]

# Categorical columns.
# "accommodation_capability" used to be listed here as well; it is filled
# with 0 and already belongs to binary_features, so it is kept only there.
# NOTE(review): "okso_code" is deleted from whole_data before saving, so this
# entry is stale for the cleaned data set.
categorical_features = [
    "accommodation_housing",
    "drive_licences",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "federal_district",
    "incentive_compensation_transport_compensation",
    "industry",
    "inner_info_contact_source",
    "inner_info_source_type",
    "inner_info_status",
    "job_benefits",
    "metro_station",
    "work_hours",
    "source",
    "social_protecteds_social_protected",
    "region",
    "profession",
    "premium_type",
    "organization",
    "okso_code",
]

# Free-text columns.
# The duplicated "additional_info" / "career_perspective" entries were
# removed, as was "caring_workers" (a 0-filled flag that is already part of
# binary_features).
text_features = [
    "additional_info",
    "career_perspective",
    "job_benefits_other_benefits",
    "education_requirements_speciality",
    "job_location_additional_address_info",
    "job_location_address",
    "title",
    "retraining_condition",
    "responsibilities",
    "requirements_required_certificates",
    "requirements_qualifications",
]

# Date/time columns.
time_features = [
    "date_creation",
    "date_posted",
    "date_time_posted",
    "time_posted",
]

# Merge train and test

In [None]:
# Stack train and test so every cleaning step is applied to both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent. As with append, the original row
# labels are kept, so labels from train and test may repeat.
whole_data = pd.concat([train, test])
whole_data

# Fill missing values

In [None]:
# Fraction of missing values per train column (only columns that have any),
# displayed in ascending order.
null_counts = train.isna().sum()
a = dict(null_counts[null_counts != 0] / len(train))
columns = list(a)
percentage = list(a.values())
sorted(a.items(), key=lambda pair: pair[1])

##employment_type


In [None]:
# Replace missing employment types with the most frequent value.
imp_mf = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
whole_data["employment_type"] = imp_mf.fit_transform(
    whole_data["employment_type"].to_numpy()[:, np.newaxis]  # column vector
)
# Number of values still missing after imputation.
whole_data["employment_type"].isnull().sum()

##payment_sports_activities

In [None]:
# Missing flag -> 0.
whole_data["payment_sports_activities"] = whole_data["payment_sports_activities"].fillna(value=0)
# Number of values still missing.
whole_data["payment_sports_activities"].isnull().sum()

##single_parent

In [None]:
# Missing flag -> 0.
whole_data["single_parent"] = whole_data["single_parent"].fillna(value=0)
# Number of values still missing.
whole_data["single_parent"].isnull().sum()

##industry

In [None]:
# Missing industry -> empty string.
whole_data["industry"] = whole_data["industry"].fillna(value="")
# Number of values still missing.
whole_data["industry"].isnull().sum()

##work_hours

In [None]:
# Replace missing work hours with the most frequent value.
imp_mf = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
whole_data["work_hours"] = imp_mf.fit_transform(
    whole_data["work_hours"].to_numpy()[:, np.newaxis]  # column vector
)
# Number of values still missing after imputation.
whole_data["work_hours"].isnull().sum()

##retraining_grant


In [None]:
# Missing flag -> 0.
whole_data["retraining_grant"] = whole_data["retraining_grant"].fillna(value=0)
# Number of values still missing.
whole_data["retraining_grant"].isnull().sum()

##responsibilities


In [None]:
# Missing text -> placeholder (Russian for "text is absent").
whole_data["responsibilities"] = whole_data["responsibilities"].fillna(value="текст отсутствует")
# Number of values still missing.
whole_data["responsibilities"].isnull().sum()

##dms

In [None]:
# Missing flag -> 0.
whole_data["dms"] = whole_data["dms"].fillna(value=0)
# Number of values still missing.
whole_data["dms"].isnull().sum()

##workers_with_disabled_children


In [None]:
# Missing flag -> 0.
whole_data["workers_with_disabled_children"] = whole_data["workers_with_disabled_children"].fillna(value=0)
# Number of values still missing.
whole_data["workers_with_disabled_children"].isnull().sum()

##caring_workers

In [None]:
# Missing flag -> 0.
whole_data["caring_workers"] = whole_data["caring_workers"].fillna(value=0)
# Number of values still missing.
whole_data["caring_workers"].isnull().sum()

##requirements_qualifications

In [None]:
# Missing text -> placeholder (Russian for "no requirements").
whole_data["requirements_qualifications"] = whole_data["requirements_qualifications"].fillna(value="требования отсутствуют")
# Number of values still missing.
whole_data["requirements_qualifications"].isnull().sum()

##additional_info

In [None]:
# Missing text -> placeholder (Russian for "information is absent").
whole_data["additional_info"] = whole_data["additional_info"].fillna(value="информация отсутствует")
# Number of values still missing.
whole_data["additional_info"].isnull().sum()

##job_location_additional_address_info


In [None]:
# Missing text -> placeholder (Russian for "data is absent").
whole_data["job_location_additional_address_info"] = whole_data["job_location_additional_address_info"].fillna(value="данные отсутствуют")
# Number of values still missing.
whole_data["job_location_additional_address_info"].isnull().sum()

##inner_info_contact_source

In [None]:
# Replace missing contact sources with the most frequent value.
imp_mf = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
whole_data["inner_info_contact_source"] = imp_mf.fit_transform(
    whole_data["inner_info_contact_source"].to_numpy()[:, np.newaxis]  # column vector
)
# Number of values still missing after imputation.
whole_data["inner_info_contact_source"].isnull().sum()

##job_benefits_other_benefits

In [None]:
# Missing text -> placeholder (Russian for "absent").
whole_data["job_benefits_other_benefits"] = whole_data["job_benefits_other_benefits"].fillna(value="отсутствуют")
# Number of values still missing.
whole_data["job_benefits_other_benefits"].isnull().sum()

##education_requirements_education_type

In [None]:
# Missing category -> placeholder (Russian for "not required").
whole_data["education_requirements_education_type"] = whole_data["education_requirements_education_type"].fillna(value="не требуется")
# Number of values still missing.
whole_data["education_requirements_education_type"].isnull().sum()

## education_academic_degree

In [None]:
# Missing category -> placeholder (Russian for "not filled in").
whole_data["education_academic_degree"] = whole_data["education_academic_degree"].fillna(value="Незаполнено")
# Number of values still missing.
whole_data["education_academic_degree"].isnull().sum()

## retraining_grant_value

In [None]:
# Missing grant value -> mean of the observed (non-missing) values.
grant_value_mean = whole_data["retraining_grant_value"].dropna().mean()
whole_data["retraining_grant_value"] = whole_data["retraining_grant_value"].fillna(value=grant_value_mean)
# Number of values still missing.
whole_data["retraining_grant_value"].isnull().sum()

## retraining_condition

In [None]:
# Missing text -> placeholder (Russian for "conditions absent").
whole_data["retraining_condition"] = whole_data["retraining_condition"].fillna(value="Условия отсутствует")
# Number of values still missing.
whole_data["retraining_condition"].isnull().sum()

## job_benefits

In [None]:
# Missing category -> placeholder (Russian for "no privileges").
whole_data["job_benefits"] = whole_data["job_benefits"].fillna(value="Привелегии отсутствуют")
# Number of values still missing.
whole_data["job_benefits"].isnull().sum()

## metro_station

In [None]:
# Missing category -> placeholder (Russian for "data is absent").
whole_data["metro_station"] = whole_data["metro_station"].fillna(value="Данные отсутствуют")
# Number of values still missing.
whole_data["metro_station"].isnull().sum()

## requirements_required_certificates

In [None]:
# Missing text -> placeholder (Russian for "no requirements").
whole_data["requirements_required_certificates"] = whole_data["requirements_required_certificates"].fillna(value="требования отсутствуют")
# Number of values still missing.
whole_data["requirements_required_certificates"].isnull().sum()

## career_perspective

In [None]:
# Missing text -> placeholder (Russian for "no prospects").
whole_data["career_perspective"] = whole_data["career_perspective"].fillna(value="перспективы отсутствуют")
# Number of values still missing.
whole_data["career_perspective"].isnull().sum()

## premium_type

In [None]:
# Missing category -> placeholder (Russian for "bonus not specified").
whole_data["premium_type"] = whole_data["premium_type"].fillna(value="Премия не оговаривается")
# Number of values still missing.
whole_data["premium_type"].isnull().sum()

## premium_size 

In [None]:
# Rows whose premium_type is the "bonus not specified" placeholder get a
# premium_size of 0 wherever the size is missing.
# The write goes through .loc with a scalar value: the previous version
# modified a filtered copy and assigned the whole frame back through a
# boolean mask, which triggers SettingWithCopyWarning and relies on label
# alignment that breaks when row labels repeat (as they do after stacking
# train and test).
premium_unspecified = whole_data["premium_type"] == "Премия не оговаривается"
premium_size_missing = whole_data["premium_size"].isna()
whole_data.loc[premium_unspecified & premium_size_missing, "premium_size"] = 0

# Rows that still have no premium_size are dropped. An explicit copy keeps
# later column assignments from targeting a slice of the old frame.
# NOTE(review): this also removes such rows coming from test - confirm that
# is intended.
whole_data = whole_data[whole_data["premium_size"].notna()].copy()
# Number of values still missing (0 after the filter above).
whole_data["premium_size"].isna().sum()

## requirements_id_priority_category

In [None]:
# Missing value -> placeholder (Russian for "any personnel").
whole_data["requirements_id_priority_category"] = whole_data["requirements_id_priority_category"].fillna(value="Любые кадры")
# Number of values still missing.
whole_data["requirements_id_priority_category"].isnull().sum()

## need_medcard

In [None]:
# Industries for which a missing need_medcard flag is filled with 1.
# Fix: a missing comma used to fuse "SportsFitnessBeautySalons" and
# "HomePersonal" into one string, so neither industry ever matched.
# The duplicated "Medicine" entry was dropped (no effect in a set).
NEED_MEDCARD = {
    "Communal",
    "ForestAndHarvesting",
    "Food",
    "Education",
    "Medicine",
    "Safety",
    "Restaurants",
    "Transport",
    "SportsFitnessBeautySalons",
    "HomePersonal",
}

In [None]:
# Boolean array: True for rows whose industry is listed in NEED_MEDCARD.
mask = whole_data["industry"].isin(NEED_MEDCARD).to_numpy()
mask

In [None]:
# Fill missing need_medcard flags: 1 for industries in NEED_MEDCARD (rows
# selected by ``mask``), 0 for all other rows.
# Written with .loc instead of chained indexing (frame[col][rows] = value),
# which emits SettingWithCopyWarning and does not update the frame under
# pandas copy-on-write.
medcard_missing = whole_data["need_medcard"].isna()
whole_data.loc[medcard_missing & mask, "need_medcard"] = 1
whole_data.loc[medcard_missing & ~mask, "need_medcard"] = 0
# Number of values still missing.
whole_data["need_medcard"].isna().sum()

## education_requirements_speciality

In [None]:
# Missing text -> placeholder (Russian for "no special education required").
whole_data["education_requirements_speciality"] = whole_data["education_requirements_speciality"].fillna(value="Не требуется специальное образование")
# Number of values still missing.
whole_data["education_requirements_speciality"].isnull().sum()

## incentive_compensation_transport_compensation

In [None]:
# Missing category -> "NOT_PAID".
whole_data["incentive_compensation_transport_compensation"] = whole_data["incentive_compensation_transport_compensation"].fillna(value="NOT_PAID")
# Number of values still missing.
whole_data["incentive_compensation_transport_compensation"].isnull().sum()

## drive_licences

In [None]:
# Take the five per-category driving-licence flag columns.
one_hot_drive_licences = whole_data.loc[
    :,
    [
        "driver_licence_a",
        "driver_licence_b",
        "driver_licence_c",
        "driver_licence_d",
        "driver_licence_e",
    ],
]
one_hot_drive_licences

In [None]:
# Letter contributed by each licence flag column, in concatenation order.
licence_letters = {
    "driver_licence_a": "A",
    "driver_licence_b": "B",
    "driver_licence_c": "C",
    "driver_licence_d": "D",
    "driver_licence_e": "E",
}

# Combine the flags into one code per row, e.g. "BC"; rows with no flag set
# get "None". The code is built directly from the flags instead of
# overwriting them through chained indexing (frame[col][rows] = value), which
# emits SettingWithCopyWarning and has no effect under pandas copy-on-write.
# Any value other than 1 (0 or missing) contributes no letter, so a missing
# flag no longer turns the whole code into NaN.
licence_flags = one_hot_drive_licences[list(licence_letters)].eq(1).to_numpy()
whole_data["drive_licences"] = [
    "".join(
        letter
        for letter, held in zip(licence_letters.values(), row_flags)
        if held
    )
    or "None"
    for row_flags in licence_flags
]

## accommodation_housing

In [None]:
# Missing category -> "NONE".
whole_data["accommodation_housing"] = whole_data["accommodation_housing"].fillna(value="NONE")
# Number of values still missing.
whole_data["accommodation_housing"].isnull().sum()

##released_persons

In [None]:
# Missing flag -> 0.
whole_data["released_persons"] = whole_data["released_persons"].fillna(value=0)
# Number of values still missing.
whole_data["released_persons"].isnull().sum()

## minor_workers

In [None]:
# Missing flag -> 0.
whole_data["minor_workers"] = whole_data["minor_workers"].fillna(value=0)
# Number of values still missing.
whole_data["minor_workers"].isnull().sum()

## social_protecteds_social_protected

In [None]:
# Keep a reference to the column as loaded; the column in whole_data is
# replaced by a combined letter code further down.
social_protecteds_social_protected = whole_data["social_protecteds_social_protected"]


In [None]:
# Flag columns describing socially protected groups.
# The unused import of the private ``pandas._libs.hashtable.value_count``
# was removed: it is not part of the public API and is not available in
# every pandas version, so it could break this cell for no benefit.
# An explicit copy is taken because this frame is modified in later cells
# and must not be treated as a slice of whole_data.
one_hot_protected = whole_data[
    [
        "disabled",
        "released_persons",
        "single_parent",
        "large_families",
        "minor_workers",
        "workers_with_disabled_children",
        "caring_workers",
    ]
].copy()
# Distribution of the released_persons flag.
one_hot_protected["released_persons"].value_counts()

In [None]:
# Missing-value count for each protected-group flag column.
one_hot_protected.isnull().sum()

In [None]:
# Display the flags cast to int. The result is not assigned, so
# one_hot_protected itself is left unchanged.
# NOTE(review): if a persistent cast was intended, assign the result.
one_hot_protected.astype(int)

In [None]:
# Letter that replaces a set flag (== 1) in each protected-group column.
protected_letters = {
    "disabled": "A",
    "released_persons": "B",
    "single_parent": "C",
    "large_families": "D",
    "minor_workers": "E",
    "workers_with_disabled_children": "F",
    "caring_workers": "G",
}


def _flag_to_letter(flags, letter):
    """Return *flags* with 1 replaced by *letter* and 0 by ""; other values are kept."""
    return flags.astype(object).mask(flags == 1, letter).mask(flags == 0, "")


# Re-encode every flag column as its letter or an empty string.
# Whole columns are replaced through assign() instead of chained indexing
# (frame[col][rows] = value), which emits SettingWithCopyWarning and has no
# effect under pandas copy-on-write.
one_hot_protected = one_hot_protected.assign(
    **{
        column: _flag_to_letter(one_hot_protected[column], letter)
        for column, letter in protected_letters.items()
    }
)


In [None]:
# Concatenate the per-group values row by row, left to right in this order.
protected_order = [
    "disabled",
    "released_persons",
    "single_parent",
    "large_families",
    "minor_workers",
    "workers_with_disabled_children",
    "caring_workers",
]
protected_cats = one_hot_protected[protected_order[0]]
for protected_column in protected_order[1:]:
    protected_cats = protected_cats + one_hot_protected[protected_column]

In [None]:
# Overwrite the column with the combined protected-group code.
whole_data["social_protecteds_social_protected"] = protected_cats

In [None]:
# An empty combined code is labelled "None"; every other value is kept.
whole_data["social_protecteds_social_protected"] = whole_data["social_protecteds_social_protected"].map(
    lambda code: code if code != "" else "None"
)

In [None]:
# Number of missing values in the combined protected-group column.
whole_data["social_protecteds_social_protected"].isnull().sum()

##experience_requirements

In [None]:
# Missing experience requirement -> 0.
whole_data["experience_requirements"] = whole_data["experience_requirements"].fillna(value=0)
# Number of values still missing.
whole_data["experience_requirements"].isnull().sum()

##accommodation_capability

In [None]:
# Missing flag -> 0.
whole_data["accommodation_capability"] = whole_data["accommodation_capability"].fillna(value=0)
# Number of values still missing.
whole_data["accommodation_capability"].isnull().sum()

##is_uzbekistan_recruitment

In [None]:
# Missing flag -> 0.
whole_data["is_uzbekistan_recruitment"] = whole_data["is_uzbekistan_recruitment"].fillna(value=0)
# Number of values still missing.
whole_data["is_uzbekistan_recruitment"].isnull().sum()

##job_location_geo_latitude


In [None]:
# Missing latitude -> column mean.
latitude_mean = whole_data["job_location_geo_latitude"].mean()
whole_data["job_location_geo_latitude"] = whole_data["job_location_geo_latitude"].fillna(value=latitude_mean)
# Number of values still missing.
whole_data["job_location_geo_latitude"].isnull().sum()

##job_location_geo_longitude

In [None]:
# Missing longitude -> column mean.
longitude_mean = whole_data["job_location_geo_longitude"].mean()
whole_data["job_location_geo_longitude"] = whole_data["job_location_geo_longitude"].fillna(value=longitude_mean)
# Number of values still missing.
whole_data["job_location_geo_longitude"].isnull().sum()

##okso_code


In [None]:
# Remove the okso_code column.
whole_data = whole_data.drop(columns=["okso_code"])

##profession

In [None]:
# Missing profession -> 0.0.
whole_data["profession"] = whole_data["profession"].fillna(value=0.0)
# Number of values still missing.
whole_data["profession"].isnull().sum()

##oblast


In [None]:
def clean_sentence(sentence):
    """Lower-case *sentence* and turn every non-alphabetic character into a space."""
    return "".join(
        char if char.isalpha() else " " for char in sentence.lower()
    )

# Region key: first two space-separated words of the address, cleaned with
# clean_sentence and stripped of surrounding spaces.
whole_data["oblast"] = [
    clean_sentence(" ".join(address.split(" ")[:2])).strip()
    for address in whole_data["job_location_address"]
]

##federal_district

In [None]:
# Lookup table from a cleaned region name (the format stored in the "oblast"
# column: lower-case, punctuation replaced by spaces, first two words of the
# address) to the integer code written to "federal_district".
# NOTE(review): the codes look like federal-district ids; the numbering
# scheme itself is not defined in this file - confirm against its source.
map_of_fo = {"алтайский край": 5,
 "амурская область": 4,
 "архангельская область": 3,
 "астраханская область": 2,
 "белгородская область": 1,
 "брянская область": 1,
 "владимирская область": 1,
 "волгоградская область": 2,
 "вологодская область": 3,
 "воронежская область": 1,
 "г  байконур": 9,
 "г  москва": 1,
 "г  санкт петербург": 3,
 "г  севастополь": 2,
 "еврейская автономная": 4,
 "забайкальский край": 4,
 "ивановская область": 1,
 "иркутская область": 5,
 "кабардино балкарская республика": 8,
 "калининградская область": 3,
 "калужская область": 1,
 "камчатский край": 4,
 "карачаево черкесская республика": 8,
 "кемеровская область": 5,
 "кировская область": 7,
 "костромская область": 1,
 "краснодарский край": 2,
 "красноярский край": 5,
 "курганская область": 6,
 "курская область": 1,
 "ленинградская область": 3,
 "липецкая область": 1,
 "магаданская область": 4,
 "московская область": 1,
 "мурманская область": 3,
 "ненецкий автономный": 3,
 "нижегородская область": 7,
 "новгородская область": 3,
 "новосибирская область": 5,
 "омская область": 5,
 "оренбургская область": 7,
 "орловская область": 1,
 "пензенская область": 7,
 "пермский край": 7,
 "приморский край": 4,
 "псковская область": 3,
 "республика адыгея": 2,
 "республика алтай": 5,
 "республика башкортостан": 7,
 "республика бурятия": 4,
 "республика дагестан": 8,
 "республика ингушетия": 8,
 "республика калмыкия": 2,
 "республика карелия": 3,
 "республика коми": 3,
 "республика крым": 2,
 "республика марий": 7,
 "республика мордовия": 7,
 "республика саха": 4,
 "республика северная": 8,
 "республика татарстан": 7,
 "республика тыва": 5,
 "республика хакасия": 5,
 "ростовская область": 2,
 "рязанская область": 1,
 "самарская область": 7,
 "саратовская область": 7,
 "сахалинская область": 4,
 "свердловская область": 6,
 "смоленская область": 1,
 "ставропольский край": 8,
 "тамбовская область": 1,
 "тверская область": 1,
 "томская область": 5,
 "тульская область": 1,
 "тюменская область": 6,
 "удмуртская республика": 7,
 "ульяновская область": 7,
 "хабаровский край": 4,
 "ханты мансийский автономный": 6,
 "челябинская область": 6,
 "чеченская республика": 8,
 "чувашская республика": 7,
 "чукотский автономный": 4,
 "ямало ненецкий автономный": 6,
 "ярославская область": 1}

In [None]:
# Look up the code for every region key; keys absent from map_of_fo give NaN.
district_codes = whole_data["oblast"].map(map_of_fo)
whole_data["federal_district"] = district_codes

##date_creation_mistake

In [None]:
# Remove the date_creation_mistake column.
whole_data = whole_data.drop(columns=["date_creation_mistake"])

##date_posted_mistake

In [None]:
# Remove the date_posted_mistake column.
whole_data = whole_data.drop(columns=["date_posted_mistake"])

##base_salary

In [None]:
# Remove the base_salary column.
whole_data = whole_data.drop(columns=["base_salary"])

#Save Data

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned data.
whole_data.info()

In [None]:
# Write the cleaned data (row labels included) to the DATA folder.
whole_data.to_csv(path_or_buf=f"{DATA}/clean_whole_data.csv", index=True)