# Libraries

In [None]:
!pip install vecstack
!pip install lightgbm
!pip install catboost
!pip install autoxgb
!pip install optuna
!pip install category_encoders
!pip install transformers

In [None]:
# 1. Imputing
from sklearn.impute import SimpleImputer, KNNImputer

# 2. Scaling
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer

# Categorical Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import CatBoostEncoder, CountEncoder

# Extracting embeddings from text
from transformers import AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import TfidfVectorizer

# Clustering
from sklearn.cluster import KMeans


# Feature Selection
from sklearn.feature_selection import SelectFromModel, RFE

# Dimensionality reduction
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA, NMF

# Ensembling
from sklearn.ensemble import StackingRegressor, VotingRegressor
from vecstack import StackingTransformer

# CV
from sklearn.model_selection import KFold

# Models
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from autoxgb import AutoXGB
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor


# Nets
import torch
from torch import nn
from torch import functional as F

# Utils
from sklearn.pipeline import Pipeline, FeatureUnion

# Essentials
import numpy as np
import pandas as pd

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from tqdm import tqdm

# Data

In [None]:
# Mount Google Drive inside the Colab runtime so the competition files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Folder on the mounted drive that holds the CSV files read below.
DATA = "/content/drive/MyDrive/payroll/data"

# Load the train split, the test split and the sample submission file.
train = pd.read_csv(f'{DATA}/train.csv')
test = pd.read_csv(f'{DATA}/test.csv')
# NOTE(review): the variable name is spelled "sumbmission"; it is kept as-is
# because code outside this cell may refer to it.
sample_sumbmission = pd.read_csv(f'{DATA}/sample_submission.csv')

In [None]:
# Bare expression: the notebook renders the training frame as this cell's output.
train

# Feature types

In [None]:
# Column holding the row identifier.
id_features = ["id"]

# Salary columns grouped as targets.
target_features = [
    "base_salary_max",
    "base_salary_min",
    "base_salary",
    "mean_salary",
]

# Columns grouped as continuous. The name keeps its original spelling
# ("continous") so existing references keep working.
continous_features = [
    "experience_requirements",
    "premium_size",
    "retraining_grant_value",
    "work_places",
]

# Columns grouped as binary flags (order preserved from the original listing).
binary_features = [
    "accommodation_capability",
    "caring_workers",
    "date_creation_mistake",
    "disabled",
    "dms",
    "driver_licence_A",
    "driver_licence_B",
    "driver_licence_C",
    "driver_licence_D",
    "driver_licence_E",
    "is_uzbekistan_recruitment",
    "large_families",
    "minor_workers",
    "need_medcard",
    "workers_with_disabled_children",
    "vouchers_health_institutions",
    "single_parent",
    "retraining_grant",
    "requirements_id_priority_category",
    "released_persons",
    "payment_sports_activities",
    "payment_meals",
]

# Columns grouped as categorical (order preserved from the original listing).
# NOTE(review): "accommodation_capability" is also listed in binary_features;
# confirm which group it belongs to.
categorical_features = [
    "accommodation_housing",
    "drive_licences",
    "education_academic_degree",
    "education_requirements_education_type",
    "employment_type",
    "federal_district",
    "incentive_compensation_transport_compensation",
    "industry",
    "inner_info_contact_source",
    "inner_info_source_type",
    "inner_info_status",
    "job_benefits",
    "metro_station",
    "accommodation_capability",
    "work_hours",
    "source",
    "social_protecteds_social_protected",
    "region",
    "profession",
    "premium_type",
    "organization",
    "okso_code",
    "id_hiring_organization",
]

# Free-text columns. Each name appears exactly once: the original listing
# repeated 'additional_info' and 'career_perspective', and selecting a frame
# with a list that contains repeats produces duplicated columns, so
# frame['additional_info'] would return a two-column frame instead of a Series.
text_features = [
    'additional_info',
    'career_perspective',
    'job_benefits_other_benefits',
    'education_requirements_speciality',
    'title',
    'retraining_condition',
    'responsibilities',
    'requirements_required_certificates',
    'requirements_qualifications',
]

# Columns describing where the job is located (address text and coordinates).
geo_features = [
    "job_location_additional_address_info",
    "job_location_address",
    "job_location_geo_latitude",
    "job_location_geo_longitude",
]

# Date and time columns.
time_features = [
    "date_creation",
    "date_posted",
    "date_time_posted",
    "time_posted",
]

# Merge train and test

In [None]:
# Stack the train and test rows into one frame so preprocessing is applied to
# both at once. DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0; pd.concat is the supported equivalent and, like append, keeps each
# frame's own row index.
whole_data = pd.concat([train, test])
# Bare expression: the notebook renders the merged frame as the cell output.
whole_data

In [None]:
import gc

# Drop the references to the separate splits, then run a collection pass.
del train, test
gc.collect()

# Fill empty Text Data

In [None]:
# Take the text columns as an independent copy so that the column assignments
# made on text_data later do not trigger pandas' SettingWithCopyWarning.
# dict.fromkeys drops repeated names from text_features while keeping their
# order, so every text column is selected exactly once.
text_data = whole_data[list(dict.fromkeys(text_features))].copy()


In [None]:
# whole_data is deliberately NOT deleted here: later cells in this file still
# read and assign its columns (job_location_additional_address_info,
# job_benefits, premium_type, ...), so deleting it raises NameError when the
# file is run top to bottom.
gc.collect()

##responsibilities


In [None]:
# Fill missing responsibilities with a Russian placeholder ("text is missing").
text_data['responsibilities'] = text_data['responsibilities'].fillna('текст отсутствует')
# Cell output: number of values still missing after the fill.
text_data['responsibilities'].isna().sum()

In [None]:
# Fill missing qualification requirements with a Russian placeholder
# ("requirements are missing").
text_data['requirements_qualifications'] = text_data['requirements_qualifications'].fillna('требования отсутствуют')
# Cell output: number of values still missing after the fill.
text_data['requirements_qualifications'].isna().sum()

##additional_info

In [None]:
# Fill missing additional info with a Russian placeholder ("information is missing").
text_data['additional_info'] = text_data['additional_info'].fillna('информация отсутствует')
# Cell output: number of values still missing after the fill.
text_data['additional_info'].isna().sum()

##job_location_additional_address_info


In [None]:
# Fill missing extra address info with a Russian placeholder ("data is missing").
whole_data['job_location_additional_address_info'] = whole_data['job_location_additional_address_info'].fillna('данные отсутствуют')
# Cell output: number of values still missing after the fill.
whole_data['job_location_additional_address_info'].isna().sum()

##job_benefits_other_benefits

In [None]:
# Fill missing "other benefits" text with a Russian placeholder ("absent").
# NOTE(review): this column is listed in text_features, and text_data was taken
# from whole_data before this fill, so text_data does not receive it - confirm
# which frame downstream code reads.
whole_data['job_benefits_other_benefits'] = whole_data['job_benefits_other_benefits'].fillna('отсутствуют')
# Cell output: number of values still missing after the fill.
whole_data['job_benefits_other_benefits'].isna().sum()

##education_requirements_education_type

In [None]:
# Fill a missing education type with a Russian placeholder ("not required").
whole_data['education_requirements_education_type'] = whole_data['education_requirements_education_type'].fillna('не требуется')

## education_academic_degree

In [None]:
# Inspect the column before it is imputed in the next cell.
# NOTE(review): `explore` is not defined or imported in the visible part of
# this file - confirm it exists before this cell runs.
explore(whole_data["education_academic_degree"])

In [None]:
# Fill a missing academic degree with a Russian placeholder ("not filled in").
whole_data["education_academic_degree"] = whole_data["education_academic_degree"].fillna("Незаполнено")

## retraining_condition

In [None]:
# Inspect the column before it is imputed in the next cell.
# NOTE(review): `explore` is not defined or imported in the visible part of
# this file - confirm it exists before this cell runs.
explore(whole_data["retraining_condition"])

In [None]:
# Fill a missing retraining condition with a Russian placeholder
# ("conditions are missing").
# NOTE(review): this column is listed in text_features, and text_data was taken
# from whole_data before this fill, so text_data does not receive it - confirm
# which frame downstream code reads.
whole_data["retraining_condition"] = whole_data["retraining_condition"].fillna("Условия отсутствует")

## job_benefits

In [None]:
# Inspect the column before it is imputed in the next cell.
# NOTE(review): `explore` is not defined or imported in the visible part of
# this file - confirm it exists before this cell runs.
explore(whole_data["job_benefits"])

In [None]:
# Fill missing job benefits with a Russian placeholder ("no benefits").
whole_data["job_benefits"] = whole_data["job_benefits"].fillna("Привелегии отсутствуют")

## requirements_required_certificates

In [None]:
# Inspect the column before it is imputed in the next cell.
# NOTE(review): `explore` is not defined or imported in the visible part of
# this file - confirm it exists before this cell runs.
explore(whole_data["requirements_required_certificates"])

In [None]:
# Fill missing required certificates with a Russian placeholder
# ("requirements are missing").
# NOTE(review): this column is listed in text_features, and text_data was taken
# from whole_data before this fill, so text_data does not receive it - confirm
# which frame downstream code reads.
whole_data["requirements_required_certificates"] =  whole_data["requirements_required_certificates"].fillna("требования отсутствуют")

## career_perspective

In [None]:
# Inspect the column before it is imputed in the next cell.
# NOTE(review): `explore` is not defined or imported in the visible part of
# this file - confirm it exists before this cell runs.
explore(whole_data["career_perspective"])

In [None]:
# Fill a missing career perspective with a Russian placeholder ("no prospects").
# NOTE(review): this column is listed in text_features, and text_data was taken
# from whole_data before this fill, so text_data does not receive it - confirm
# which frame downstream code reads.
whole_data["career_perspective"] = whole_data["career_perspective"].fillna("перспективы отсутствуют")

## premium_type

In [None]:
# Inspect the column before it is imputed below.
# NOTE(review): `explore` is not defined or imported in the visible part of
# this file - confirm it exists before this cell runs.
explore(whole_data["premium_type"])

In [None]:
# Cell output: how often each distinct premium_type value occurs.
whole_data["premium_type"].value_counts()

In [None]:
# Fill a missing premium type with a Russian placeholder ("bonus not specified").
whole_data["premium_type"] = whole_data["premium_type"].fillna("Премия не оговаривается")

## requirements_id_priority_category

In [None]:
# Inspect the column before it is imputed in the next cell.
# NOTE(review): `explore` is not defined or imported in the visible part of
# this file - confirm it exists before this cell runs.
explore(whole_data["requirements_id_priority_category"])

In [None]:
# Fill a missing priority category with a Russian label ("any personnel").
# NOTE(review): this column is listed in binary_features, yet it is filled with
# a text label here - confirm the intended type of the column.
whole_data["requirements_id_priority_category"] = whole_data["requirements_id_priority_category"].fillna("Любые кадры")

## education_requirements_speciality

In [None]:
# Inspect the column before it is imputed in the next cell.
# NOTE(review): `explore` is not defined or imported in the visible part of
# this file - confirm it exists before this cell runs.
explore(whole_data["education_requirements_speciality"])

In [None]:
# Fill a missing speciality with a Russian placeholder
# ("no special education required").
# NOTE(review): this column is listed in text_features, and text_data was taken
# from whole_data before this fill, so text_data does not receive it - confirm
# which frame downstream code reads.
whole_data["education_requirements_speciality"] = whole_data["education_requirements_speciality"].fillna("Не требуется специальное образование")

# Obtain Text embeddings

In [None]:
import sys
import pickle
import pandas as pd
import numpy as np
import torch
import gc
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader

In [None]:
def clean_sentence(sentence):
    """Lower-case *sentence* and blank out every non-letter character.

    Each character for which ``str.isalpha`` is false (digits, punctuation,
    whitespace) becomes a single space. Runs of spaces are not collapsed.
    """
    lowered = sentence.lower()
    return ''.join(ch if ch.isalpha() else ' ' for ch in lowered)

In [None]:
# Bare expression: the notebook renders the text frame as this cell's output.
text_data

In [None]:
# Normalise every responsibilities text with clean_sentence (lower-case,
# non-letters replaced by spaces) before it is tokenized.
text_data['responsibilities'] = text_data['responsibilities'].apply(clean_sentence)

In [None]:
# Cell output: how often each distinct responsibilities text occurs.
# NOTE(review): this reads whole_data, whereas the cleaning in the previous
# cell was applied to text_data - confirm which frame was meant.
whole_data['responsibilities'].value_counts()

In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence")

In [None]:
# Move the encoder to the GPU when one is available; fall back to the CPU so
# this cell does not raise on a runtime without CUDA.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
def tokenize(tokenizer, sentences: list, seq_len) -> tuple:
    """Encode each sentence into fixed-length token ids and attention masks.

    Args:
        tokenizer: Object exposing ``encode_plus`` (in this file, the tokenizer
            loaded with ``AutoTokenizer.from_pretrained``).
        sentences: Texts to encode, one string per item.
        seq_len: Length every sequence is truncated or padded to.

    Returns:
        ``(input_ids, attention_masks)``: two lists that hold, per sentence,
        the ``input_ids`` and ``attention_mask`` entries returned by the
        tokenizer (PyTorch tensors, because ``return_tensors="pt"``).
    """
    input_ids = []
    attention_masks = []
    for row in tqdm(sentences):
        encoded_dict = tokenizer.encode_plus(
            row,
            add_special_tokens=True,
            max_length=seq_len,
            truncation=True,
            # Replaces the deprecated `pad_to_max_length=True`: pad every
            # sequence up to `max_length`.
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return input_ids, attention_masks

In [None]:
# Run a garbage-collection pass before the embedding step that follows.
gc.collect()

In [None]:
import os

embedding_mappers = []

# Embed every distinct text only once; the resulting dict maps text -> embedding.
unique_values = text_data['responsibilities'].unique()
unique_embeddings = dict()

# Tokenize to a fixed length of 200 tokens, then concatenate the per-text
# tensors along dim 0 into one tensor of ids and one of masks.
texts_input_ids, texts_attention_masks = tokenize(tokenizer, list(unique_values), 200)
texts_input_ids = torch.cat(texts_input_ids, dim=0)
texts_attention_masks = torch.cat(texts_attention_masks, dim=0)

dataset = TensorDataset(texts_input_ids, texts_attention_masks)
# batch_size=1 without shuffling: the i-th batch belongs to unique_values[i].
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)

# Send inputs to whichever device the model currently lives on instead of
# assuming CUDA.
device = next(model.parameters()).device

model.eval()
# Inference only: no_grad keeps autograd from recording a graph for each
# forward pass, which saves memory and time.
with torch.no_grad():
    # tqdm goes first in zip so the progress bar is exhausted (and closed)
    # when the loader runs out.
    for batch, text in zip(tqdm(dataloader), unique_values):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        embedding = model(input_ids, attention_mask)["pooler_output"].detach().cpu().numpy()
        unique_embeddings[text] = embedding

embedding_mappers.append(unique_embeddings)

# Create the output folder if it is missing so the dump below cannot fail with
# FileNotFoundError.
os.makedirs(f"{DATA}/text_embeddings", exist_ok=True)
with open(f"{DATA}/text_embeddings/responsibilities.pkl", "wb") as fout:
    pickle.dump(unique_embeddings, fout)