In [1]:
PATH = '..'
DATA_SOURCE = f'{PATH}/data/source'
DATA_PROCESSED = f'{PATH}/data/processed'

In [2]:
import os
import pandas as pd
import numpy as np
import polars as pl
from IPython.display import display, Markdown
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import json
from itertools import product

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

# case_2_reference_without_resume_sorted

In [3]:
with open(f'{DATA_SOURCE}/case_2_reference_without_resume_sorted.json') as f:
    case_2_reference_without_resume_sorted = json.load(f)

In [4]:
case_2_reference_without_resume_sorted.keys()

dict_keys(['vacancy', 'resumes'])

In [5]:
vacancy_test = pd.Series(case_2_reference_without_resume_sorted['vacancy'])

In [6]:
vacancy_test

uuid                        8b9c8d16-c7f0-38a2-b80c-d94030c15a6f
name                                            Java разработчик
keywords                  Kafka, Java, RxJava, Hystrix, MongoDB 
description    Требования: 4+ года опыта работы с Java 8+ или...
comment                                                     None
dtype: object

In [7]:
resume_test = pd.DataFrame(case_2_reference_without_resume_sorted['resumes'])

In [8]:
resume_test.sample(3).T

Unnamed: 0,94,25,49
uuid,0fe70aee-589a-3ba7-8479-d8f2f83b0921,42eae4bf-4826-3105-b197-dc0afb063714,c83fb1c9-2b90-3dab-9d53-05a883e3e0b7
first_name,Вениамин,Станислав,Олег
last_name,Фомин,Ситников,Орлова
birth_date,1986-01-16,1992-01-24,1992-10-04
country,Россия,Россия,Россия
city,Москва,Сочи,Санкт-Петербург
about,"Креативен – не редко, пытаюсь взглянуть на пр...",В 2018 году я сменил профессию. Причиной этог...,
key_skills,"Spring Framework, Hibernate ORM, SQL, PostgreS...",Git Java Spring Framework Apache Maven Spring ...,"Java, Kotlin, Spring Framework (MVC, Data), Sp..."
experienceItem,"[{'starts': '2023-01-01', 'ends': None, 'emplo...","[{'starts': '2021-11-28', 'ends': None, 'emplo...","[{'starts': '2023-11-01', 'ends': '2024-01-06'..."
languageItems,"[Русский, Английский]",,


# case_2_data_for_members

In [9]:
case_2_data_for_members = pd.read_json(f'{DATA_SOURCE}/case_2_data_for_members.json')

In [10]:
case_2_data_for_members.head()

Unnamed: 0,vacancy,failed_resumes,confirmed_resumes
0,{'uuid': '779f3a59-206a-3241-adc4-d7db504f960b...,[{'uuid': '74392e00-ecfb-335b-9fc1-c2652dca06e...,[{'uuid': '8c8cf797-2c6b-3f4b-b28b-20d57bd88b8...
1,{'uuid': '7a4813fc-43bc-3896-a607-4c8682b01002...,[{'uuid': '254487e1-81ba-3f2b-9f15-eba98d891ef...,[{'uuid': '23ca55a4-2257-3cbc-a34f-d5e1b98d8c2...
2,{'uuid': 'c03085c3-9b1e-3564-bb1e-59aa72e5fbca...,[{'uuid': '8746a855-022c-34d4-9b55-58da5483c25...,[{'uuid': '95cd87f6-0495-36e5-adad-0782a1ac435...
3,{'uuid': 'a8dd83c3-178d-3c70-90c2-7c3648f6b96a...,[{'uuid': '557c9b5b-9707-360b-bb1f-18c3c1b9439...,[{'uuid': '077836a8-16a8-34f1-a192-fe82ebc8bc9...
4,{'uuid': '9d98eba0-13bb-38d3-b742-4fd445954b3d...,[{'uuid': '821b6466-f3e2-37c9-b44f-676d91bde04...,[{'uuid': '2e517375-ff7d-3781-ae5c-2b0784dbc2b...


## Вакансии

In [11]:
# получение датафрейма вакансий
df_vacancy = pd.DataFrame.from_records(case_2_data_for_members['vacancy'])
df_vacancy = df_vacancy.append(vacancy_test, ignore_index=True)
df_vacancy = df_vacancy.rename({'name': 'title'}, axis=1)
df_vacancy['title'] = df_vacancy['title'].str.strip()
df_vacancy.to_pickle(f'{DATA_PROCESSED}/df_vacancy.pickle')

  df_vacancy = df_vacancy.append(vacancy_test, ignore_index=True)


In [12]:
df_vacancy.sample(3).T

Unnamed: 0,11,6,9
uuid,dfaf7cc8-3726-361e-b7cf-6d9746d6bf77,a8f56ed3-3ef3-365d-ade4-2df4db5d4af8,01713376-e04d-3f9d-9287-7a5ff74918c3
title,Тестировщик,Системный аналитик комманда Залоги,Системный аналитик финтех
keywords,,,
description,Грейд до 13 МСК/СПБ - 283000 Регионы - 233500...,Стрим занимается системой управления залогами...,"Системный аналитик (14 грейд middle, 15 грейд..."
comment,,,


## Резюме

In [13]:
def transform_resume(df_resume):
    df_resume = df_resume.copy()
    
    if 'languageItems' in df_resume.columns and 'languageItem' in df_resume.columns:
        df_resume['language'] = df_resume['languageItems'].fillna(df_resume['languageItem'])
    elif 'languageItems' in df_resume.columns:
        df_resume['language'] = df_resume['languageItems']
    else:
        df_resume['language'] = df_resume['languageItem']
        

    df_postions = df_resume \
        .set_index('uuid')['experienceItem'] \
        .explode() \
        .apply(pd.Series) \
        .reset_index()
    df_postions['position'] = df_postions['position'].str.strip()
    df_postions['description'] = df_postions['description'].str.strip()

    df_postions['ends'] = df_postions['ends'].fillna(df_postions['ends'].dropna().max())
    df_postions['starts'] = df_postions['starts'].fillna(df_postions['starts'].dropna().min())
    #df_postions = df_postions[~df_postions['starts'].isna()]
    df_postions['starts'] = pd.to_datetime(df_postions['starts'])
    df_postions['ends'] = pd.to_datetime(df_postions['ends'])
    df_postions['days'] = (df_postions['ends'] - df_postions['starts']).dt.days

    # description
    description = df_postions.groupby('uuid')['description'].agg(lambda x: '. \n'.join(x.fillna('').str.strip().tolist()))
    df_resume['all_description'] = df_resume['uuid'].map(description)

    # days
    days = df_postions.groupby('uuid').agg({'days': ['min', 'max', 'mean']})
    days.columns = [f'{c1}_{c2}' for c1, c2 in days.columns]
    days = days.reset_index()
    df_resume = df_resume.merge(days, on='uuid')

    # last
    df_postions['last_fl'] = df_postions['starts'] == df_postions.groupby('uuid')['starts'].transform('max')
    last = df_postions.query('last_fl').drop_duplicates(subset='uuid', keep='first')
    df_resume['last_description'] = df_resume['uuid'].map(last.set_index('uuid')['description'])
    df_resume['last_days'] = df_resume['uuid'].map(last.set_index('uuid')['days'])

    df_postions = df_postions.query('position != ""')
    df_postions['last_fl'] = df_postions['starts'] == df_postions.groupby('uuid')['starts'].transform('max')
    last = df_postions.query('last_fl').drop_duplicates(subset='uuid', keep='first')
    df_resume['title'] = df_resume['uuid'].map(last.set_index('uuid')['position']).fillna('')

    # education
    education = df_resume \
        .set_index('uuid')['educationItem'] \
        .explode() \
        .apply(pd.Series) \
        .drop(0, axis=1) \
        .reset_index() \
        .fillna('')
    clms = ['organization', 'faculty', 'specialty', 'result', 'education_level']
    education = education.groupby('uuid')[clms].agg(lambda x: '. '.join(x.str.strip().tolist()))
    education = education.agg(lambda x: '. '.join(x.tolist()), axis=1)
    for i in range(100):
        education = education.str.replace('\.\.', '.').replace('\. \.', '\.').str.replace('\. \.', '.')
    df_resume['education'] = df_resume['uuid'].map(education)
    df_resume = df_resume.drop(['experienceItem', 'educationItem'], axis=1)


    clms = ['all_description', 'key_skills', 'education', 'language']
    for col in clms:
        df_resume[col] = df_resume[col].fillna('')
    df_resume['description'] = df_resume.apply(lambda x: '. \n'.join([str(x[col]) for col in clms if x[col] != '']), axis=1)
    
    return df_resume

In [14]:
failed_resumes = pd.DataFrame.from_records(case_2_data_for_members['failed_resumes'].explode().values)
confirmed_resumes = pd.DataFrame.from_records(case_2_data_for_members['confirmed_resumes'].explode().values)

failed_resumes = transform_resume(failed_resumes)
confirmed_resumes = transform_resume(confirmed_resumes)
resume_test = transform_resume(resume_test)

  education = education.str.replace('\.\.', '.').replace('\. \.', '\.').str.replace('\. \.', '.')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_postions['last_fl'] = df_postions['starts'] == df_postions.groupby('uuid')['starts'].transform('max')
  education = education.str.replace('\.\.', '.').replace('\. \.', '\.').str.replace('\. \.', '.')
  education = education.str.replace('\.\.', '.').replace('\. \.', '\.').str.replace('\. \.', '.')


In [15]:
resume_test['title'].unique()

array(['Team Lead (Senior Java Developer)', 'Senior Java Developer',
       'Ведущий java разработчик', 'Ведущий разработчик',
       'Senior Backend Devloper', 'Java Software Engineer',
       'Senior java developer', 'Java-разработчик',
       'Java Software Developer',
       'Старший программист-разработчик Kotlin + Java',
       'Старший Java-разработчик', 'Ведущий инженер-программист',
       'Java Developer', 'Senior Software Developer Team Lead',
       'Senior Java Software Engineer', 'tech expert',
       'Главный инженер по разработке', 'Разработчик', 'Java разработчик',
       'Программист-разработчик', 'Senior java разработчик',
       'Middle Java Developer', 'Team Lead Java разработчик',
       'Старший разработчик', 'Team/Tech Lead, Senior Software Engineer',
       'Java/Kotlin Developer', 'Senior Backend Developer',
       'Старший инженер', 'Backend developer', 'Backend Tech Lead',
       'senior java developer', 'Software Engineer',
       'Lead backend developer', 

In [16]:
df_resume = pd.concat([
    failed_resumes, 
    confirmed_resumes, 
    resume_test,
], sort=False, ignore_index=True)

# одно такое найдено
mask = df_resume['title'] == ''
df_resume.loc[mask, 'title'] = 'Системный аналитик'

# save
df_resume.to_pickle(f'{DATA_PROCESSED}/df_resume.pickle')

In [17]:
df_resume.sample(3).T

Unnamed: 0,584,466,395
uuid,72e10137-dbd8-373c-a6aa-3babc25a20f2,da4c44dd-7c00-3f75-98b4-096b533488a4,6d2a37b5-5e4e-373d-881a-750af37f8284
first_name,Варвара,Изольда,Павел
last_name,Блинов,Дроздов,Никифорова
birth_date,1995-08-02,1996-01-01,
country,Россия,Россия,Россия
city,Москва,,Москва
about,"Увлекаюсь программированием, постоянно стараюс...",Занимаюсь разработкой бекенда и фронтенда веб...,
key_skills,,"Kotlin, TypeScript, Java, Vue.js, Spring Boot,...","Java 11, Kotlin, Spring Framework (Boot, WebFl..."
languageItems,,,
language,"[Русский, Английский]",,


In [18]:
df_resume['uuid'].duplicated().sum()

0

# Обучающая выборка

In [19]:
# создание обучающей выборки
case_2_data_for_members['uuid_vacancy'] = case_2_data_for_members['vacancy'].str['uuid']

func = lambda x: [item['uuid'] for item in x]
case_2_data_for_members['failed'] = case_2_data_for_members['failed_resumes'].apply(func)
case_2_data_for_members['confirmed'] = case_2_data_for_members['confirmed_resumes'].apply(func)

failed = case_2_data_for_members[['uuid_vacancy', 'failed']] \
    .explode('failed') \
    .assign(label=0) \
    .rename({'failed': 'uuid_resume'}, axis=1)

confirmed = case_2_data_for_members[['uuid_vacancy', 'confirmed']] \
    .explode('confirmed') \
    .assign(label=1) \
    .rename({'confirmed': 'uuid_resume'}, axis=1)

df_train = pd.concat([failed, confirmed], sort=False, ignore_index=True)
df_train['split'] = 'train'

df_test = pd.DataFrame(
    product([vacancy_test['uuid']], resume_test['uuid']), 
    columns=['uuid_vacancy', 'uuid_resume']
)
df_test['split'] = 'test'

df_pairs = pd.concat([df_train, df_test], sort=False, ignore_index=True)

In [20]:
df_train.groupby('uuid_resume')['uuid_vacancy'].agg(set).apply(len).max()

1

In [21]:
df_pairs.head()

Unnamed: 0,uuid_vacancy,uuid_resume,label,split
0,779f3a59-206a-3241-adc4-d7db504f960b,74392e00-ecfb-335b-9fc1-c2652dca06e5,0.0,train
1,779f3a59-206a-3241-adc4-d7db504f960b,2b5ad5e1-1f31-3f3f-8a66-43cd89233672,0.0,train
2,779f3a59-206a-3241-adc4-d7db504f960b,ea1ac51a-e16b-367a-9216-52fb64809db1,0.0,train
3,779f3a59-206a-3241-adc4-d7db504f960b,ecfc02a1-592c-3ed0-a801-1ad9ab3d30b8,0.0,train
4,779f3a59-206a-3241-adc4-d7db504f960b,aff6b6bd-89c2-3b2c-ab2e-0b9f76ac367c,0.0,train


In [22]:
# добавление валидации
np.random.seed(42)
mask = df_vacancy['title'].str.lower().str.contains('java')
val_uuid = df_vacancy[mask].sample(frac=0.3)['uuid'].tolist()
mask = df_pairs['uuid_vacancy'].isin(val_uuid)
df_pairs.loc[mask, 'split'] = 'val'

np.random.seed(42)
df_pairs = df_pairs.sample(frac=1).reset_index(drop=True)

# save
df_pairs.to_pickle(f'{DATA_PROCESSED}/df_pairs.pickle')

In [23]:
pd.concat([
    df_train.query('label == 1')['uuid_vacancy'].value_counts().to_frame('confirmed'),
df_train.query('label == 0')['uuid_vacancy'].value_counts().to_frame('failed'),
], axis=1)

Unnamed: 0,confirmed,failed
a8dd83c3-178d-3c70-90c2-7c3648f6b96a,23,26
3c4359e8-8e2f-3c51-9b62-562080e1b549,14,59
a8f56ed3-3ef3-365d-ade4-2df4db5d4af8,13,36
9d98eba0-13bb-38d3-b742-4fd445954b3d,12,11
01713376-e04d-3f9d-9287-7a5ff74918c3,11,11
259bf318-e6a7-3b6c-93f9-e1804a89ee63,10,7
f4a7096b-e1c4-3b4c-be6a-493eb29c6b0f,10,36
ab135980-4634-3579-8707-2f094b5579a9,9,15
61a5a940-c9f2-3f9f-bbda-9cf735697878,8,17
7a4813fc-43bc-3896-a607-4c8682b01002,8,16
