In [1]:
from graphservice.neoservice import neoconnection
import pandas as pd
from bs4 import BeautifulSoup
import unicodedata

In [98]:
df = pd.read_csv('data/Resume.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [99]:
def get_experience_descriptions(text_html):
    soup = BeautifulSoup(text_html)

    experiences_list = soup.find_all('span', class_='jobline')
    # print(experiences_list)
    if experiences_list:
        # print(True)
        experiences_list = [item.text.strip() for item in experiences_list ]
        experiences_list = [unicodedata.normalize("NFKD", item) for item in experiences_list if len(item)>10]
    
    return experiences_list
    

In [100]:
df['experiences'] = df['Resume_html'].apply(get_experience_descriptions)

In [101]:
df = df.explode('experiences')

In [102]:
df.drop(['Resume_str', 'Resume_html'], axis=1, inplace=True)

In [103]:
df.shape

(11472, 3)

In [106]:
set(df['Category'].values.tolist())

{'ACCOUNTANT',
 'ADVOCATE',
 'AGRICULTURE',
 'APPAREL',
 'ARTS',
 'AUTOMOBILE',
 'AVIATION',
 'BANKING',
 'BPO',
 'BUSINESS-DEVELOPMENT',
 'CHEF',
 'CONSTRUCTION',
 'CONSULTANT',
 'DESIGNER',
 'DIGITAL-MEDIA',
 'ENGINEERING',
 'FINANCE',
 'FITNESS',
 'HEALTHCARE',
 'HR',
 'INFORMATION-TECHNOLOGY',
 'PUBLIC-RELATIONS',
 'SALES',
 'TEACHER'}

In [83]:
import json
with open('data/extracted_resumes.json', 'r', encoding='utf-8') as file:
        data: dict = json.load(file)

In [91]:
from injection.cypher_queries import injections, esco_injection, skills_injection
from math import ceil
from tqdm import tqdm
from py2neo import Graph
from tools.dotenv_read import read_dotenv

graph = Graph(**read_dotenv('neo_'))
batch_size = 1
try:

    for k in data.keys():

        # logging.info(f'Injecting {k}...')
        statement = injections[k]
        node_data = data[k]

        n_batches = ceil(len(node_data) / batch_size)
        for b in tqdm(range(n_batches)):
            tx = graph.auto()
            tx.run(statement, {k: node_data[b * batch_size:(b + 1) * batch_size]})

except Exception as exc:
    raise exc

 26%|██▌       | 3002/11472 [00:24<01:08, 123.06it/s]


ClientError: [Request.InvalidFormat] Could not parse the incoming JSON

In [92]:
type(data[k][3002]['experience_description'])

float

In [94]:
df = pd.read_csv('data/Resume.csv')
df['experience_description'] = df['Resume_html'].apply(get_experience_descriptions)
df = df.explode('experience_description')
df.drop(['Resume_str', 'Resume_html'], axis=1, inplace=True)
df.dropna(inplace=True)

In [96]:
data = {
            'experiences': df.to_dict(orient='records')
        }

In [97]:
data['experiences'][3002]

{'ID': 91467795,
 'Category': 'BUSINESS-DEVELOPMENT',
 'experience_description': 'SGW Pharma Marketing is a B2B life science marketing agency focused on connecting drug development companies with target audiences (development partners, investors, service providers etc.) by developing and managing integrated, full circle marketing programs .   Oversaw business, client and vendor negotiations.  Effectively directed internal marketing, communications and sales support staff.  Improved marketing plans, sales strategies and customer relations to maximize business development.  Leveraged industry trends in client markets to shape value-added solutions and approaches for key audiences.  Streamlined operational efficiencies, developed sales tracking reports for planning by executive team.  Maintained strong understanding of competitors, their offerings and their presence across globe.'}

In [95]:
df.iloc[3002]

ID                                                                 91467795
Category                                               BUSINESS-DEVELOPMENT
experience_description    SGW Pharma Marketing is a B2B life science mar...
Name: 657, dtype: object

In [76]:
df[df['Category'].isnull()]

Unnamed: 0,ID,Category,experiences


In [None]:
df.to_dict(orient='records')

In [3]:
df = pd.read_csv('data/Resume.csv')
df.head(3)

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [4]:
df_shuf = df.sample(frac=1, random_state=2022).reset_index(drop=True)

In [5]:
df_final = pd.DataFrame()
for categ, item in df_shuf.groupby('Category'):
    df_final = pd.concat([df_final, item.iloc[:10]])


In [6]:
df_final.shape

(240, 4)

In [7]:
frm_ids = df_final['ID'].values.tolist()


In [8]:
from evaluation.evaluation_setup import Evaluator
evl = Evaluator()

In [None]:
formulas = [f'formula{str(i+1)}' for i in range(4)]
cross_sector_options = [False, True]
for option in cross_sector_options:
    no_cross_sector = "_no_cross_sector" if option else ""
    for formula in formulas:
        
        test_scores = evl.get_sim_profiles_results(sourceIds=frm_ids, evaluation_path='tests/results.csv',
                                test_name=formula+no_cross_sector, sim_score_formula=formula,
                                no_cross_sector=option, topk=10)

In [2]:
import json
with open('data/extracted_resumes.json', 'r', encoding='utf-8') as file:
        data: dict = json.load(file)
    
data = pd.DataFrame(data['experiences'])
print(data.shape)
# data.reset_index(inplace=True)
# data.rename(columns={'index':'ind'}, inplace=True)
data.head(3)


(11468, 4)


Unnamed: 0,ind,ID,Category,experience_description
0,0,16852973,HR,"Helps to develop policies, directs and coordin..."
1,1,16852973,HR,Reviewed medical bills for the accuracy of the...
2,2,16852973,HR,"Performed duties including but not limited to,..."


In [11]:
import numpy as np
exp_embeddings = np.load('data/experiences_embedddings.npy')

data['emb'] = data.apply(lambda row: exp_embeddings[row[0]], axis=1)

In [11]:
q = data[data['ID'].isin([16852973])]['emb'].values
np.stack(q).shape

(4, 768)

In [12]:
exp_embeddings.shape

(11468, 768)

In [13]:
data.shape

(11468, 5)

In [14]:
data[data['experience_description'].apply(lambda x: True if len(x)>10 else False)]

Unnamed: 0,ind,ID,Category,experience_description,emb
0,0,16852973,HR,"Helps to develop policies, directs and coordin...","[0.037985392, -0.024676878, -0.01566813, -0.02..."
1,1,16852973,HR,Reviewed medical bills for the accuracy of the...,"[-0.001415999, -0.031405974, -0.009812766, -0...."
2,2,16852973,HR,"Performed duties including but not limited to,...","[0.030036539, -0.021883393, 0.011026299, 0.037..."
3,3,16852973,HR,Provided assistance to various department head...,"[0.04964292, -0.045114703, -0.024170047, -0.00..."
4,4,22323967,HR,Managed communication regarding launch of Oper...,"[0.025132444, 0.01740369, -0.020966815, -0.070..."
...,...,...,...,...,...
11463,11463,21190805,AVIATION,Collaborated with Customer Care / In-Flight Ma...,"[0.044099122, -0.033639565, -0.004084603, -0.0..."
11464,11464,37473139,AVIATION,Maintaining and reconciling inventory through ...,"[0.016481841, 0.022426397, -0.015682258, -0.00..."
11465,11465,37473139,AVIATION,Shipping: Fulfill orders with use of Infrared ...,"[0.015845729, -0.023523834, -0.02283112, -0.00..."
11466,11466,37473139,AVIATION,Performance areas include: Interviewing prospe...,"[-0.0073799593, 0.08391202, -0.01131346, -0.00..."


In [17]:
data.isna().values.any()

False

In [18]:
import numpy as np
q = np.load('data/ESCO/esco_embedddings.npy')

In [24]:
def get_experience_descriptions(text_html):
    soup = BeautifulSoup(text_html, "html.parser")

    experiences_list = soup.find_all('span', class_='jobline')
    # print(experiences_list)
    if experiences_list:
        # print(True)
        experiences_list = [item.text.strip() for item in experiences_list ]
        experiences_list = [str(unicodedata.normalize("NFKD", item)) for item in experiences_list if len(item)>10]
    
    return experiences_list


def extract_experiences(df):
    df['experience_description'] = df['Resume_html'].apply(get_experience_descriptions)
    df = df.explode('experience_description')
    df.drop(['Resume_str', 'Resume_html'], axis=1, inplace=True)
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    df.rename(columns={'index':'ind'}, inplace=True)

    return df

In [30]:
df = pd.read_csv('data/Resume.csv')

# df = extract_experiences(df)

# data = {
#     'experiences': df.to_dict(orient='records')
# }

In [31]:
df['experience_description'] = df['Resume_html'].apply(get_experience_descriptions)
df = df.explode('experience_description')
df.drop(['Resume_str', 'Resume_html'], axis=1, inplace=True)
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)
df.reset_index(inplace=True)
df.rename(columns={'index':'ind'}, inplace=True)

In [32]:
df.head(3)

Unnamed: 0,ind,ID,Category,experience_description
0,0,16852973,HR,"Helps to develop policies, directs and coordin..."
1,1,16852973,HR,Reviewed medical bills for the accuracy of the...
2,2,16852973,HR,"Performed duties including but not limited to,..."


In [1]:
from skills_extract.esco_skills_match import SkillExtractor
skill_extr_obj = SkillExtractor()

In [20]:
topk=5
texts = [
"Assistant in site supervision and design of interior fittings of a hotel.",


"Design of houses and gas stations in Autocad R12.",


"Process corporate actions in clients' holdings.",


"Assistant in HVAC design (Autocad), PEB / insulation calculations.",


"Co-ordination of interior finishes, preparation of shop drawings in Autocad 2000 for the fit-out of the Petronas Twin Towers(office building)."
]
emb_doc = skill_extr_obj.model.encode(texts)
D, i = skill_extr_obj.index.search(emb_doc, topk)
q = skill_extr_obj.get_skills_by_ids(i[4].tolist())[['reuseLevel', 'preferredLabel', 'description']]
q

Unnamed: 0,reuseLevel,preferredLabel,description
9899,sector-specific,create architectural sketches,Create architectural sketches for the design a...
10061,cross-sector,draw blueprints,"Draw layout specifications for machinery, equi..."
12080,cross-sector,develop design plans,Develop design plans by using computer-aided-d...
6429,cross-sector,make architectural mock-ups,Make a scale model that represents the vision ...
4451,occupation-specific,define set materials,"Make set construction drawings, define a prope..."


In [15]:
topk=20
text = """
Assistant in site supervision and design of interior fittings of a hotel.


Design of houses and gas stations in Autocad R12.


Process corporate actions in clients' holdings.


Assistant in HVAC design (Autocad), PEB / insulation calculations.


Co-ordination of interior finishes, preparation of shop drawings in Autocad 2000 for the fit-out of the Petronas Twin Towers(office building).
"""
emb_doc = skill_extr_obj.model.encode([text])
D, i = skill_extr_obj.index.search(emb_doc, topk)
q = skill_extr_obj.get_skills_by_ids(i[0].tolist())[['reuseLevel', 'preferredLabel', 'description']]
q

Unnamed: 0,reuseLevel,preferredLabel,description
4849,sector-specific,collaborate with technical experts on artworks,"Collaborate with engineers, mechanics, and oth..."
6607,cross-sector,design buildings,Design buildings and housing projects in coope...
7717,sector-specific,advise on architectural matters,"Provide advice on architectural design, based ..."
12256,cross-sector,manage artworks' installation in the gallery,Organise tools and equipment; ensure correct a...
11510,occupation-specific,design open spaces,Design social areas and open spaces working in...
13265,cross-sector,draw design sketches,Create rough pictures to assist in creating an...
6790,occupation-specific,teach design and applied arts principles,Instruct students in the theory and practice o...
8726,sector-specific,design original furniture,Master and develop industrial aesthetics throu...
11953,sector-specific,handle art,Work directly with objects in museums and art ...
11216,sector-specific,collaborate with designers,Communicate and collaborate with fellow design...


In [4]:
i

array([[ 4849,  6607,  7717, 12256, 11510]], dtype=int64)