# Job title prediction with embedding

In [1]:
%load_ext autoreload
%autoreload 2
import sys; sys.path.append('../')                                                                                      

In [2]:
import pandas as pd
import numpy as np
import cufflinks as cf; cf.go_offline()

In [3]:
import tensorflow as tf
tf.__version__

'2.7.0'

In [4]:
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, GlobalAveragePooling1D, Dropout, Reshape, Activation

In [5]:
pd.set_option('max_colwidth',250)

## Loading the data

In [6]:
df_pos = pd.read_csv('datasets/data_clean.csv')
df_pos.sample(2)

Unnamed: 0,ExtJobTitleText,JobTitle,Description
25278,Seasonal Warehouse Associate - Milton,Warehouse Worker,NOW OFFERING A $500 SIGN-ON BONUS!!EARN UP TO $19.50 ON NIGHT SHIFTSAttend one of our upcoming walk-in hiring events! No appointment necessary!Walk-ins welcome—or apply online and then schedule an appointment that works for you. Hiring EventDate/...
5179,Assembler - Second Shift,Assembler,"Assemblers Needed – Entry Level - Day Shift –must be able to start at 5am - needed for a long term contract opportunity with our client located in Tempe, AZ What You Will Be Doing:Tube Assembly processors perform day to day production activities ..."


In [7]:
df_pos.shape

(25405, 3)

In [8]:
df_pos["JobTitle_tokenized"] = pd.factorize(df_pos.JobTitle)[0]

In [24]:
# df_pos["ext_job_title_tokenized"] = pd.factorize(df_pos.ExtJobTitleText)[0]

In [12]:
df_pos.JobTitle.value_counts().to_frame(name='count')

Unnamed: 0,count
Warehouse Worker,1000
Packager,1000
Pediatric Speech Language Pathologist,958
Retail Sales Representative,810
Registered Nurse (RN),701
...,...
Maintenance Planner,10
Hospital Admissions Coordinator,10
Finance Manager,10
Saw Operator,10


### Train and test set split

In [82]:
df_pos = df_pos[df_pos['Description'].notnull()]
df_pos.dropna(inplace=True)

In [18]:
text_col, target_col = 'Description', 'JobTitle'

from sklearn.preprocessing import LabelBinarizer
label_as_binary = LabelBinarizer()

label_as_binary.fit(df_pos[target_col])

training_set = df_pos[[text_col, target_col]].sample(frac=0.8, random_state=41)
test_set = df_pos[~df_pos.index.isin(training_set.index)][[text_col, target_col]]

train__y_labels = label_as_binary.transform(training_set[target_col])
test__y_labels = label_as_binary.transform(test_set[target_col])

In [19]:
assert(len(training_set) + len(test_set) == len(df_pos))

What is a good size for the sequence_length? 

In [34]:
df_pos.Description.apply(lambda x: len(x.split(' '))).quantile([0.5,0.6,0.7,0.8,0.9,0.95,0.99])

0.50    225.00
0.60    285.00
0.70    334.00
0.80    386.00
0.90    516.00
0.95    606.85
0.99    940.00
Name: Description, dtype: float64

What is a good size for the vocabulary? 

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
bow_transformer = CountVectorizer(min_df=5).fit(df_pos['Description'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

16663


In [38]:
vocab_size = 16663
sequence_length = 516

# Use the text vectorization layer to normalize, split, and map strings to integers. Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
    #standardize=lambda text: tf.strings.lower(text), # You can use your own normalization function here
    max_tokens=vocab_size,
    output_mode='int',
    name = 'Text_processing',
    output_sequence_length=sequence_length
)

In [41]:
vectorize_layer.adapt(training_set[text_col])

In [44]:
sample_description = training_set[text_col].sample().iloc[0]
print(sample_description)
vectorize_layer(sample_description)

Are you a Senior Proposal Writer who enjoys the challenge of writing proposals and working on a collaborative team? Are you looking for an opportunity to work with an established company that values its employee’s enthusiasm and technical contributions? If so, we want to talk to you! Our client has an exciting remote (IN THE USA) contract to hire opportunity for a Senior Proposal Writer! The ideal candidate is seeking challenging work and


<tf.Tensor: shape=(516,), dtype=int64, numpy=
array([  13,   10,    5, 1270, 1800, 4035,   37, 2644,    4, 2531,    6,
       1643, 3457,    2,   83,   23,    5,  674,   29,   13,   10,   41,
          7,   20,   40,    3,   17,   11,   20, 1025,   63,   22,  585,
        414, 5352, 7513,    2,  572,  541,   48,  115,   14,  515,    3,
       2520,    3,   10,    9,  293,  154,   20, 1935, 1653,    8,    4,
       4343,  419,    3,  240,   40,    7,    5, 1270, 1800, 4035,    4,
        709,  765,   12,  206, 2934,   17,    2,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [45]:
# for token in vectorize_layer(sample_description).numpy()[:20]:
#     print(f"{token} ---> ",vectorize_layer.get_vocabulary()[token])

### Modeling

### Descriptions squashed into 1 average embedding vector, size 16

In [None]:
embedding_dim1=32

model1 = tf.keras.Sequential([
    vectorize_layer1,
    Embedding(vocab_size1, embedding_dim1, name="embedding"),
    GlobalAveragePooling1D(),
#     Dropout(0.03),
    Dense(4096, activation='elu', name='hidden_layer'),
#     Dropout(0.01),
#     Dense(2048, activation='elu', name='hidden_layer2'),
#     Dropout(0.02),
#     Dense(1024, activation='relu', name='hidden_layer2'),
    Dense(df_pos1.JobTitle.nunique(), name = 'output_layer', activation='softmax')
])

In [None]:
model1.summary()

In [None]:
tf.keras.utils.plot_model(model1, show_dtype=True, show_shapes=True, show_layer_names=True)

In [None]:
model1.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics = ['accuracy'])

#### Creating checkpoints for model weights

In [47]:
# checkpoint_path = 'Deep_models_weights'
cp_callback = [tf.keras.callbacks.ModelCheckpoint(
    filepath='Deep_model_weights/model.{epoch:02d}-val_loss{val_loss:.3f}-val_precision{val_precision:.3f}-val_recall{val_recall:.3f}.tf', 
    verbose=1, 
    save_weights_only=True,
    save_freq= 'epoch')]

In [None]:
# from tensorflow import keras

# checkpoint_path = 'Deep_models_weights'
# callbacks  = [
#     keras.callbacks.ModelCheckpoint(
#         filepath=checkpoint_path, 
#         monitor='val_loss',
#         verbose=1,
#         save_best_only=True,
#         save_weights_only=True,
#         save_freq='epoch'),
#     keras.callbacks.EarlyStopping(
#         monitor='val_recall',
#         min_delta=0,
#         patience=20,
#         verbose=1)
#     ]

#### Model fit

In [None]:
%%time
history = model.fit(
    training_set[text_col],
    train__y_labels,
    epochs=10,
    batch_size=1024,
    verbose=1,
    callbacks=cp_callback,
    validation_data = (test_set[text_col], test__y_labels)
)

### !!!! ADD OR DELETE - Descriptions words concatinated

In [None]:
embedding_dim=32

model2 = tf.keras.Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, name="embedding"),
#     GlobalAveragePooling1D(),
    Reshape((embedding_dim * sequence_length, ), name='concat_words'),
#     Dropout(0.1),
    Dense(4096, activation='relu', name='hidden_layer_1'),
#     Dropout(0.04),
#     Dense(2048, activation='relu', name='hidden_layer_2'),
    Dense(df_pos.JobTitle.nunique(), name = 'output_layer')
])

In [None]:
model2.summary()

In [None]:
tf.keras.utils.plot_model(model2, show_dtype=True, show_shapes=True, show_layer_names=True)

In [None]:
model2.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics = ['accuracy'])

In [None]:
%%time
history = model2.fit(
    training_set[text_col],
    train__y_labels,
    epochs=10,
    batch_size=1024,
    verbose=1,    
    validation_data = (test_set[text_col], test__y_labels)
)

In [None]:
test_set['token_with_best_prediction'] = model.predict(test_set[text_col]).argmax(axis=1)

In [None]:
test_set['prob_token_with_best_prediction'] = model.predict(test_set[text_col]).max(axis=1)

In [None]:
test_set.head(20)

### !!!! ADD OR DELETE - Adding additional features (besides text)

#### Extracting Year column

In [None]:
from dateutil.parser import parse
def extract_year_from_title(title):
    try:
        year = parse(title, fuzzy=True).year
        return str(int(year)) if year > 1800 else None
    except:
        return None

In [None]:
sample_title = wine_reviews.sample().title.iloc[0]
print(f'Title is: {sample_title}. Extracted year: {extract_year_from_title(sample_title)}')

In [None]:
wine_reviews['year'] = wine_reviews.title.apply(extract_year_from_title)
wine_reviews['year'].value_counts(dropna=False).head(10)

Is the year input informative? 

In [None]:
wine_reviews.groupby('year').points.describe().query('count > 20').sort_values(by='mean',ascending=False).head()

#### Preparing the input features

In [None]:
wine_reviews = wine_reviews.reset_index() # To ensure correctness with the below join operations

In [None]:
description_tokens = vectorize_layer(wine_reviews[text_col])

In [None]:
description_cols = [f'w_{i}' for i in range(1, description_tokens.shape[1] + 1)]
features_df = pd.DataFrame(description_tokens.numpy(), columns=description_cols)

In [None]:
features_df = features_df.join(wine_reviews[['points','price','country','year','variety','province']])
features_df.head()

In [None]:
features_df[categorical_featurs] = features_df[categorical_featurs].fillna('Unknown')
features_df.price = features_df.price.fillna(features_df.price.mean())

In [None]:
features_df.country = pd.factorize(features_df.country)[0]
features_df.year = pd.factorize(features_df.year)[0]
features_df.variety = pd.factorize(features_df.variety)[0]
features_df.province = pd.factorize(features_df.province)[0]
features_df.year = pd.factorize(features_df.year)[0]

In [None]:
features_df.head()

In [None]:
features_df[categorical_featurs].apply(lambda x: pd.Series({'nunique': x.nunique(),
                                                            'max': x.max(),
                                                            'min': x.min()}))

In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras import layers, Model

In [None]:
description_input = Input(
    shape=(sequence_length,), dtype='int64', name='description'
)

year_input = Input(
    shape=(1,), name="year", dtype='int64'
)  

country_input = Input(
    shape=(1,), name="country", dtype='int64'
)  

province_input = Input(
    shape=(1,), name="province", dtype='int64'
)

variety_input = Input(
    shape=(1,), name="variety", dtype='int64'
)

price_input = Input(
    shape=(1,), name="price",
)

word_features = layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length, name='word_embeddings')(description_input)
word_features = layers.Reshape((embedding_dim * sequence_length,), name='concat_words')(word_features)

year_features = layers.Embedding(100, 3, name='year_embeddings')(year_input)
year_features = layers.Reshape((3,), name='concat_year')(year_features)

country_features = layers.Embedding(50, 2, name='country_embeddings')(country_input)
country_features = layers.Reshape((2,), name='concat_country')(country_features)

province_features = layers.Embedding(500, 5, name='province_embeddings')(province_input)
province_features = layers.Reshape((5,), name='concat_province')(province_features)

variety_features = layers.Embedding(1000, 4, name='variety_embeddings')(variety_input)
variety_features = layers.Reshape((4,), name='concat_variety')(variety_features)

# Merge all available features into a single large vector via concatenation
feature_vector = layers.concatenate([word_features, year_features, country_features, province_features, variety_features, price_input])
x = layers.Dropout(0.2)(feature_vector)
x = layers.Dense(256, activation='relu', name='Hidden')(x)
# Outputs:
predictions = layers.Dense(1, name="output")(x)

# Instantiate an end-to-end model predicting E,I,O:
model = Model(
    inputs=[description_input, year_input, country_input, province_input, variety_input, price_input],
    outputs=predictions,
)

In [None]:
tf.keras.utils.plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True)

In [None]:
model.summary()

In [None]:
training_set = features_df.sample(frac=0.8, random_state=42)
test_set = features_df[~features_df.index.isin(training_set.index)]

In [None]:
assert(len(training_set) + len(test_set) == len(wine_reviews))

In [None]:
model.compile(
    optimizer=tf.optimizers.Adam(),
    loss='mean_absolute_error')

In [None]:
%%time
history = model.fit(
    {"description": training_set[description_cols].values, 
     "year": training_set['year'].values,
     "country": training_set['country'].values,
     "province": training_set['province'].values,
     "variety": training_set['variety'].values, 
     'price': training_set['price'].values},
    
    {"output": training_set['points'].values},
    validation_data=([test_set[description_cols].values, 
                      test_set['year'].values, 
                      test_set['country'].values, 
                      test_set['province'].values, 
                      test_set['variety'].values, 
                      test_set['price'].values],
                     test_set['points'].values),
    epochs=10,
    batch_size=512,
    verbose=1)

In [None]:
history.history

In [None]:
test_set['dnn_prediction'] = model.predict({'description': test_set[description_cols], 
                                            'year': test_set['year'], 
                                            'country': test_set['country'], 
                                            'province': test_set['province'], 
                                            'variety': test_set['variety'], 
                                            'price': test_set['price']})

In [None]:
calc_prediction_quality(test_set, 'dnn_prediction', target_col)

## Using pretrained embeddings

In [13]:
%%capture
%pip install sentence-transformers

In [14]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')

In [16]:
import pickle

### description_embeddings

In [39]:
description_vectors_path = 'Transformers_pickles/descriptions_embeddings_desc.pkl'

In [41]:
%%time
description_embeddings = []
for i,description in enumerate(df_pos[text_col].values):
    description_embeddings.append((i,description, model.encode(description)))
    if (i+1) % 1000 == 0:
        print(f'Completed step {i+1} out of {len(df_pos)}')
        pickle.dump(description_embeddings, open(description_vectors_path, 'wb'))
pickle.dump(description_embeddings, open(description_vectors_path, 'wb'))

Completed step 1000 out of 25404
Completed step 2000 out of 25404
Completed step 3000 out of 25404
Completed step 4000 out of 25404
Completed step 5000 out of 25404
Completed step 6000 out of 25404
Completed step 7000 out of 25404
Completed step 8000 out of 25404
Completed step 9000 out of 25404
Completed step 10000 out of 25404
Completed step 11000 out of 25404
Completed step 12000 out of 25404
Completed step 13000 out of 25404
Completed step 14000 out of 25404
Completed step 15000 out of 25404
Completed step 16000 out of 25404
Completed step 17000 out of 25404
Completed step 18000 out of 25404
Completed step 19000 out of 25404
Completed step 20000 out of 25404
Completed step 21000 out of 25404
Completed step 22000 out of 25404
Completed step 23000 out of 25404
Completed step 24000 out of 25404
Completed step 25000 out of 25404
Wall time: 7h 30min 44s


In [43]:
assert(description_embeddings[1300][1] == df_pos.Description.iloc[1300])

In [None]:
import pickle
description_embeddings_gibui = pickle.load(open('Transformers_pickles/descriptions_embeddings_desc.pkl', 'rb'))

In [47]:
%%time
rows = []
for d in description_embeddings:
    vector = []
    vector.append(d[0])
    vector.append(d[1])
    for item in d[2]:
        vector.append(item)
    rows.append(vector)   

Wall time: 4.78 s


In [61]:
descriptions_with_sentence_embeddings_df = pd.DataFrame(rows, columns = ['row_id','description'] + [f'embedding_{i}' for i in range(768)])
descriptions_with_sentence_embeddings_df.to_pickle('descriptions_with_sentence_embeddings_df.pkl')

In [None]:
descriptions_with_sentence_embeddings_df

In [62]:
descriptions_with_sentence_embeddings_df = descriptions_with_sentence_embeddings_df.set_index('row_id').join(df_pos['JobTitle_tokenized'])

In [64]:
descriptions_with_sentence_embeddings_df.sample(3)

Unnamed: 0_level_0,description,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767,JobTitle_tokenized
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5731,"Registered Dental Hygienists: If you are passionate about the dental industry and believe in customer care, then apply for these opportunities. We are here for you if you are looking for supplemental work Monday-Friday, 1-5 days available and the...",-0.007627,0.019185,-0.008609,-0.044907,-0.02781,0.034229,-0.007145,0.038109,-0.020477,...,0.009165,0.018022,0.016249,-0.006722,0.009799,0.005243,0.039181,-0.019987,-0.028755,209.0
6787,The Staffing Specialist is responsible for providing support to the Branch Manager by assisting with daily operations for a location. This position advocates for the customer and associate experience. It also works on the process of generating sa...,0.053167,-0.076452,-0.015497,-0.022781,0.012712,0.069818,-0.01401,-0.017102,-0.012395,...,0.005767,0.010127,0.048258,-0.063438,-0.016074,-0.00794,0.015705,0.036672,-0.040522,193.0
18536,Are you committed to holding cleanliness next to godliness? Do you have a rare penchant for getting everything just right and striving for your work to be perfect? PeopleReady is now hiring meticulous Housekeepers. Housekeepers perform light clea...,0.017356,0.062786,-0.004281,-0.022958,0.007093,-0.003858,-0.042354,0.026579,-0.019894,...,-0.028089,0.024304,0.05521,-0.038802,-0.029499,0.019202,-0.001687,0.042558,0.006724,99.0


In [65]:
s = descriptions_with_sentence_embeddings_df[['description','JobTitle_tokenized']].sample()
s.values

array([['We’re looking for an exceptional School Speech Language Pathologist for a full-time position in Eudora, KS for the 2021-2022 School year, from August 11, 2021 - May 2022. This therapist will work 40 hours per week on-site with childhood and elementary students. The caseload is 35-40.Job Requirements for School Speech Language Pathologist:Desire to bring life-giving excellence to school-aged children 1 year of verifiable, supervised professional experience as a',
        321.0]], dtype=object)

In [66]:
descriptions_with_sentence_embeddings_df.query('description == @s.description.iloc[0]')['JobTitle_tokenized']

row_id
21938    321.0
Name: JobTitle_tokenized, dtype: float64

### Modedling with transformers sequence embeddings

In [67]:
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, GlobalAveragePooling1D, Dropout, Reshape, Activation

In [77]:
np.any(np.isnan(training[target_col]))

True

In [83]:
descriptions_with_sentence_embeddings_df.shape

(25404, 770)

In [84]:
descriptions_with_sentence_embeddings_df = descriptions_with_sentence_embeddings_df[descriptions_with_sentence_embeddings_df['description'].notnull()]
descriptions_with_sentence_embeddings_df.dropna(inplace=True)

In [None]:
descriptions_with_sentence_embeddings_df.shape

(25403, 770)

how did we got one none at that point is a mystery

In [86]:
text_col, target_col = 'description', 'JobTitle_tokenized'

from sklearn.preprocessing import LabelBinarizer
label_as_binary = LabelBinarizer()

label_as_binary.fit(df_pos[target_col])

training = descriptions_with_sentence_embeddings_df.sample(frac=0.8, random_state=41)
test = descriptions_with_sentence_embeddings_df[~descriptions_with_sentence_embeddings_df.index.isin(training_set.index)]

train__y_labels = label_as_binary.transform(training[target_col])
test__y_labels = label_as_binary.transform(test[target_col])

In [89]:
model = tf.keras.Sequential()
model.add(Dense(4096, input_dim=768))
model.add(Activation('elu'))
# model.add(Dense(2048, input_dim=2048))
# model.add(Activation('relu'))
model.add(Dense(df_pos.JobTitle.nunique()))
model.add(Activation('softmax'))
model.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),'accuracy']
    )

In [90]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 4096)              3149824   
                                                                 
 activation_2 (Activation)   (None, 4096)              0         
                                                                 
 dense_3 (Dense)             (None, 352)               1442144   
                                                                 
 activation_3 (Activation)   (None, 352)               0         
                                                                 
Total params: 4,591,968
Trainable params: 4,591,968
Non-trainable params: 0
_________________________________________________________________


In [91]:
f_vector = [f'embedding_{i}' for i in range(768)]
history = model.fit(training[f_vector], 
          train__y_labels, 
          validation_data=(test[f_vector], test__y_labels), 
          epochs=30,
          batch_size=1024,
          verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


* Further training

In [92]:
f_vector = [f'embedding_{i}' for i in range(768)]
history = model.fit(training[f_vector], 
          train__y_labels, 
          validation_data=(test[f_vector], test__y_labels), 
          epochs=20,
          batch_size=1024,
          verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Adding an additional column with information

### ExtJobTitleText_embeddings

In [48]:
ext_job_title_vectors_path = 'Transformers_pickles/ext_job_title_embeddings_desc.pkl'

In [49]:
%%time
ext_job_title_embeddings = []
for i,description in enumerate(df_pos['ExtJobTitleText'].values):
    ext_job_title_embeddings.append((i,description, model.encode(description)))
    if (i+1) % 1000 == 0:
        print(f'Completed step {i+1} out of {len(df_pos)}')
        pickle.dump(ext_job_title_embeddings, open(ext_job_title_vectors_path, 'wb'))
pickle.dump(ext_job_title_embeddings, open(ext_job_title_vectors_path, 'wb'))

Completed step 1000 out of 25404
Completed step 2000 out of 25404
Completed step 3000 out of 25404
Completed step 4000 out of 25404
Completed step 5000 out of 25404
Completed step 6000 out of 25404
Completed step 7000 out of 25404
Completed step 8000 out of 25404
Completed step 9000 out of 25404
Completed step 10000 out of 25404
Completed step 11000 out of 25404
Completed step 12000 out of 25404
Completed step 13000 out of 25404
Completed step 14000 out of 25404
Completed step 15000 out of 25404
Completed step 16000 out of 25404
Completed step 17000 out of 25404
Completed step 18000 out of 25404
Completed step 19000 out of 25404
Completed step 20000 out of 25404
Completed step 21000 out of 25404
Completed step 22000 out of 25404
Completed step 23000 out of 25404
Completed step 24000 out of 25404
Completed step 25000 out of 25404
Wall time: 37min 56s


In [100]:
ext_job_title_embeddings[:2]

[(0,
  'Technician I',
  array([ 7.95781706e-03, -3.90867442e-02, -2.52043866e-02,  5.29772323e-03,
         -2.36905944e-02,  1.17308926e-02,  2.35404875e-02,  1.38915060e-02,
         -3.20590809e-02,  8.28195829e-04,  4.41534072e-02, -2.09397525e-02,
          2.34017405e-03,  7.22711906e-02,  2.48976238e-02,  2.49476405e-03,
         -3.54840327e-03,  1.29383495e-02, -1.91736594e-02, -5.85854426e-03,
          1.52683733e-02,  5.33449650e-02, -2.88587660e-02,  3.03384718e-02,
         -9.11546573e-02,  1.73421316e-02,  4.67448272e-02,  1.06221123e-03,
         -1.42873181e-02,  4.36828770e-02,  5.53772822e-02,  8.66331626e-03,
          1.84997935e-02,  5.71507625e-02,  1.68100814e-06,  3.41190100e-02,
          9.34779830e-03, -2.95979856e-03, -6.72660545e-02, -1.65763944e-02,
         -6.27182201e-02,  1.34818126e-02,  2.08702423e-02,  4.10781391e-02,
          4.53181472e-03,  2.97433231e-03,  3.35708112e-02,  3.84782674e-03,
          5.38140628e-03, -3.12295929e-02, -2.1775580

In [52]:
assert(ext_job_title_embeddings[1300][1] == df_pos.ExtJobTitleText.iloc[1300])

In [None]:
ext_job_title_embeddings_gibui = pickle.load(open('Transformers_pickles/ext_job_title_embeddings_desc.pkl', 'rb'))

In [53]:
%%time
rows_ext_job_title = []
for d in ext_job_title_embeddings:
    vector = []
    vector.append(d[0])
    vector.append(d[1])
    for item in d[2]:
        vector.append(item)
    rows_ext_job_title.append(vector)   

Wall time: 6.64 s


### Concatinating everything together

In [60]:
len(rows_ext_job_title[0])

770

In [94]:
len(rows[0])

770

In [101]:
descriptions_with_ExtJobTitleText_df = pd.DataFrame(rows_ext_job_title, columns = ['rows_ext_job_title','ExtJobTitleText'] + [f'embedding2_{i}' for i in range(768)])
descriptions_with_ExtJobTitleText_df.to_pickle('descriptions_with_ExtJobTitleText_df.pkl')

In [103]:
descriptions_with_ExtJobTitleText_df.head(2)

Unnamed: 0,rows_ext_job_title,ExtJobTitleText,embedding2_0,embedding2_1,embedding2_2,embedding2_3,embedding2_4,embedding2_5,embedding2_6,embedding2_7,...,embedding2_758,embedding2_759,embedding2_760,embedding2_761,embedding2_762,embedding2_763,embedding2_764,embedding2_765,embedding2_766,embedding2_767
0,0,Technician I,0.007958,-0.039087,-0.025204,0.005298,-0.023691,0.011731,0.02354,0.013892,...,-0.01884,0.00326,0.063682,0.062868,-0.00471,0.059577,0.003892,0.012452,-0.007044,-0.016285
1,1,"RN or LPN Clinic Nurse, Urology",0.014309,0.058382,-0.029101,-0.064406,0.015139,0.050526,0.0391,0.028852,...,-0.017312,0.027753,-0.000821,-0.015602,-0.006399,-0.018835,0.003042,0.012424,0.055522,-0.013705


In [107]:
descriptions_with_ExtJobTitleText_df = descriptions_with_ExtJobTitleText_df.set_index('rows_ext_job_title')

In [115]:
descriptions_with_ExtJobTitleText_df.head(2)

Unnamed: 0_level_0,ExtJobTitleText,embedding2_0,embedding2_1,embedding2_2,embedding2_3,embedding2_4,embedding2_5,embedding2_6,embedding2_7,embedding2_8,...,embedding2_758,embedding2_759,embedding2_760,embedding2_761,embedding2_762,embedding2_763,embedding2_764,embedding2_765,embedding2_766,embedding2_767
rows_ext_job_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Technician I,0.007958,-0.039087,-0.025204,0.005298,-0.023691,0.011731,0.02354,0.013892,-0.032059,...,-0.01884,0.00326,0.063682,0.062868,-0.00471,0.059577,0.003892,0.012452,-0.007044,-0.016285
1,"RN or LPN Clinic Nurse, Urology",0.014309,0.058382,-0.029101,-0.064406,0.015139,0.050526,0.0391,0.028852,0.018918,...,-0.017312,0.027753,-0.000821,-0.015602,-0.006399,-0.018835,0.003042,0.012424,0.055522,-0.013705


In [118]:
columns1 = [f'embedding2_{i}' for i in range(768)]

In [119]:
descriptions_with_ext = descriptions_with_sentence_embeddings_df.join(descriptions_with_ExtJobTitleText_df[columns1])

In [128]:
descriptions_with_ext = descriptions_with_ext.set_index('row_id').join(df_pos['JobTitle_tokenized'])

In [129]:
descriptions_with_ext.sample(3)

Unnamed: 0_level_0,description,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding2_759,embedding2_760,embedding2_761,embedding2_762,embedding2_763,embedding2_764,embedding2_765,embedding2_766,embedding2_767,JobTitle_tokenized
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1178,"At Corizon Health, we share a common bond to deliver extraordinary care and client service. We take pride in achieving excellence and honoring our responsibility to deliver safe, effective and efficient healthcare services that better our communi...",0.042775,0.03365,0.005054,-0.047619,-0.038533,0.087811,-0.032835,-0.007134,-0.003032,...,0.009367,-0.018443,0.015939,-0.00993,0.033735,0.030152,-0.003777,0.011917,-0.002766,37.0
6169,"Entry Level Accountant needed for a contract opportunity with Yoh’s client located in Davidson, NC.The Big Picture – Top Skills Should You Possess:Responsible for analyzing financial information and prepare financial reports to determine or maint...",-0.008369,0.052168,0.010882,-0.059119,0.019008,0.036149,0.001782,0.00033,-0.016669,...,-0.014962,-0.042643,-0.002026,0.001162,0.076365,0.003684,0.019822,0.039129,0.006079,153.0
333,"Envision Physician Services and Lawnwood Regional Medical Center are seeking hospitalists to become a part of our well established program in Fort Pierce, Florida. All candidates must reside in St. Lucie County.­ 331 Bed Hospital CLOSED ICU No pr...",-0.001417,-0.016857,0.008566,0.006584,0.003686,0.006942,-0.001357,-0.001182,-0.069176,...,0.011785,0.026147,0.015226,-0.027453,0.026922,0.020609,0.003587,0.006931,-0.006606,9.0


In [130]:
df_pos.iloc[6169]

ExtJobTitleText                                                                                                                                                                                                                                               Entry Level Accountant
JobTitle                                                                                                                                                                                                                                                                  Accountant
Description                Entry Level Accountant needed for a contract opportunity with Yoh’s client located in Davidson, NC.The Big Picture – Top Skills Should You Possess:Responsible for analyzing financial information and prepare financial reports to determine or maint...
JobTitle_tokenized                                                                                                                                                       

### Modedling with transformers sequence embeddings

In [67]:
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, GlobalAveragePooling1D, Dropout, Reshape, Activation

In [131]:
descriptions_with_ext.shape

(25404, 1538)

In [132]:
descriptions_with_ext = descriptions_with_ext[descriptions_with_ext['description'].notnull()]
descriptions_with_ext.dropna(inplace=True)

In [134]:
descriptions_with_ext.shape

(25403, 1538)

still there was one none

In [135]:
text_col, target_col = 'description', 'JobTitle_tokenized'

from sklearn.preprocessing import LabelBinarizer
label_as_binary = LabelBinarizer()

label_as_binary.fit(df_pos[target_col])

training1 = descriptions_with_ext.sample(frac=0.8, random_state=41)
test1 = descriptions_with_ext[~descriptions_with_ext.index.isin(training1.index)]

train__y_labels1 = label_as_binary.transform(training1[target_col])
test__y_labels1 = label_as_binary.transform(test1[target_col])

In [144]:
model1 = tf.keras.Sequential()
model1.add(Dense(4096, input_dim=768*2))
model1.add(Activation('elu'))
# model.add(Dense(2048, input_dim=2048))
# model.add(Activation('relu'))
model1.add(Dense(df_pos.JobTitle.nunique()))
model1.add(Activation('softmax'))
model1.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),'accuracy']
    )

In [145]:
model1.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 4096)              6295552   
                                                                 
 activation_12 (Activation)  (None, 4096)              0         
                                                                 
 dense_13 (Dense)            (None, 352)               1442144   
                                                                 
 activation_13 (Activation)  (None, 352)               0         
                                                                 
Total params: 7,737,696
Trainable params: 7,737,696
Non-trainable params: 0
_________________________________________________________________


In [149]:
f_vector3 = [f'embedding_{i}' for i in range(768)]
f_vector2 = [f'embedding2_{i}' for i in range(768)]
f_vector1 = f_vector3+f_vector2

In [151]:
history1 = model1.fit(training1[f_vector1], 
          train__y_labels1, 
          validation_data=(test1[f_vector1], test__y_labels1), 
          epochs=50,
          batch_size=1024,
          verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# Results

In [153]:
pd.read_csv('Results.csv', index_col='Unnamed: 0')

Unnamed: 0,precision,recall
SVM,0.78,0.8
Random Forest,0.58,0.52
SGD,0.76,0.75
Embedding_with_pooling,0.82,0.66
Embedding_concatinated,0.83,0.64
Transformer,0.84,0.72
Transformer with ext_job_title,0.85,0.78
