# Job title prediction with embedding

In [1]:
%load_ext autoreload
%autoreload 2
import sys; sys.path.append('../')                                                                                          

In [2]:
import pandas as pd
import numpy as np
import cufflinks as cf; cf.go_offline()

In [3]:
import tensorflow as tf
tf.__version__

'2.7.0'

In [4]:
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, GlobalAveragePooling1D, Dropout, Reshape, Activation

In [5]:
# Show up to 250 characters per cell so the long job descriptions stay readable.
# The fully qualified option key is used: abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 250)

## Loading the data

In [6]:
# Read the cleaned job-postings table, then preview two randomly picked rows.
df_pos = pd.read_csv('datasets/data_clean.csv')
df_pos.sample(n=2)

Unnamed: 0,ExtJobTitleText,JobTitle,Description
12527,Retail Associate,Store Worker,Are you an upbeat and positive individual looking to find work in a fun and dynamic environment? Do you enjoy talking to customers and helping to meet business goals? PeopleReady is looking for committed Retail Associates to start immediately for...
1364,Physician-Staff,Nurse Practitioner (NP),"At Corizon Health, we share a common bond to deliver extraordinary care and client service. We take pride in achieving excellence and honoring our responsibility to deliver safe, effective and efficient healthcare services that better our communi..."


In [7]:
# (rows, columns) of the loaded table.
df_pos.shape

(25405, 3)

In [8]:
# Store an integer code for every job title in a new column
# (element 0 of the factorize result is the array of codes).
df_pos["JobTitle_tokenized"] = df_pos["JobTitle"].factorize()[0]

In [9]:
# df_pos["ext_job_title_tokenized"] = pd.factorize(df_pos.ExtJobTitleText)[0]

In [10]:
# Number of postings per job title, shown as a one-column table named 'count'.
df_pos['JobTitle'].value_counts().to_frame(name='count')

Unnamed: 0,count
Warehouse Worker,1000
Packager,1000
Pediatric Speech Language Pathologist,958
Retail Sales Representative,810
Registered Nurse (RN),701
...,...
Maintenance Planner,10
Hospital Admissions Coordinator,10
Finance Manager,10
Saw Operator,10


### Train and test set split

In [11]:
# Keep only the rows that actually have a description to learn from.
df_pos = df_pos.dropna(subset=['Description'])

In [12]:
from sklearn.preprocessing import LabelBinarizer

# Column with the model input text and column with the class label.
text_col = 'Description'
target_col = 'JobTitle'

# Fit the label encoder on all titles so both splits share the same encoding.
label_as_binary = LabelBinarizer().fit(df_pos[target_col])

# Random 80% of the rows (fixed seed) become the training set;
# every row whose index was not sampled goes to the test set.
training_set = df_pos[[text_col, target_col]].sample(frac=0.8, random_state=41)
test_set = df_pos.loc[~df_pos.index.isin(training_set.index), [text_col, target_col]]

# Encode the titles of each split with the shared encoder.
train__y_labels, test__y_labels = (
    label_as_binary.transform(split[target_col])
    for split in (training_set, test_set)
)

In [13]:
# Sanity check: the two splits together account for every row of df_pos.
assert len(training_set) + len(test_set) == len(df_pos)

## Embedding with pooling and all words

What is a good size for the sequence_length? 

In [14]:
# Distribution of description lengths, measured in space-separated pieces,
# to help choose the sequence length used for vectorization.
df_pos['Description'].str.split(' ').str.len().quantile([0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99])

0.50    225.00
0.60    285.00
0.70    334.00
0.80    386.00
0.90    516.00
0.95    607.00
0.99    942.94
Name: Description, dtype: float64

What is a good size for the vocabulary? 

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a bag-of-words vectorizer on all descriptions purely to measure
# how many distinct tokens the corpus contains.
bow_transformer = CountVectorizer().fit(df_pos['Description'])

# Print the total number of distinct vocabulary entries found
print(len(bow_transformer.vocabulary_))

57131


In [16]:
# Upper bound on the vocabulary size and the fixed number of token ids
# produced for each description.
vocab_size = 50000
sequence_length = 600

# Text vectorization layer: normalizes, splits and maps strings to integers.
# A fixed output length is set because the samples do not all have the same length.
vectorize_layer = TextVectorization(
    name='Text_processing',
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
    # standardize=lambda text: tf.strings.lower(text),  # a custom normalization function can be plugged in here
)

In [17]:
# Fit the vectorization layer on the training descriptions only;
# the test split is not passed in here.
vectorize_layer.adapt(training_set[text_col])

In [18]:
# Pick one random training description, show the raw text,
# then display the integer sequence the vectorization layer produces for it.
sample_description = training_set[text_col].sample(n=1).iloc[0]
print(sample_description)
vectorize_layer(sample_description)

Auto req ID: 248868BRJob SummaryFedEx Ground is an essential business that needs people to help us support the economy, handling life-saving medications and other items that keep our communities as prepared as possible during these uncertain times.FedEx Ground will continue to hire for essential positions like this one.FedEx Ground is hiring part-time and full-time individuals to load and unload packages in our fast-paced warehouse environment. Part-time employees typically work a 2-4-hour shift per day. Full-time employees work approximately two shifts per day of varying lengths. Package Handlers are responsible for warehouse duties including: the physical loading, unloading and/or sorting of packages of varying sizes and weights by hand, including lifting, pushing, pulling, carrying, scanning, placing packages, as well as physical bending, twisting, kneeling and etc. in a safe and efficient manner. Shifts may vary depending on warehouse package volume and business needs.Package Handl

<tf.Tensor: shape=(600,), dtype=int64, numpy=
array([  469,   713,   106,     1,   777,   134,    12,    20,   194,
          74,    22,   136,    72,     3,    59,    61,    85,     4,
         782,   343,   781,   591,     2,    45,   346,    22,   327,
           9,   392,    16,   620,    16,   562,   127,   330,   742,
         786,   134,    19,   416,     3,   237,     7,   194,   316,
         131,    27,   787,   134,    12,    67,   124,     2,    84,
         312,     3,   577,     2,   606,   200,     8,     9,   344,
         151,    60,   124,   110,   604,    17,     5,  1157,    89,
          75,    36,    84,   110,    17,   747,   541,   123,    75,
          36,     6,   571,  1186,   238,   548,    13,   211,     7,
         151,   140,    39,     4,   174,   752,   636,   149,   947,
           6,   200,     6,   571,  1040,     2,  1154,    25,   607,
          39,   867,  1145,   746,  1028,   784,  1067,   200,    16,
         126,    16,   174,  1010,  1169,  1

In [19]:
# for token in vectorize_layer(sample_description).numpy()[:20]:
#     print(f"{token} ---> ",vectorize_layer.get_vocabulary()[token])

### Modeling

### Descriptions squashed into 1 average embedding vector, size 32

In [20]:
# Size of the vector learned for every vocabulary entry.
embedding_dim = 32

# Pipeline: raw text -> token ids -> per-token embeddings -> mean over the
# sequence (one vector per description) -> dense hidden layer -> softmax.
# The output layer width is read from the encoded label matrix rather than
# from a hard-coded column, so the model stays aligned with `target_col`
# and with the labels it is trained against.
model = tf.keras.Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, name="embedding"),
    GlobalAveragePooling1D(),
#     Dropout(0.03),
    Dense(1024, activation='elu', name='hidden_layer'),
#     Dropout(0.01),
#     Dense(2048, activation='elu', name='hidden_layer2'),
#     Dropout(0.02),
#     Dense(1024, activation='relu', name='hidden_layer2'),
    Dense(train__y_labels.shape[1], name='output_layer', activation='softmax')
])

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Text_processing (TextVector  (None, 600)              0         
 ization)                                                        
                                                                 
 embedding (Embedding)       (None, 600, 32)           1600000   
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 hidden_layer (Dense)        (None, 1024)              33792     
                                                                 
 output_layer (Dense)        (None, 352)               360800    
                                                                 
Total params: 1,994,592
Trainable params: 1,994,592
Non-

In [22]:
# Draw the model graph with layer names, shapes and dtypes.
# This needs the optional pydot package and Graphviz; without them Keras only
# reports a message asking to install them (as in the saved output of this cell).
tf.keras.utils.plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [23]:
# The metrics are named explicitly. Unnamed metric instances get an
# auto-incremented suffix ("precision_1", "recall_1", ...) when this cell is
# re-run in the same kernel, and the checkpoint file name template, which
# formats `val_precision` / `val_recall`, would then fail with a KeyError.
model.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=[
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        'accuracy',
    ],
)

#### Creating checkpoints for model weights

In [24]:
# checkpoint_path = 'Deep_models_weights'
# Callback list handed to model.fit: writes the model weights after every epoch.
# The file name embeds the epoch number and the validation loss, precision and recall.
cp_callback = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath='Deep_model_weights/model.{epoch:02d}-val_loss{val_loss:.3f}-val_precision{val_precision:.3f}-val_recall{val_recall:.3f}.tf',
        save_weights_only=True,
        save_freq='epoch',
        verbose=1,
    )
]

In [25]:
# from tensorflow import keras

# checkpoint_path = 'Deep_models_weights'
# callbacks  = [
#     keras.callbacks.ModelCheckpoint(
#         filepath=checkpoint_path, 
#         monitor='val_loss',
#         verbose=1,
#         save_best_only=True,
#         save_weights_only=True,
#         save_freq='epoch'),
#     keras.callbacks.EarlyStopping(
#         monitor='val_recall',
#         min_delta=0,
#         patience=20,
#         verbose=1)
#     ]

#### Model fit

In [26]:
%%time
history = model.fit(
    training_set[text_col],
    train__y_labels,
    epochs=200,
    batch_size=1024,
    verbose=1,
    callbacks=cp_callback,
    validation_data = (test_set[text_col], test__y_labels)
)

Epoch 1/200
Epoch 00001: saving model to Deep_model_weights\model.01-val_loss5.219-val_precision0.000-val_recall0.000.tf
Epoch 2/200
Epoch 00002: saving model to Deep_model_weights\model.02-val_loss4.949-val_precision0.000-val_recall0.000.tf
Epoch 3/200
Epoch 00003: saving model to Deep_model_weights\model.03-val_loss4.895-val_precision0.000-val_recall0.000.tf
Epoch 4/200
Epoch 00004: saving model to Deep_model_weights\model.04-val_loss4.841-val_precision0.000-val_recall0.000.tf
Epoch 5/200
Epoch 00005: saving model to Deep_model_weights\model.05-val_loss4.750-val_precision0.000-val_recall0.000.tf
Epoch 6/200
Epoch 00006: saving model to Deep_model_weights\model.06-val_loss4.621-val_precision1.000-val_recall0.019.tf
Epoch 7/200
Epoch 00007: saving model to Deep_model_weights\model.07-val_loss4.516-val_precision1.000-val_recall0.035.tf
Epoch 8/200
Epoch 00008: saving model to Deep_model_weights\model.08-val_loss4.431-val_precision1.000-val_recall0.035.tf
Epoch 9/200
Epoch 00009: saving 