# Job title prediction with embedding

In [1]:
%load_ext autoreload
%autoreload 2
import sys; sys.path.append('../')                                                                                          

In [2]:
import pandas as pd
import numpy as np
import cufflinks as cf; cf.go_offline()

In [3]:
import tensorflow as tf
tf.__version__

'2.7.0'

In [4]:
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, GlobalAveragePooling1D, Dropout, Reshape, Activation

In [5]:
pd.set_option('max_colwidth',250)

## Loading the data

In [6]:
df_pos = pd.read_csv('datasets/data_clean.csv')
df_pos.sample(2)

Unnamed: 0,ExtJobTitleText,JobTitle,Description
25278,Seasonal Warehouse Associate - Milton,Warehouse Worker,NOW OFFERING A $500 SIGN-ON BONUS!!EARN UP TO $19.50 ON NIGHT SHIFTSAttend one of our upcoming walk-in hiring events! No appointment necessary!Walk-ins welcome—or apply online and then schedule an appointment that works for you. Hiring EventDate/...
5179,Assembler - Second Shift,Assembler,"Assemblers Needed – Entry Level - Day Shift –must be able to start at 5am - needed for a long term contract opportunity with our client located in Tempe, AZ What You Will Be Doing:Tube Assembly processors perform day to day production activities ..."


In [7]:
df_pos.shape

(25405, 3)

In [8]:
# Integer-encode each distinct JobTitle; pd.factorize assigns codes in order of first appearance.
# NOTE(review): this runs before rows with a null Description are filtered out below,
# so a code may end up with no remaining rows — confirm that is acceptable.
df_pos["JobTitle_tokenized"] = pd.factorize(df_pos.JobTitle)[0]

In [24]:
# df_pos["ext_job_title_tokenized"] = pd.factorize(df_pos.ExtJobTitleText)[0]

In [12]:
df_pos.JobTitle.value_counts().to_frame(name='count')

Unnamed: 0,count
Warehouse Worker,1000
Packager,1000
Pediatric Speech Language Pathologist,958
Retail Sales Representative,810
Registered Nurse (RN),701
...,...
Maintenance Planner,10
Hospital Admissions Coordinator,10
Finance Manager,10
Saw Operator,10


### Train and test set split

In [11]:
# Keep only the postings that actually have a description text.
df_pos = df_pos.dropna(subset=['Description'])

In [18]:
text_col, target_col = 'Description', 'JobTitle'

from sklearn.preprocessing import LabelBinarizer

# One-hot encoder for the job-title labels. It is fitted on the full frame so
# the train and test label matrices share the same column order.
label_as_binary = LabelBinarizer()
label_as_binary.fit(df_pos[target_col])

# 80/20 split with a fixed seed. The explicit .copy() makes both sets
# independent frames, so adding prediction columns to test_set later does not
# raise pandas' SettingWithCopyWarning.
training_set = df_pos[[text_col, target_col]].sample(frac=0.8, random_state=41).copy()
test_set = df_pos.loc[~df_pos.index.isin(training_set.index), [text_col, target_col]].copy()

# One-hot label matrices aligned row-by-row with the two sets.
train__y_labels = label_as_binary.transform(training_set[target_col])
test__y_labels = label_as_binary.transform(test_set[target_col])

In [19]:
assert(len(training_set) + len(test_set) == len(df_pos))

What is a good size for the sequence_length? 

In [34]:
def _word_count(text):
    """Return how many pieces `text` yields when split on single spaces."""
    return len(text.split(' '))

# Distribution of description lengths, used to choose the padded sequence length.
length_quantiles = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
df_pos.Description.apply(_word_count).quantile(length_quantiles)

0.50    225.00
0.60    285.00
0.70    334.00
0.80    386.00
0.90    516.00
0.95    606.85
0.99    940.00
Name: Description, dtype: float64

What is a good size for the vocabulary? 

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a bag-of-words vocabulary on every description; min_df=5 drops terms that
# appear in fewer than 5 documents.
bow_transformer = CountVectorizer(min_df=5).fit(df_pos['Description'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

16663


In [38]:
# Take the vocabulary size from the CountVectorizer fitted above (16663 on the
# current data) rather than a copied literal that goes stale when the data changes.
vocab_size = len(bow_transformer.vocabulary_)
# 516 is the 0.90 quantile of the description word counts computed above.
sequence_length = 516

# Use the text vectorization layer to normalize, split, and map strings to integers.
# output_sequence_length pads or truncates every sample to the same length,
# because the raw descriptions differ in length.
vectorize_layer = TextVectorization(
    #standardize=lambda text: tf.strings.lower(text), # You can use your own normalization function here
    max_tokens=vocab_size,
    output_mode='int',
    name='Text_processing',
    output_sequence_length=sequence_length,
)

In [41]:
vectorize_layer.adapt(training_set[text_col])

In [44]:
sample_description = training_set[text_col].sample().iloc[0]
print(sample_description)
vectorize_layer(sample_description)

Are you a Senior Proposal Writer who enjoys the challenge of writing proposals and working on a collaborative team? Are you looking for an opportunity to work with an established company that values its employee’s enthusiasm and technical contributions? If so, we want to talk to you! Our client has an exciting remote (IN THE USA) contract to hire opportunity for a Senior Proposal Writer! The ideal candidate is seeking challenging work and


<tf.Tensor: shape=(516,), dtype=int64, numpy=
array([  13,   10,    5, 1270, 1800, 4035,   37, 2644,    4, 2531,    6,
       1643, 3457,    2,   83,   23,    5,  674,   29,   13,   10,   41,
          7,   20,   40,    3,   17,   11,   20, 1025,   63,   22,  585,
        414, 5352, 7513,    2,  572,  541,   48,  115,   14,  515,    3,
       2520,    3,   10,    9,  293,  154,   20, 1935, 1653,    8,    4,
       4343,  419,    3,  240,   40,    7,    5, 1270, 1800, 4035,    4,
        709,  765,   12,  206, 2934,   17,    2,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [45]:
# for token in vectorize_layer(sample_description).numpy()[:20]:
#     print(f"{token} ---> ",vectorize_layer.get_vocabulary()[token])

### Modeling

### Descriptions squashed into 1 average embedding vector, size 32

In [None]:
embedding_dim1=32

# Averaged-embedding classifier:
#   raw text -> token ids -> per-token embeddings -> mean over the sequence
#   -> one hidden layer -> softmax over the job titles.
# It uses vectorize_layer, vocab_size and df_pos as defined earlier in this
# notebook; the `*1`-suffixed names referenced before are not defined anywhere.
model1 = tf.keras.Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim1, name="embedding"),
    GlobalAveragePooling1D(),
#     Dropout(0.03),
    Dense(4096, activation='elu', name='hidden_layer'),
#     Dropout(0.01),
#     Dense(2048, activation='elu', name='hidden_layer2'),
#     Dropout(0.02),
#     Dense(1024, activation='relu', name='hidden_layer2'),
    # One output unit per distinct job title.
    Dense(df_pos.JobTitle.nunique(), name = 'output_layer', activation='softmax')
])

In [None]:
model1.summary()

In [None]:
tf.keras.utils.plot_model(model1, show_dtype=True, show_shapes=True, show_layer_names=True)

In [None]:
model1.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics = ['accuracy'])

#### Creating checkpoints for model weights

In [47]:
# checkpoint_path = 'Deep_models_weights'
# Save the weights after every epoch. The filename template may only reference
# keys that exist in the training logs. The models in this notebook are compiled
# with metrics=['accuracy'], so `val_loss` and `val_accuracy` are available;
# `val_precision` / `val_recall` are not logged and would make the callback fail
# when it formats the path.
cp_callback = [tf.keras.callbacks.ModelCheckpoint(
    filepath='Deep_model_weights/model.{epoch:02d}-val_loss{val_loss:.3f}-val_accuracy{val_accuracy:.3f}.tf',
    verbose=1,
    save_weights_only=True,
    save_freq='epoch')]

In [None]:
# from tensorflow import keras

# checkpoint_path = 'Deep_models_weights'
# callbacks  = [
#     keras.callbacks.ModelCheckpoint(
#         filepath=checkpoint_path, 
#         monitor='val_loss',
#         verbose=1,
#         save_best_only=True,
#         save_weights_only=True,
#         save_freq='epoch'),
#     keras.callbacks.EarlyStopping(
#         monitor='val_recall',
#         min_delta=0,
#         patience=20,
#         verbose=1)
#     ]

#### Model fit

In [None]:
%%time
# Train the averaged-embedding classifier built and compiled above (model1).
# The raw description strings are passed directly because the vectorization
# layer is part of the model.
history = model1.fit(
    training_set[text_col],
    train__y_labels,
    epochs=10,
    batch_size=1024,
    verbose=1,
    callbacks=cp_callback,
    validation_data=(test_set[text_col], test__y_labels)
)

### !!!! ADD OR DELETE - Description words concatenated

In [None]:
embedding_dim=32

# Concatenated-embedding classifier: instead of averaging, the per-token
# embeddings are flattened into one vector of embedding_dim * sequence_length
# values before the dense layers.
model2 = tf.keras.Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, name="embedding"),
#     GlobalAveragePooling1D(),
    Reshape((embedding_dim * sequence_length, ), name='concat_words'),
#     Dropout(0.1),
    Dense(4096, activation='relu', name='hidden_layer_1'),
#     Dropout(0.04),
#     Dense(2048, activation='relu', name='hidden_layer_2'),
    # Softmax is required here: the model is compiled with categorical_crossentropy
    # in its default from_logits=False mode, which expects probabilities, and the
    # maximum output is later stored as a prediction probability.
    Dense(df_pos.JobTitle.nunique(), name = 'output_layer', activation='softmax')
])

In [None]:
model2.summary()

In [None]:
tf.keras.utils.plot_model(model2, show_dtype=True, show_shapes=True, show_layer_names=True)

In [None]:
model2.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics = ['accuracy'])

In [None]:
%%time
history = model2.fit(
    training_set[text_col],
    train__y_labels,
    epochs=10,
    batch_size=1024,
    verbose=1,    
    validation_data = (test_set[text_col], test__y_labels)
)

In [None]:
# Predict with model2, the model trained in this section (the bare name `model`
# is not defined at this point on a fresh kernel). argmax returns the column
# position in the one-hot label matrix, i.e. an index into label_as_binary.classes_.
test_set['token_with_best_prediction'] = model2.predict(test_set[text_col]).argmax(axis=1)

In [None]:
# Highest output value model2 assigns to any class for each test row (model2 is
# the model trained in this section; the bare name `model` is not defined here
# on a fresh kernel).
test_set['prob_token_with_best_prediction'] = model2.predict(test_set[text_col]).max(axis=1)

In [None]:
test_set.head(20)

### !!!! ADD OR DELETE - Adding additional features (besides text)

#### Extracting Year column

In [None]:
from dateutil.parser import parse
def extract_year_from_title(title):
    """Return the year found in ``title`` as a string, or ``None``.

    The title is run through dateutil's fuzzy parser and the ``year`` of the
    resulting datetime is returned as a string. ``None`` is returned when the
    title cannot be parsed or when the parsed year is 1800 or earlier.

    NOTE(review): dateutil fills date fields that are missing from the text
    from a default date, so a title with a day or month but no year may yield
    the current year — confirm this is acceptable.
    """
    try:
        year = parse(title, fuzzy=True).year
    except (ValueError, OverflowError, TypeError):
        # ValueError (incl. dateutil's ParserError): no usable date in the text.
        # OverflowError: a number in the text is too large for a date field.
        # TypeError: the input is not a string (e.g. None or NaN).
        # Other exceptions, such as KeyboardInterrupt, are deliberately not swallowed.
        return None
    return str(int(year)) if year > 1800 else None

In [None]:
# Spot-check the year extraction on one randomly drawn title.
# NOTE(review): `wine_reviews` is not defined in any visible cell of this notebook —
# it must be loaded first, or this section removed (see the section heading).
sample_title = wine_reviews.sample().title.iloc[0]
print(f'Title is: {sample_title}. Extracted year: {extract_year_from_title(sample_title)}')

In [None]:
wine_reviews['year'] = wine_reviews.title.apply(extract_year_from_title)
wine_reviews['year'].value_counts(dropna=False).head(10)

Is the year input informative? 

In [None]:
wine_reviews.groupby('year').points.describe().query('count > 20').sort_values(by='mean',ascending=False).head()

#### Preparing the input features

In [None]:
wine_reviews = wine_reviews.reset_index() # To ensure correctness with the below join operations

In [None]:
description_tokens = vectorize_layer(wine_reviews[text_col])

In [None]:
# One column per token position, named w_1 .. w_<sequence length>.
n_token_positions = description_tokens.shape[1]
description_cols = [f'w_{position}' for position in range(1, n_token_positions + 1)]
features_df = pd.DataFrame(data=description_tokens.numpy(), columns=description_cols)

In [None]:
features_df = features_df.join(wine_reviews[['points','price','country','year','variety','province']])
features_df.head()

In [None]:
# Replace missing categorical values with the literal 'Unknown'.
# NOTE(review): `categorical_featurs` is not defined in any visible cell — presumably
# the list of categorical column names (country, year, variety, province); confirm
# and define it before this cell.
features_df[categorical_featurs] = features_df[categorical_featurs].fillna('Unknown')
# Impute missing prices with the mean price.
features_df.price = features_df.price.fillna(features_df.price.mean())

In [None]:
# Replace each categorical column with integer codes (pd.factorize numbers the
# distinct values in order of first appearance) so they can feed Embedding layers.
features_df.country = pd.factorize(features_df.country)[0]
features_df.year = pd.factorize(features_df.year)[0]
features_df.variety = pd.factorize(features_df.variety)[0]
features_df.province = pd.factorize(features_df.province)[0]
# NOTE(review): `year` is factorized a second time here. If no value was missing this
# is a no-op; if the first pass produced the missing-value code -1, this pass remaps it
# to a non-negative code. Confirm which is intended before removing the line.
features_df.year = pd.factorize(features_df.year)[0]

In [None]:
features_df.head()

In [None]:
features_df[categorical_featurs].apply(lambda x: pd.Series({'nunique': x.nunique(),
                                                            'max': x.max(),
                                                            'min': x.min()}))

In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras import layers, Model

In [None]:
# Multi-input functional model: tokenized description plus four categorical
# features (each through its own Embedding) and the numeric price, concatenated
# into one vector that feeds a small dense network with a single output unit.
# `sequence_length`, `vocab_size` and `embedding_dim` come from earlier cells.
description_input = Input(
    shape=(sequence_length,), dtype='int64', name='description'
)

year_input = Input(
    shape=(1,), name="year", dtype='int64'
)  

country_input = Input(
    shape=(1,), name="country", dtype='int64'
)  

province_input = Input(
    shape=(1,), name="province", dtype='int64'
)

variety_input = Input(
    shape=(1,), name="variety", dtype='int64'
)

price_input = Input(
    shape=(1,), name="price",
)

# Embed every token, then flatten the (sequence_length, embedding_dim) matrix into one vector.
word_features = layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length, name='word_embeddings')(description_input)
word_features = layers.Reshape((embedding_dim * sequence_length,), name='concat_words')(word_features)

# NOTE(review): the Embedding input sizes below (100, 50, 500, 1000) are hard-coded.
# Each must be larger than the biggest integer code fed to that input — TODO confirm
# against the nunique/max table computed for the categorical columns.
year_features = layers.Embedding(100, 3, name='year_embeddings')(year_input)
year_features = layers.Reshape((3,), name='concat_year')(year_features)

country_features = layers.Embedding(50, 2, name='country_embeddings')(country_input)
country_features = layers.Reshape((2,), name='concat_country')(country_features)

province_features = layers.Embedding(500, 5, name='province_embeddings')(province_input)
province_features = layers.Reshape((5,), name='concat_province')(province_features)

variety_features = layers.Embedding(1000, 4, name='variety_embeddings')(variety_input)
variety_features = layers.Reshape((4,), name='concat_variety')(variety_features)

# Merge all available features into a single large vector via concatenation
feature_vector = layers.concatenate([word_features, year_features, country_features, province_features, variety_features, price_input])
x = layers.Dropout(0.2)(feature_vector)
x = layers.Dense(256, activation='relu', name='Hidden')(x)
# Outputs: a single unit with no activation (linear).
predictions = layers.Dense(1, name="output")(x)

# Instantiate the end-to-end model: six named inputs -> one output value.
# NOTE(review): this rebinds the name `model`, which other sections of the notebook also use.
model = Model(
    inputs=[description_input, year_input, country_input, province_input, variety_input, price_input],
    outputs=predictions,
)

In [None]:
tf.keras.utils.plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True)

In [None]:
model.summary()

In [None]:
# Seeded 80% sample for training; every remaining row becomes the test set.
training_set = features_df.sample(frac=0.8, random_state=42)
test_set = features_df.drop(index=training_set.index)

In [None]:
assert(len(training_set) + len(test_set) == len(wine_reviews))

In [None]:
model.compile(
    optimizer=tf.optimizers.Adam(),
    loss='mean_absolute_error')

In [None]:
%%time
history = model.fit(
    {"description": training_set[description_cols].values, 
     "year": training_set['year'].values,
     "country": training_set['country'].values,
     "province": training_set['province'].values,
     "variety": training_set['variety'].values, 
     'price': training_set['price'].values},
    
    {"output": training_set['points'].values},
    validation_data=([test_set[description_cols].values, 
                      test_set['year'].values, 
                      test_set['country'].values, 
                      test_set['province'].values, 
                      test_set['variety'].values, 
                      test_set['price'].values],
                     test_set['points'].values),
    epochs=10,
    batch_size=512,
    verbose=1)

In [None]:
history.history

In [None]:
test_set['dnn_prediction'] = model.predict({'description': test_set[description_cols], 
                                            'year': test_set['year'], 
                                            'country': test_set['country'], 
                                            'province': test_set['province'], 
                                            'variety': test_set['variety'], 
                                            'price': test_set['price']})

In [None]:
# NOTE(review): `calc_prediction_quality` is not defined or imported in any visible cell.
# `target_col` was last set to 'JobTitle' earlier in the notebook, while the model above
# is fitted on the 'points' column — confirm the intended target before running this.
calc_prediction_quality(test_set, 'dnn_prediction', target_col)

## Using pretrained embeddings

In [13]:
%%capture
%pip install sentence-transformers

In [14]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')

In [16]:
import pickle

### description_embeddings

In [None]:
description_vectors_path = 'Transformers_pickles/descriptions_embeddings_desc.pkl'

In [None]:
%%time
# Encode every description with the sentence-transformer `model` and collect
# (row id, text, embedding) tuples.
# The row id is the df_pos index label, not the loop position: df_pos was
# filtered earlier without resetting its index, and the id is later used to
# join the embeddings back onto df_pos by index.
description_embeddings = []
for step, (row_id, description) in enumerate(df_pos[text_col].items(), start=1):
    description_embeddings.append((row_id, description, model.encode(description)))
    if step % 1000 == 0:
        print(f'Completed step {step} out of {len(df_pos)}')
        # Periodic checkpoint so an interruption does not lose the work done so far.
        # The context manager closes the file handle after each write.
        with open(description_vectors_path, 'wb') as checkpoint_file:
            pickle.dump(description_embeddings, checkpoint_file)
# Final write covering the rows after the last full block of 1000.
with open(description_vectors_path, 'wb') as checkpoint_file:
    pickle.dump(description_embeddings, checkpoint_file)

In [None]:
assert(description_embeddings[1300][1] == df_pos.Description.iloc[1300])

In [None]:
import pickle
# Reload the description embeddings from the same path they were written to
# (description_vectors_path); the bare filename used before pointed at the
# working directory instead of the Transformers_pickles folder.
# Only unpickle files produced by this notebook: pickle.load executes arbitrary
# code from the file it reads.
with open(description_vectors_path, 'rb') as embeddings_file:
    descriptions = pickle.load(embeddings_file)

In [None]:
%%time
# Flatten each (row id, text, embedding) tuple into one flat list:
# [row id, text, e_0, e_1, ...].
rows = [[entry[0], entry[1], *entry[2]] for entry in descriptions]

### ExtJobTitleText_embeddings

In [None]:
ext_job_title_vectors_path = 'Transformers_pickles/ext_job_title_embeddings_desc.pkl'

In [None]:
%%time
# Encode every external job title with the sentence-transformer `model` and
# collect (row id, title, embedding) tuples. The row id is the df_pos index
# label, so the result can be aligned with df_pos even though df_pos was
# filtered without resetting its index.
ext_job_title_embeddings = []
for step, (row_id, ext_job_title) in enumerate(df_pos['ExtJobTitleText'].items(), start=1):
    ext_job_title_embeddings.append((row_id, ext_job_title, model.encode(ext_job_title)))
    if step % 1000 == 0:
        print(f'Completed step {step} out of {len(df_pos)}')
        # Periodic checkpoint; the context manager closes the handle after each write.
        with open(ext_job_title_vectors_path, 'wb') as checkpoint_file:
            pickle.dump(ext_job_title_embeddings, checkpoint_file)
# Final write covering the rows after the last full block of 1000.
with open(ext_job_title_vectors_path, 'wb') as checkpoint_file:
    pickle.dump(ext_job_title_embeddings, checkpoint_file)

In [None]:
# Sanity check for this section: the text stored at list position 1300 must be
# the external job title at the same position in df_pos (the previous version
# re-checked the description embeddings instead).
assert(ext_job_title_embeddings[1300][1] == df_pos.ExtJobTitleText.iloc[1300])

In [None]:
import pickle
# Reload the external-job-title embeddings from the path they were written to
# (ext_job_title_vectors_path) and close the file handle afterwards.
# NOTE(review): this rebinds `descriptions`, which previously held the
# description embeddings.
# Only unpickle files produced by this notebook: pickle.load executes arbitrary
# code from the file it reads.
with open(ext_job_title_vectors_path, 'rb') as embeddings_file:
    descriptions = pickle.load(embeddings_file)

In [None]:
%%time
# Flatten each (row id, text, embedding) tuple into one flat list:
# [row id, text, e_0, e_1, ...].
rows_ext_job_title = [[entry[0], entry[1], *entry[2]] for entry in descriptions]

### Concatenating everything together

* have to concat everything

In [None]:
# One row per description: id, raw text, then the 768 embedding components.
embedding_cols = [f'embedding_{i}' for i in range(768)]
descriptions_with_sentence_embeddings_df = pd.DataFrame(
    rows,
    columns=['row_id', 'description', *embedding_cols],
)
# Persist the assembled frame so the flattening does not have to be repeated.
descriptions_with_sentence_embeddings_df.to_pickle('descriptions_with_sentence_embeddings_df.pkl')

In [None]:
descriptions_with_sentence_embeddings_df

In [None]:
# Attach the integer job-title code to every embedding row.
# The join aligns on index labels, so `row_id` must hold df_pos index labels (not
# positions) — verify, since df_pos was filtered earlier without reset_index.
# NOTE: not re-runnable as-is — after the first run `row_id` is the index, no longer
# a column, so set_index('row_id') fails on a second execution.
descriptions_with_sentence_embeddings_df = descriptions_with_sentence_embeddings_df.set_index('row_id').join(df_pos['JobTitle_tokenized'])

In [None]:
descriptions_with_sentence_embeddings_df.sample(3)

In [None]:
s = descriptions_with_sentence_embeddings_df[['description','JobTitle_tokenized']].sample()
s.values

In [None]:
descriptions_with_sentence_embeddings_df.query('description == @s.description.iloc[0]')['JobTitle_tokenized']

### Modeling with transformers sequence embeddings

In [None]:
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, GlobalAveragePooling1D, Dropout, Reshape, Activation

In [None]:
text_col, target_col = 'Desc_concatinated', 'JobTitle_tokenized'

from sklearn.preprocessing import LabelBinarizer

# One-hot encoder over the integer job-title codes.
label_as_binary = LabelBinarizer()
label_as_binary.fit(df_pos[target_col])

# Seeded 80% sample for training.
training = descriptions_with_sentence_embeddings_df.sample(frac=0.8, random_state=41)
# Hold out exactly the rows that were not drawn into `training`. The mask was
# previously built from `training_set`, a frame from an earlier experiment, so
# train and test could overlap. The .copy() gives an independent frame, so
# prediction columns can be added to it without SettingWithCopyWarning.
test = descriptions_with_sentence_embeddings_df[
    ~descriptions_with_sentence_embeddings_df.index.isin(training.index)
].copy()

# One-hot label matrices aligned row-by-row with the two sets.
train__y_labels = label_as_binary.transform(training[target_col])
test__y_labels = label_as_binary.transform(test[target_col])

In [None]:
# Feed-forward classifier on the 768-dimensional sentence embeddings:
# one ELU hidden layer, then a softmax over the job titles.
model = tf.keras.Sequential([
    Dense(4096, input_dim=768),
    Activation('elu'),
    # Dense(2048, input_dim=2048),
    # Activation('relu'),
    Dense(df_pos.JobTitle.nunique()),
    Activation('softmax'),
])
model.compile(
    loss=tf.keras.losses.categorical_crossentropy,
    optimizer=tf.optimizers.Adam(),
    metrics=['accuracy'],
)

In [None]:
model.summary()

In [None]:
# Names of the 768 embedding columns used as model input.
f_vector = [f'embedding_{i}' for i in range(768)]
history = model.fit(
    x=training[f_vector],
    y=train__y_labels,
    validation_data=(test[f_vector], test__y_labels),
    batch_size=1024,
    epochs=7,
    verbose=1,
)

### Simple NN Prediction & Evaluation

In [None]:
import matplotlib.pyplot as plt

In [None]:
pd.DataFrame(history.history).plot()

In [None]:
# argmax gives the winning column position in the one-hot label matrix. Map it
# through label_as_binary.classes_ so the stored value is an actual
# JobTitle_tokenized code, which is what the metrics below compare against.
# The two only coincide when every code 0..K-1 is present in the fitted labels.
predicted_class_idx = model.predict(test[f_vector]).argmax(axis=1)
test['token_with_best_prediction'] = label_as_binary.classes_[predicted_class_idx]

In [None]:
test['prob_token_with_best_prediction'] = model.predict(test[f_vector]).max(axis=1)

In [None]:
test.sample()

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

# True vs. predicted job-title codes on the held-out rows.
y_true = test['JobTitle_tokenized']
y_pred = test['token_with_best_prediction']

# Macro-averaged precision and recall, then plain accuracy.
print(precision_score(y_true, y_pred, average="macro"))
print(recall_score(y_true, y_pred, average="macro"))
print(accuracy_score(y_true, y_pred))

### Making a loop to see the best metrics

In [None]:
f_vector = [f'embedding_{i}' for i in range(768)]
# Number of additional single-epoch training rounds to evaluate.
n_rounds = 100

# Train one epoch at a time and record weighted precision / recall / accuracy on
# the held-out rows after each round. Results are appended as they are produced,
# so an interrupted run leaves a shorter but fully valid list rather than
# placeholder zeros.
# NOTE: `model` keeps the weights from any earlier fit call, so round i is not
# necessarily the i-th epoch of training overall.
transformer_results2 = []
for i in range(n_rounds):
    history_loop = model.fit(training[f_vector],
                             train__y_labels,
                             validation_data=(test[f_vector], test__y_labels),
                             epochs=1,
                             batch_size=1024,
                             verbose=1)

    prob = model.predict(test[f_vector])
    # Map the winning column position back to the JobTitle_tokenized code it stands for.
    test['token_with_best_prediction'] = label_as_binary.classes_[prob.argmax(axis=1)]
#     test['prob_token_with_best_prediction'] = prob.max(axis=1)

    y_true = test['JobTitle_tokenized']
    y_pred = test['token_with_best_prediction']
    transformer_results2.append([
        i,
        precision_score(y_true, y_pred, average="weighted", zero_division=0),
        recall_score(y_true, y_pred, average="weighted", zero_division=0),
        accuracy_score(y_true, y_pred),
    ])

In [None]:
transformer_results2

In [None]:
pd.DataFrame(transformer_results2, columns=['epoch','val_precision','val_re-call','val_accuracy']).set_index('epoch')[['val_precision','val_re-call']].plot()
[0.673, 0.687]