# Job title prediction with embedding

In [47]:
%load_ext autoreload
%autoreload 2
import sys; sys.path.append('../')                                                                                          

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
import pandas as pd
import numpy as np
import cufflinks as cf; cf.go_offline()

In [49]:
import tensorflow as tf
tf.__version__

'2.7.0'

In [50]:
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, GlobalAveragePooling1D, Dropout, Reshape, Activation

In [51]:
# Widen text columns in DataFrame displays so long job descriptions stay readable.
pd.set_option('display.max_colwidth', 250)

## Loading the data

In [52]:
# Read the cleaned job-postings table and display two randomly chosen rows as a sanity check.
data_path = 'datasets/data_clean.csv'
df_pos = pd.read_csv(data_path)
df_pos.sample(n=2)

Unnamed: 0,ExtJobTitleText,JobTitle,Description
15045,Waste Collection / Garbage Truck Helper,Sanitation Worker,Waste Collection / Garbage Truck Helper gather garbage and other discarded materials set out by customers along designated routes in urban and rural communities and transport the materials to sanitary landfills or incinerator plants for disposal....
1698,Nurse Practitioner - Hourly,Nurse Practitioner (NP),"Corizon Health is the pioneer provider of correctional healthcare in the United States. We are a company built on more than 40-years of innovation and experience in the industry. Our people, our practices and our commitment to success are the tr..."


In [53]:
df_pos.shape

(25405, 3)

In [54]:
# Integer-encode the job titles; pd.factorize numbers distinct values in order of first appearance.
job_title_codes, _ = pd.factorize(df_pos["JobTitle"])
df_pos["JobTitle_tokenized"] = job_title_codes

In [55]:
# df_pos["ext_job_title_tokenized"] = pd.factorize(df_pos.ExtJobTitleText)[0]

In [56]:
df_pos.JobTitle.value_counts().to_frame(name='count')

Unnamed: 0,count
Warehouse Worker,1000
Packager,1000
Pediatric Speech Language Pathologist,958
Retail Sales Representative,810
Registered Nurse (RN),701
...,...
Maintenance Planner,10
Hospital Admissions Coordinator,10
Finance Manager,10
Saw Operator,10


### Train and test set split

In [57]:
# Keep only postings that have a description; the text models take the description as input.
df_pos = df_pos.dropna(subset=['Description'])

In [58]:
from sklearn.preprocessing import LabelBinarizer

# Column holding the model input text, and column holding the label to predict.
text_col = 'Description'
target_col = 'JobTitle'

# Fit the label encoder on every title in df_pos so both splits share one column layout.
label_as_binary = LabelBinarizer().fit(df_pos[target_col])

# 80/20 random split with a fixed seed; the test set is every row whose index
# was not drawn into the training sample.
training_set = df_pos[[text_col, target_col]].sample(frac=0.8, random_state=41)
in_training = df_pos.index.isin(training_set.index)
test_set = df_pos[~in_training][[text_col, target_col]]

# Indicator-matrix labels for each split, in the same order as the split's rows.
train__y_labels, test__y_labels = (
    label_as_binary.transform(split[target_col]) for split in (training_set, test_set)
)

In [59]:
# Sanity check: the two splits together account for every row of df_pos.
assert len(training_set) + len(test_set) == len(df_pos)

## Embedding with pooling and all words

What is a good size for the sequence_length? 

In [60]:
# Words per description (split on single spaces), summarised at the upper quantiles
# to see how long a token sequence must be to cover most postings.
description_word_counts = df_pos['Description'].str.split(' ').str.len()
description_word_counts.quantile([0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99])

0.50    225.00
0.60    285.00
0.70    334.00
0.80    386.00
0.90    516.00
0.95    607.00
0.99    942.94
Name: Description, dtype: float64

What is a good size for the vocabulary? 

In [70]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vocabulary on all descriptions to count the distinct tokens.
bow_transformer = CountVectorizer()
bow_transformer.fit(df_pos['Description'])

# Print total number of vocab words
distinct_token_count = len(bow_transformer.vocabulary_)
print(distinct_token_count)

57131


In [71]:
# Vocabulary cap and fixed token length per description. 516 is the 0.90 quantile of the
# words-per-description distribution printed in the quantile cell of this notebook.
vocab_size = 50000
sequence_length = 516

# Normalizes, splits and maps raw strings to integer token ids. output_sequence_length
# fixes the number of ids per sample because descriptions differ in length.
vectorize_layer = TextVectorization(
    name='Text_processing',
    output_mode='int',
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
    # pass standardize=<callable> here to replace the layer's default normalization
)

In [72]:
vectorize_layer.adapt(training_set[text_col])

In [73]:
sample_description = training_set[text_col].sample().iloc[0]
print(sample_description)
vectorize_layer(sample_description)

PPG: We protect and beautify the world. At PPG, we work every day to develop and deliver the paints, coatings and materials that our customers have trusted for more than 130 years. Through dedication and creativity, we solve our customers’ biggest challenges, collaborating closely to find the right path forward. With headquarters in Pittsburgh, we operate and innovate in more than 70 countries. We serve customers in construction, consumer products, industrial and transportation markets and aftermarkets. To learn more, visit www.ppg.com and follow @PPG on Twitter.About YouDoes the thought of sitting still all day make you want to scream? Are you a people person? Are you passionate about learning new things and sharing your new acquired knowledge with others?If you screamed “YES!” to each of those questions, you may be just who we are looking for to help our team “protect and beautify the world!”As a Sales Associate, you will be the reason our customers come back! They will be excited to

<tf.Tensor: shape=(516,), dtype=int64, numpy=
array([  105,    14,   821,     2,   580,     4,   186,    26,   105,
          14,    17,    99,    36,     3,   217,     2,   398,     4,
        1099,   952,     2,   291,    22,     9,    57,    30,   933,
           7,    35,   203,  1030,    44,    87,   405,     2,  1063,
          14,   641,     9,   445,  1142,   384,   967,   744,     3,
         167,     4,   350,   798,   120,    11,  1087,     8,  1077,
          14,   434,     2,  1102,     8,    35,   203,   986,   804,
          14,   511,    57,     8,   395,   972,    94,   453,     2,
         599,   841,     2,  1101,     3,   166,    35,   669,  1176,
           2,   527,   105,    23,  6596,  8227,     4,  1111,     6,
         978,  1397,    32,    36,    42,    10,   512,     3,  1542,
          13,    10,     5,    72,   423,    13,    10,   347,    81,
         464,    77,   396,     2,  1407,    21,    77,  1531,   234,
          11, 15574,    10,  1248,  1280,   

In [None]:
# for token in vectorize_layer(sample_description).numpy()[:20]:
#     print(f"{token} ---> ",vectorize_layer.get_vocabulary()[token])

### Modeling

### Descriptions squashed into 1 average embedding vector, size 32

In [79]:
embedding_dim = 32

# One output unit per distinct job title in df_pos.
n_job_titles = df_pos.JobTitle.nunique()

# Raw text -> token ids -> per-token embeddings -> mean over the sequence -> dense classifier.
classifier_layers = [
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, name="embedding"),
    GlobalAveragePooling1D(),
    Dense(1024, activation='elu', name='hidden_layer'),
    Dense(n_job_titles, name='output_layer', activation='softmax'),
]
model = tf.keras.Sequential(classifier_layers)

In [82]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Text_processing (TextVector  (None, 516)              0         
 ization)                                                        
                                                                 
 embedding (Embedding)       (None, 516, 32)           1600000   
                                                                 
 global_average_pooling1d_1   (None, 32)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 hidden_layer (Dense)        (None, 1024)              33792     
                                                                 
 output_layer (Dense)        (None, 352)               360800    
                                                                 
Total params: 1,994,592
Trainable params: 1,994,592
No

In [None]:
# Draw the layer graph of `model`, annotated with layer names, shapes and dtypes.
# (The original passed `model1`, which is not defined anywhere in the notebook.)
tf.keras.utils.plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True)

In [85]:
# Name the metrics explicitly. Keras auto-numbers unnamed metric objects when they are created
# more than once in a session (precision_1, recall_1, ...), while the checkpoint filename
# template in this notebook formats `val_precision` / `val_recall` by exact name.
model.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=[
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        'accuracy',
    ],
)

#### Creating checkpoints for model weights

In [91]:
# Save the weights (not the full model) at the end of every epoch. The filename embeds the
# epoch number plus validation loss / precision / recall, so those keys must be present
# in the training logs under exactly these names.
weights_path_template = 'Deep_model_weights/model.{epoch:02d}-val_loss{val_loss:.3f}-val_precision{val_precision:.3f}-val_recall{val_recall:.3f}.tf'
epoch_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=weights_path_template,
    save_weights_only=True,
    save_freq='epoch',
    verbose=1,
)
cp_callback = [epoch_checkpoint]

In [None]:
# from tensorflow import keras

# checkpoint_path = 'Deep_models_weights'
# callbacks  = [
#     keras.callbacks.ModelCheckpoint(
#         filepath=checkpoint_path, 
#         monitor='val_loss',
#         verbose=1,
#         save_best_only=True,
#         save_weights_only=True,
#         save_freq='epoch'),
#     keras.callbacks.EarlyStopping(
#         monitor='val_recall',
#         min_delta=0,
#         patience=20,
#         verbose=1)
#     ]

#### Model fit

In [92]:
%%time
# Train the classifier. The test split is passed as validation data, so the val_* values
# logged each epoch (and written into the checkpoint filenames) are measured on the test set.
validation_pair = (test_set[text_col], test__y_labels)
history = model.fit(
    x=training_set[text_col],
    y=train__y_labels,
    batch_size=1024,
    epochs=50,
    validation_data=validation_pair,
    callbacks=cp_callback,
    verbose=1,
)

Epoch 1/50
Epoch 00001: saving model to Deep_model_weights\model.01-val_loss3.914-val_precision0.991-val_recall0.066.tf
Epoch 2/50
Epoch 00002: saving model to Deep_model_weights\model.02-val_loss3.801-val_precision0.992-val_recall0.072.tf
Epoch 3/50
Epoch 00003: saving model to Deep_model_weights\model.03-val_loss3.688-val_precision0.992-val_recall0.078.tf
Epoch 4/50
Epoch 00004: saving model to Deep_model_weights\model.04-val_loss3.585-val_precision0.969-val_recall0.087.tf
Epoch 5/50
Epoch 00005: saving model to Deep_model_weights\model.05-val_loss3.485-val_precision0.961-val_recall0.088.tf
Epoch 6/50
Epoch 00006: saving model to Deep_model_weights\model.06-val_loss3.400-val_precision0.916-val_recall0.114.tf
Epoch 7/50
Epoch 00007: saving model to Deep_model_weights\model.07-val_loss3.317-val_precision0.924-val_recall0.131.tf
Epoch 8/50
Epoch 00008: saving model to Deep_model_weights\model.08-val_loss3.247-val_precision0.882-val_recall0.142.tf
Epoch 9/50
Epoch 00009: saving model to 

### !!!! ADD OR DELETE - Description words concatenated

In [None]:
embedding_dim = 32

# Variant of the classifier that keeps word order: per-token embeddings are flattened into
# one long vector instead of being averaged.
model2 = tf.keras.Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, name="embedding"),
    # (sequence_length, embedding_dim) -> (sequence_length * embedding_dim,)
    Reshape((embedding_dim * sequence_length, ), name='concat_words'),
    Dense(4096, activation='relu', name='hidden_layer_1'),
    # softmax makes the outputs class probabilities, which is what categorical_crossentropy
    # expects with its default from_logits=False; the layer originally emitted raw logits.
    Dense(df_pos.JobTitle.nunique(), name='output_layer', activation='softmax')
])

In [None]:
model2.summary()

In [None]:
tf.keras.utils.plot_model(model2, show_dtype=True, show_shapes=True, show_layer_names=True)

In [None]:
# Multi-class training setup: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model2.compile(
    loss=tf.keras.losses.categorical_crossentropy,
    optimizer=tf.optimizers.Adam(),
    metrics=['accuracy'],
)

In [None]:
%%time
# Short 10-epoch run of the concatenated-words model, validated on the test split.
# Note that this rebinds `history`.
history = model2.fit(
    x=training_set[text_col],
    y=train__y_labels,
    validation_data=(test_set[text_col], test__y_labels),
    batch_size=1024,
    epochs=10,
    verbose=1,
)

In [None]:
# Index of the highest-scoring output unit for each test description.
# NOTE(review): this uses `model`, not `model2`, although it follows the model2 cells — confirm.
class_scores = model.predict(test_set[text_col])
test_set['token_with_best_prediction'] = class_scores.argmax(axis=1)

In [None]:
# Highest output score for each test description (the value at the argmax position).
class_scores = model.predict(test_set[text_col])
test_set['prob_token_with_best_prediction'] = class_scores.max(axis=1)

In [None]:
test_set.head(20)

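### !!!! ADD OR DELETE - Adding additional features (besides text)

**Note:** the cells in this section use names such as `wine_reviews` and `calc_prediction_quality` that are not defined anywhere in this notebook. They appear to be carried over from a wine-review example and will not run as-is.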

#### Extracting Year column

In [None]:
from dateutil.parser import parse


def extract_year_from_title(title):
    """Return the year found in `title` by dateutil's fuzzy parser, as a string.

    Returns None when the text cannot be parsed as containing a date, when `title`
    is not a string (e.g. None or NaN), or when the parsed year is 1800 or earlier.
    """
    try:
        year = parse(title, fuzzy=True).year
    except (ValueError, OverflowError, TypeError):
        # ValueError covers dateutil's ParserError (no date found), OverflowError is raised
        # for out-of-range numbers, TypeError for non-string input. Anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate instead of being swallowed.
        return None
    return str(year) if year > 1800 else None

In [None]:
# NOTE(review): `wine_reviews` is not defined in any cell visible in this notebook — confirm its source.
# Spot-check the year extractor on one randomly drawn title.
random_row = wine_reviews.sample()
sample_title = random_row.title.iloc[0]
print(f'Title is: {sample_title}. Extracted year: {extract_year_from_title(sample_title)}')

In [None]:
wine_reviews['year'] = wine_reviews.title.apply(extract_year_from_title)
wine_reviews['year'].value_counts(dropna=False).head(10)

Is the year input informative? 

In [None]:
wine_reviews.groupby('year').points.describe().query('count > 20').sort_values(by='mean',ascending=False).head()

#### Preparing the input features

In [None]:
# Renumber rows 0..n-1 so the index-based join with the token feature frame lines up.
# drop=True discards the old index instead of inserting it as a column, which keeps this cell
# safe to re-run (a plain reset_index() inserts another index column on every run).
wine_reviews = wine_reviews.reset_index(drop=True)

In [None]:
description_tokens = vectorize_layer(wine_reviews[text_col])

In [None]:
# One column per token position, named w_1 .. w_<number of positions>.
n_token_positions = description_tokens.shape[1]
description_cols = ['w_' + str(position) for position in range(1, n_token_positions + 1)]
features_df = pd.DataFrame(data=description_tokens.numpy(), columns=description_cols)

In [None]:
features_df = features_df.join(wine_reviews[['points','price','country','year','variety','province']])
features_df.head()

In [None]:
# NOTE(review): `categorical_featurs` was not defined in any visible cell. The list below is
# inferred from the columns that are factorized and embedded further down — confirm.
categorical_featurs = ['country', 'year', 'variety', 'province']

# Missing categories become an explicit 'Unknown' level; missing prices get the mean price.
features_df[categorical_featurs] = features_df[categorical_featurs].fillna('Unknown')
features_df['price'] = features_df['price'].fillna(features_df['price'].mean())

In [None]:
# Replace each categorical column by integer codes for the embedding layers.
# Any remaining NaN is mapped to 'Unknown' first so it receives a regular non-negative code
# rather than factorize's -1 sentinel, which is not a valid embedding index.
# Each column, including `year`, is encoded exactly once.
for column in ('country', 'year', 'variety', 'province'):
    features_df[column] = pd.factorize(features_df[column].fillna('Unknown'))[0]

In [None]:
features_df.head()

In [None]:
features_df[categorical_featurs].apply(lambda x: pd.Series({'nunique': x.nunique(),
                                                            'max': x.max(),
                                                            'min': x.min()}))

In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras import layers, Model

In [None]:
def _embed_categorical(feature_input, feature_name, n_categories, n_dims):
    """Embed one integer-coded categorical input and flatten it to an (n_dims,) vector.

    feature_input: Keras Input of shape (1,) holding the category codes.
    feature_name:  used to build the layer names '<name>_embeddings' and 'concat_<name>'.
    n_categories:  size of the embedding table; codes must be smaller than this.
    n_dims:        width of the embedding vector.
    """
    embedded = layers.Embedding(n_categories, n_dims, name=f'{feature_name}_embeddings')(feature_input)
    return layers.Reshape((n_dims,), name=f'concat_{feature_name}')(embedded)


# Inputs: token ids of the description, one integer code per categorical feature, and the price.
description_input = Input(shape=(sequence_length,), dtype='int64', name='description')
year_input = Input(shape=(1,), name="year", dtype='int64')
country_input = Input(shape=(1,), name="country", dtype='int64')
province_input = Input(shape=(1,), name="province", dtype='int64')
variety_input = Input(shape=(1,), name="variety", dtype='int64')
price_input = Input(shape=(1,), name="price")

# Description: embed every token, then lay the embeddings end to end in one long vector.
word_features = layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length, name='word_embeddings')(description_input)
word_features = layers.Reshape((embedding_dim * sequence_length,), name='concat_words')(word_features)

# Categorical features. The table sizes are hard-coded; presumably they were picked as upper
# bounds from the nunique/max check on features_df — confirm when the data changes.
year_features = _embed_categorical(year_input, 'year', 100, 3)
country_features = _embed_categorical(country_input, 'country', 50, 2)
province_features = _embed_categorical(province_input, 'province', 500, 5)
variety_features = _embed_categorical(variety_input, 'variety', 1000, 4)

# Merge all available features into a single large vector via concatenation
feature_vector = layers.concatenate([word_features, year_features, country_features, province_features, variety_features, price_input])
x = layers.Dropout(0.2)(feature_vector)
x = layers.Dense(256, activation='relu', name='Hidden')(x)
# Output: a single linear unit.
predictions = layers.Dense(1, name="output")(x)

# End-to-end model from the six inputs to the single output.
# Note: this rebinds `model`, replacing the text classifier defined earlier in the notebook.
model = Model(
    inputs=[description_input, year_input, country_input, province_input, variety_input, price_input],
    outputs=predictions,
)

In [None]:
tf.keras.utils.plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True)

In [None]:
model.summary()

In [None]:
# 80/20 random split of the feature table with a fixed seed; test rows are those whose
# index was not sampled into the training set. This rebinds training_set / test_set.
training_set = features_df.sample(frac=0.8, random_state=42)
held_out = ~features_df.index.isin(training_set.index)
test_set = features_df[held_out]

In [None]:
assert(len(training_set) + len(test_set) == len(wine_reviews))

In [None]:
model.compile(
    optimizer=tf.optimizers.Adam(),
    loss='mean_absolute_error')

In [None]:
%%time
def _as_model_inputs(frame):
    """Map a feature frame to a dict of NumPy arrays keyed by the model's input-layer names."""
    inputs = {'description': frame[description_cols].values}
    for feature in ('year', 'country', 'province', 'variety', 'price'):
        inputs[feature] = frame[feature].values
    return inputs


# Validation inputs are now passed by input name as well (they were positional), so training
# and validation are wired to the same inputs regardless of the order of the model's input list.
history = model.fit(
    _as_model_inputs(training_set),
    {"output": training_set['points'].values},
    validation_data=(_as_model_inputs(test_set), test_set['points'].values),
    epochs=10,
    batch_size=512,
    verbose=1)

In [None]:
history.history

In [None]:
# Predict for the test rows. Inputs are passed as NumPy arrays keyed by input-layer name,
# matching how the model was fitted. The model has a single output unit, so predict returns
# an (n, 1) array; ravel() turns it into one value per row before storing it as a column.
test_inputs = {'description': test_set[description_cols].values}
for feature in ('year', 'country', 'province', 'variety', 'price'):
    test_inputs[feature] = test_set[feature].values
test_set['dnn_prediction'] = model.predict(test_inputs).ravel()

In [None]:
# Compare predictions with 'points', the column this model was fitted on. `target_col` still
# holds 'JobTitle' from the classification section, which is not a column of this test_set.
# NOTE(review): `calc_prediction_quality` is not defined in any visible cell — confirm its source.
calc_prediction_quality(test_set, 'dnn_prediction', 'points')