In [4]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import utils

### Prepare data

In [None]:
# File stems (no extension) used when the trained models are saved under models/.
pretrained_embeddings_model_name = 'ml_projects_vector_embeddings_model'
custom_embeddings_model_name = 'ml_projects_custom_embeddings_model'

# File stem used when the fitted preprocessing ColumnTransformer is dumped under models/.
pipeline_name = 'pipeline'

In [5]:
# `json` is needed here; the only other `import json` in this notebook is in the
# final cell, so without this import a fresh Restart & Run All fails with NameError.
import json

# The dataset to load is selected by the 'dataset_name' key of config.json.
with open('config.json', 'r') as f:
    dataset_name = json.load(f)['dataset_name']

data = utils.load_bids_df(dataset_name)
# Missing bid descriptions become empty strings so the text inputs contain no NaN.
data['bid_description'] = data['bid_description'].fillna('')

In [6]:
# The two free-text inputs and the target column.
X_1, X_2 = data['project_description'].values, data['bid_description'].values
y = data['bid_award_status'].values

# Per-row weights: rows with y == 0 get 1, rows with y == 1 get pos_weight + 1.
# assumes bid_award_status holds 0/1 values — TODO confirm
pos_weight = len(y) / sum(y)
sample_weight = y * pos_weight + 1.

In [7]:
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler

# Columns that are reduced to a two-level flag.
cat_columns = ['bid_quality', 'bid_communication', 'bid_professionalism']

# Split each column into 2 equal-width bins (ordinal codes) and store the codes as booleans.
k_bins = KBinsDiscretizer(2, encode='ordinal',strategy='uniform', subsample=None)
data_cat = k_bins.fit_transform(data[cat_columns]).astype(bool)

# NOTE: this overwrites the original columns in `data`, so re-running the cell
# bins values that were already binned.
for i, col in enumerate(cat_columns):
    data[col] = data_cat[:, i]

df_prepared = utils.prepare_df(data)
# Remove the project identifier and the target from the feature frame.
df_prepared = df_prepared.drop(['project_id', 'bid_award_status'], axis=1)

cols_num = df_prepared.select_dtypes(include=['number']).columns
cols_cat = df_prepared.select_dtypes(exclude=['number']).columns

# The scaler is fit on every row of df_prepared, i.e. before any train/val/test split.
scaler = StandardScaler()
X_num = scaler.fit_transform(df_prepared[cols_num])

# Tabular feature matrix: scaled numeric columns followed by the non-numeric columns, as float32.
X_3 = np.append(X_num, df_prepared[cols_cat], axis=1).astype('float32')

#### basic evaluation functions

In [None]:
def evaluate_model(model, test_set):
    """Evaluate `model` on `test_set` and print a one-line summary.

    The first value returned by `model.evaluate` is printed as the loss; every
    following value is multiplied by 100 and printed as a percentage
    (accuracy, precision, recall, in that order).

    Returns:
        None. The summary is written to stdout.
    """
    report = "loss: {:.2f}, accuracy: {:.2f}%, precision: {:.2f}%, recall: {:.2f}%"
    scores = np.array(model.evaluate(test_set))
    # Everything after the loss is a ratio; convert it to percent.
    scores[1:] *= 100
    print(report.format(*scores))

In [None]:
def get_accuracies(y,pred,project_ids,to=2):
    """Per-project ranking accuracy of bid predictions.

    Within each project the bids are ranked by predicted score. A project is a
    top-1 hit when its highest-scored bid has a truthy label, and a top-`to`
    hit when any of its `to` highest-scored bids has a truthy label.

    A project is skipped when its labels sum to zero, when it has fewer than
    `to` bids, or when its predictions sum to zero.

    Args:
        y: labels, one per bid (array-like).
        pred: predicted scores, one per bid; shape (n,) or (n, 1).
        project_ids: project identifier of each bid, aligned with `y` and `pred`.
        to: size of the top-k window (default 2).

    Returns:
        Tuple `(top_accuracy, top_k_accuracy)` in percent. Both values are NaN
        when no project is eligible.
    """
    # Accept plain lists as well as arrays; the boolean mask below needs ndarrays.
    y = np.asarray(y)
    pred = np.asarray(pred)
    project_ids = np.asarray(project_ids)

    top_hits, top_k_hits = [], []
    for project_id in np.unique(project_ids):
        in_project = project_ids == project_id

        y_project = y[in_project].reshape(-1)
        pred_project = pred[in_project].reshape(-1)

        if y_project.sum() == 0 or len(y_project) < to or pred_project.sum() == 0:
            continue

        # Indices of the `to` highest predictions, best first.
        best = np.argsort(pred_project)[::-1][:to]

        # Best bid according to the predictions.
        top_hits.append(y_project[best[0]])

        # Any awarded bid among the top `to` predictions.
        top_k_hits.append(any(y_project[best]))

    # Avoid np.mean([]) and its RuntimeWarning; NaN is still what callers get.
    if not top_hits:
        return float('nan'), float('nan')

    top_accuracy = np.mean(top_hits) * 100
    top_k_accuracy = np.mean(top_k_hits) * 100

    return top_accuracy, top_k_accuracy

### Train model

In [None]:
# Text-only dataset; each element is ((project_description, bid_description), label, sample_weight).
ds = tf.data.Dataset.from_tensor_slices(((X_1, X_2, ), y, sample_weight))

# 70% train / 15% validation / remainder test, taken in row order (nothing is shuffled before the split).
train_size,val_size = int(.7*len(data)), int(.15*len(data))

train_ds = ds.take(train_size).shuffle(1000).batch(32).prefetch(1)
val_ds = ds.skip(train_size)

# test_ds has to be derived before val_ds is truncated by take() on the next line.
test_ds = val_ds.skip(val_size).batch(32).prefetch(1)
val_ds = val_ds.take(val_size).batch(32).prefetch(1)

In [None]:
import tensorflow_hub as hub

# Two separate, non-trainable instances of the same TF-Hub text embedding layer
# (scalar string input, 128-dimensional output), one per text input.
embed_1 = keras.models.Sequential([
  hub.KerasLayer('https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1',
                 dtype=tf.string, input_shape=[], output_shape=[128],
                 name='embed_1', trainable=False),
])
embed_2 = keras.models.Sequential([
  hub.KerasLayer('https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1',
                 dtype=tf.string, input_shape=[], output_shape=[128],
                 name='embed_2', trainable=False),
])

In [None]:
# Join the two text embeddings into a single feature vector.
concat = keras.layers.Concatenate()([embed_1.output, embed_2.output])

x = keras.layers.Dropout(.2)(concat)

# Small dense head ending in a single sigmoid unit.
x = keras.layers.Dense(32, activation='relu')(x)
x = keras.layers.Dense(16, activation='relu')(x)
outputs = keras.layers.Dense(1, activation='sigmoid')(x)

model = keras.models.Model(inputs=[embed_1.input, embed_2.input,],
                           outputs=[outputs])

# weighted_metrics is explicitly empty: none of the reported metrics use sample_weight.
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy', keras.metrics.Precision(),
                       keras.metrics.Recall()], weighted_metrics=[])

# Stop after 16 epochs without val_accuracy improvement and roll back to the best weights.
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=16,
                                           restore_best_weights=True)
# NOTE(review): lr_scheduler is created here but no fit() call in this notebook
# passes it in `callbacks`, so the learning rate is never reduced.
lr_scheduler = keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy',
                                                 patience=3, factor=.8)

In [None]:
# Train for up to 100 epochs; only early stopping is attached as a callback.
model.fit(train_ds, epochs=100, validation_data=val_ds,
          callbacks=[early_stop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


<keras.src.callbacks.History at 0x7a00b45613c0>

In [None]:
# Loss / accuracy / precision / recall of the text-only model on the held-out test split.
evaluate_model(model, test_ds)

loss: 1.69, accuracy: 71.99%, precision: 11.88%, recall: 34.13%


In [None]:
# Rows of `data` after the train and validation slices, i.e. the rows test_ds was built from.
test_projects = data.iloc[train_size+val_size:].reset_index(drop=True)

pred = model.predict(test_ds)

# Per-project ranking accuracy with the default window (to=2).
get_accuracies(test_projects['bid_award_status'].values, pred,
               test_projects['project_id'].values) # top 1 and top 2



(16.129032258064516, 29.838709677419356)

In [None]:
# Same ranking check with a window of 3; projects with fewer than 3 bids are skipped.
get_accuracies(test_projects['bid_award_status'].values, pred,
               test_projects['project_id'].values, to=3) # top 1 and top 3

(16.129032258064516, 42.74193548387097)

#### Combined model

In [None]:
# Dataset with the tabular features added; each element is
# ((project_description, bid_description, tabular_row), label, sample_weight).
ds = tf.data.Dataset.from_tensor_slices(((X_1, X_2, X_3), y, sample_weight))

# Same 70% / 15% / remainder split by row order as for the text-only dataset.
train_size,val_size = int(.7*len(data)), int(.15*len(data))

train_ds = ds.take(train_size).shuffle(1000).batch(32).prefetch(1)
val_ds = ds.skip(train_size)

# test_ds has to be derived before val_ds is truncated by take() on the next line.
test_ds = val_ds.skip(val_size).batch(32).prefetch(1)
val_ds = val_ds.take(val_size).batch(32).prefetch(1)

In [None]:
# Freeze the already-trained text-only model; only the new layers below are trained.
model.trainable = False

# One input slot per column of the tabular feature matrix. Taking the width
# from X_3 keeps the model in sync if the prepared columns change (was a
# hard-coded 14).
input_2 = keras.layers.Input(shape=[X_3.shape[1]])

# The text model's single sigmoid output is used as one extra feature next to the tabular ones.
concat = keras.layers.Concatenate()([model.output, input_2])

# Dense(8) has no activation argument, so it is a linear projection.
x = keras.layers.Dense(8)(concat)
output = keras.layers.Dense(1, activation='sigmoid')(x)

combined_model = keras.models.Model(inputs=[embed_1.input, embed_2.input, input_2],
                                    outputs=[output])

In [None]:
# Same loss, optimizer settings and metrics as the text-only model.
combined_model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy', keras.metrics.Precision(),
                       keras.metrics.Recall()], weighted_metrics=[])

# Fresh callback instances for this training run.
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=16,
                                           restore_best_weights=True)
# NOTE(review): lr_scheduler is created here but no fit() call in this notebook
# passes it in `callbacks`.
lr_scheduler = keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy',
                                                 patience=3, factor=.8)

In [None]:
# Train the combined model for up to 100 epochs with early stopping only.
combined_model.fit(train_ds, epochs=100, validation_data=val_ds,
          callbacks=[early_stop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


<keras.src.callbacks.History at 0x7a00b31cfeb0>

In [None]:
# Loss / accuracy / precision / recall of the combined model on the held-out test split.
evaluate_model(combined_model, test_ds)

loss: 1.83, accuracy: 76.86%, precision: 15.89%, recall: 38.10%


In [None]:
# Rows of `data` after the train and validation slices, i.e. the rows test_ds was built from.
test_projects = data.iloc[train_size+val_size:].reset_index(drop=True)

pred = combined_model.predict(test_ds)

# Per-project ranking accuracy with the default window (to=2).
get_accuracies(test_projects['bid_award_status'].values, pred,
               test_projects['project_id'].values) # top 1 and top 2



(21.774193548387096, 35.483870967741936)

In [None]:
# Same ranking check with a window of 3; projects with fewer than 3 bids are skipped.
get_accuracies(test_projects['bid_award_status'].values, pred,
               test_projects['project_id'].values, to=3) # top 1 and top 3

(21.774193548387096, 52.41935483870967)

### Saving model and pipeline

In [None]:
# Persist the combined model (frozen text model + tabular head) as an HDF5 file.
combined_model.save(f'models/{pretrained_embeddings_model_name}.h5')

  saving_api.save_model(


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Bundle the tabular preprocessing steps into one transformer. The output column
# order follows the order of the entries below.
full_pipeline = ColumnTransformer([
    ('scaler', scaler, cols_num),
    ('sealed', 'passthrough', ['bid_sealed']),
    ('cat_pipeline', k_bins, cat_columns),
    ('greetings', 'passthrough', ['bid_greetings'])
])
# NOTE: fitting happens on df_prepared, whose cat_columns were already binned
# to booleans in `data` before utils.prepare_df was called.
full_pipeline = full_pipeline.fit(df_prepared)

In [None]:
# Sanity check: elementwise comparison of the pipeline output with the manually
# built X_3. The cell displays the full boolean array, not a single verdict.
full_pipeline.transform(df_prepared).astype('float32') == X_3 # for the whole dataset

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [None]:
import joblib

# NOTE: joblib.dump writes a pickle-based binary file; despite the '.json'
# suffix the result is not JSON and must be read back with joblib.load.
joblib.dump(full_pipeline, f'models/{pipeline_name}.json')

['models/pipeline.json']

### Custom embeddings

In [None]:
# Text-only dataset again, this time left unbatched and unshuffled: batching
# and preprocessing are applied later, when the *_set pipelines are built.
ds = tf.data.Dataset.from_tensor_slices(((X_1, X_2), y, sample_weight))

# 70% train / 15% validation / remainder test, taken in row order.
train_size,val_size = int(.7*len(data)), int(.15*len(data))

train_ds = ds.take(train_size)
val_ds = ds.skip(train_size)

# test_ds has to be derived before val_ds is truncated by take() on the next line.
test_ds = val_ds.skip(val_size)
val_ds = val_ds.take(val_size)

In [None]:
def preprocess_batch(X_batch):
    """Turn a batch of raw strings into a dense tensor of word tokens.

    Each string is cut with `tf.strings.substr(..., 0, 300)`, `<br>` tags and
    every character other than ASCII letters and apostrophes are replaced by
    spaces, and the result is split on whitespace. Shorter rows are padded
    with b"<pad>" so the batch becomes a rectangular tensor.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)

    return tokens.to_tensor(default_value=b"<pad>")

def preprocess(X_batch, y_batch, sample_weight):
    """Tokenize the two text inputs of a dataset element.

    The first two entries of `X_batch` are run through `preprocess_batch`;
    any further entries, the labels and the sample weights pass through
    unchanged.
    """
    project_tokens = preprocess_batch(X_batch[0])
    bid_tokens = preprocess_batch(X_batch[1])
    inputs = (project_tokens, bid_tokens) + tuple(X_batch[2:])

    return inputs, y_batch, sample_weight

In [None]:
from collections import Counter

# Count token frequencies over both text inputs.
# NOTE: this iterates `ds` (all rows), so validation and test text also feed
# the vocabulary, and the b"<pad>" filler added by preprocess_batch is counted
# like any other token.
vocabulary = Counter()
for X_batch, y_batch, _ in ds.batch(32).map(preprocess):

    for project_description in X_batch[0]:
        vocabulary.update(list(project_description.numpy()))
    for bid_description in X_batch[1]:
        vocabulary.update(list(bid_description.numpy()))

In [None]:
# Keep only the `vocab_size` most frequent tokens, most frequent first.
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

In [None]:
# Token -> integer id, where the id is the token's rank in truncated_vocabulary.
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"I think it's impoossible".split():
    # Use dict.get's default for unknown words. `get(word) or vocab_size` would
    # also replace the valid id 0 (falsy) with the out-of-vocabulary id.
    print(word_to_id.get(word, vocab_size))

1
348
784
10000


In [None]:
# Static lookup table: known tokens map to their rank in truncated_vocabulary;
# unknown tokens are assigned to one of `num_oov_buckets` extra ids after it.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [None]:
# Quick check of the table on a sentence that contains a misspelled (unknown) word.
table.lookup(tf.constant([b"I think it's impoossible".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[    1,   348,   784, 10737]])>

In [None]:
# Id that the lookup table assigns to the b"<pad>" filler token.
pad_token = table.lookup(tf.constant([b"<pad>".split()]))[0][0]

def encode_words(X_batch, y_batch, sample_weight):
  """Map the two token tensors of a dataset element to integer ids via `table`.

  Entries of `X_batch` beyond the first two, the labels and the sample
  weights are passed through unchanged.
  """
  project_ids = table.lookup(X_batch[0])
  bid_ids = table.lookup(X_batch[1])
  encoded_inputs = (project_ids, bid_ids) + tuple(X_batch[2:])

  return encoded_inputs, y_batch, sample_weight

# Training pipeline: batch, tokenize, encode to ids, prefetch.
train_set = train_ds.batch(32).map(preprocess).map(encode_words).prefetch(1)

In [None]:
# Validation and test pipelines use the same batch -> tokenize -> encode -> prefetch chain as training.
val_set = val_ds.batch(32).map(preprocess).map(encode_words).prefetch(1)
test_set = test_ds.batch(32).map(preprocess).map(encode_words).prefetch(1)

In [None]:
# Trainable embeddings, one table per text input. The table size covers the
# truncated vocabulary plus the out-of-vocabulary buckets; mask_zero=True
# treats id 0 as padding.
embed_size = 64
embed_1 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size+num_oov_buckets, embed_size,
                                 mask_zero=True, input_shape=[None]),
])

embed_2 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size+num_oov_buckets, embed_size,
                                 mask_zero=True, input_shape=[None]),
])

# axis=1 joins the two embedded sequences end to end along the sequence axis.
concat = keras.layers.Concatenate(axis=1,
                                  name="concat_1")([embed_1.output, embed_2.output])

# Two stacked GRUs: the first returns the full sequence, the second only its final state.
x = keras.layers.GRU(128, return_sequences=True, name='gru_1.2', recurrent_dropout=.3)(concat)
x = keras.layers.GRU(64, name='gru_2', recurrent_dropout=.3)(x)

output = keras.layers.Dense(1, activation='sigmoid')(x)

# NOTE: rebinds `model`, `embed_1` and `embed_2`, replacing the hub-based text model defined earlier.
model = keras.models.Model(inputs=[embed_1.input, embed_2.input], outputs=[output])

model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()],
              weighted_metrics=[])



In [None]:
# Layer-by-layer overview of the custom-embedding model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 embedding_input (InputLaye  [(None, None)]               0         []                            
 r)                                                                                               
                                                                                                  
 embedding_1_input (InputLa  [(None, None)]               0         []                            
 yer)                                                                                             
                                                                                                  
 embedding (Embedding)       (None, None, 64)             704000    ['embedding_input[0][0]']     
                                                                                              

In [None]:
# NOTE(review): patience (10) is larger than the number of epochs (6), so this
# callback can never stop training early — confirm whether the best weights are
# still restored in the Keras version in use.
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10,
                                           restore_best_weights=True)

model.fit(train_set, validation_data=val_set, epochs=6, callbacks=[early_stop])

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.src.callbacks.History at 0x793d0fd1f190>

In [None]:
# Loss / accuracy / precision / recall of the custom-embedding model on the test split.
evaluate_model(model, test_set)

loss: 2.30, accuracy: 63.41%, precision: 11.41%, recall: 46.83%


In [None]:
# Rows of `data` after the train and validation slices, i.e. the rows test_set was built from.
test_projects = data.iloc[train_size+val_size:].reset_index(drop=True)

pred = model.predict(test_set)

# Per-project ranking accuracy with the default window (to=2).
get_accuracies(test_projects['bid_award_status'].values, pred,
               test_projects['project_id'].values) # top 1 and top 2



(25.0, 35.483870967741936)

In [None]:
# Same ranking check with a window of 3; projects with fewer than 3 bids are skipped.
get_accuracies(test_projects['bid_award_status'].values, pred,
               test_projects['project_id'].values, to=3) # top 1 and top 3

(25.0, 45.96774193548387)

In [None]:
# Dataset with the tabular features added; each element is
# ((project_description, bid_description, tabular_row), label, sample_weight).
ds = tf.data.Dataset.from_tensor_slices(((X_1, X_2, X_3), y, sample_weight))

# 70% train / 15% validation / remainder test, taken in row order.
train_size,val_size = int(.7*len(data)), int(.15*len(data))

train_ds = ds.take(train_size)
val_ds = ds.skip(train_size)

# test_ds has to be derived before val_ds is truncated by take() on the next line.
test_ds = val_ds.skip(val_size)
val_ds = val_ds.take(val_size)

# preprocess and encode_words only touch the first two inputs; the tabular
# rows are forwarded unchanged as the third input.
train_set = train_ds.batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

val_set = val_ds.batch(32).map(preprocess).map(encode_words).prefetch(1)
test_set = test_ds.batch(32).map(preprocess).map(encode_words).prefetch(1)

In [None]:
# Freeze the already-trained custom-embedding model; only the new layers below are trained.
model.trainable = False

# One input slot per column of the tabular feature matrix. Taking the width
# from X_3 keeps the model in sync if the prepared columns change (was a
# hard-coded 14).
input_2 = keras.layers.Input(shape=[X_3.shape[1]])

# The text model's single sigmoid output is used as one extra feature next to the tabular ones.
concat = keras.layers.Concatenate()([model.output, input_2])

# Dense(8) has no activation argument, so it is a linear projection.
x = keras.layers.Dense(8)(concat)
output = keras.layers.Dense(1, activation='sigmoid')(x)

combined_model = keras.models.Model(inputs=[embed_1.input, embed_2.input, input_2],
                                    outputs=[output])

In [None]:
# Same loss and metrics as before, with a smaller learning rate (1e-4) for the combined head.
combined_model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=1e-4),
              metrics=['accuracy', keras.metrics.Precision(),
                       keras.metrics.Recall()], weighted_metrics=[])

In [None]:
# NOTE(review): patience (10) is larger than the number of epochs (5), so this
# callback can never stop training early — confirm whether the best weights are
# still restored in the Keras version in use.
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10,
                                           restore_best_weights=True)

combined_model.fit(train_set, validation_data=val_set, epochs=5,
                   callbacks=[early_stop])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x793d1296b490>

In [None]:
# Loss / accuracy / precision / recall of the combined custom-embedding model on the test split.
evaluate_model(combined_model, test_set)

loss: 1.89, accuracy: 53.52%, precision: 9.45%, recall: 50.00%


In [None]:
# Rows of `data` after the train and validation slices, i.e. the rows test_set was built from.
test_projects = data.iloc[train_size+val_size:].reset_index(drop=True)

pred = combined_model.predict(test_set)

# Per-project ranking accuracy with the default window (to=2).
get_accuracies(test_projects['bid_award_status'].values, pred,
               test_projects['project_id'].values) # top 1 and top 2



(16.129032258064516, 27.419354838709676)

In [None]:
# Same ranking check with a window of 3; projects with fewer than 3 bids are skipped.
get_accuracies(test_projects['bid_award_status'].values, pred,
               test_projects['project_id'].values, to=3) # top 1 and top 3

(16.129032258064516, 39.516129032258064)

### Saving model and pipeline

In [None]:
# Persist the combined custom-embedding model as an HDF5 file.
combined_model.save(f'models/{custom_embeddings_model_name}.h5')

  saving_api.save_model(


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Rebuilds the same ColumnTransformer as in the first "Saving model and pipeline"
# section, so this section can be run on its own.
full_pipeline = ColumnTransformer([
    ('scaler', scaler, cols_num),
    ('sealed', 'passthrough', ['bid_sealed']),
    ('cat_pipeline', k_bins, cat_columns),
    ('greetings', 'passthrough', ['bid_greetings'])
])
# NOTE: fitting happens on df_prepared, whose cat_columns were already binned
# to booleans in `data` before utils.prepare_df was called.
full_pipeline = full_pipeline.fit(df_prepared)

In [None]:
import joblib

# NOTE: writes to the same path as the earlier dump and overwrites it. The file
# is a joblib (pickle-based) binary despite the '.json' suffix.
joblib.dump(full_pipeline, f'models/{pipeline_name}.json')

['models/pipeline.json']

In [None]:
import json

# Persist the truncated vocabulary as a JSON list of strings.
# NOTE(review): the entries come from tensor `.numpy()` values; if they are
# bytes, str(elem) stores the repr (e.g. "b'word'") instead of the decoded
# text — confirm that the code loading this file expects that form.
with open('models/vocabulary.json', 'w') as f:
    json.dump([str(elem) for elem in truncated_vocabulary], f)