# TF/IDF and Embedding Approach

**Purpose**: This notebook concatenates TF/IDF vectors with embedding vectors from a TensorFlow Hub model to create a combined model that uses both sources of data. We found that this model did not outperform our standard TF/IDF model, so we did not pursue it further.

**Note**: To run this notebook, you must have previously preprocessed the email data. To preprocess the email data and pickle it for later use, run the `tfidf-model` ipynb file (located in `../tfidf_models`) through its preprocessing steps.

#### Authors: Jake Epstein & Matt Kenney

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Dropout, Flatten, Input
import tensorflow_hub as hub

import matplotlib.pyplot as plt

# Report library versions and hardware so results can be tied to an environment.
tf_version = tf.__version__
print("Version: ", tf_version)

eager_enabled = tf.executing_eagerly()
print("Eager mode: ", eager_enabled)

hub_version = hub.__version__
print("Hub version: ", hub_version)

# An empty device list is falsy, which selects the "NOT AVAILABLE" message.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

Version:  2.0.1
Eager mode:  True
Hub version:  0.7.0
GPU is available


In [2]:
import sys
sys.path.insert(1, '../nlp_engine')
from MLFunctions import PrintDot, plot_history, clear_memory, test_with_uncertainty, predict_with_uncertainty, get_monte_carlo_accuracy

#### Import Data from pickle, put into correct format

In [3]:
# Load the preprocessed bug reports from the pickle written by the preprocessing notebook.
# NOTE: unpickling can execute arbitrary code — only load pickles produced by this project.
preprocessed_bugs_path = "../data/pickles/preprocessed_bugs.pkl"
assigned_bugs = pd.read_pickle(preprocessed_bugs_path)

In [4]:
# Show the record at position 4 of the 'combined' text column as a quick sanity check.
combined_text = assigned_bugs['combined']
combined_text.iloc[4]

'v  dx  api  automation  need  description  fix swagger  generator  use  automation  short  description  description  exist  v  dx  api  proper  short  description  assignment  v  list  assignment  case  view  v  view  metadata  give  case  note  properly  openapi  tab  private  edit  rest  service  rule  questionsave  it  refresh'

In [5]:
# Column of `assigned_bugs` whose text is used for both the TF/IDF and embedding features.
category = "combined"

#### Set up TF/IDF and Bytes Literal Text (to be fed into Google Hub Model)

In [6]:
# One-hot encode the labels: one indicator column per distinct backlog id.
backlog_ids = assigned_bugs['backlog_id']
backlog_labels = pd.get_dummies(backlog_ids)

In [7]:
# Cast the selected text column to a bytes-string dtype ('|S'); this series is later
# split into the embedding-branch features.
# NOTE(review): presumably the text is ASCII-only after preprocessing — confirm, since
# a bytes cast may fail on characters that cannot be encoded.
bytes_literal_text = assigned_bugs[category].astype('|S') # Bytes Literal

In [8]:
# Split the TF/IDF text, the bytes-literal (embedding) text and the labels in ONE call,
# so all three splits are row-aligned by construction rather than by relying on two
# separate calls sharing the same random_state.
# A fixed seed makes the split reproducible under Restart Kernel -> Run All; change it
# here if a different partition is wanted.
seed = 42
(train_tf_features, test_tf_features,
 train_em_features, test_em_features,
 train_labels, test_labels) = train_test_split(
    assigned_bugs[category],
    bytes_literal_text,
    backlog_labels,
    test_size=0.2,
    random_state=seed)

In [9]:
# 4. Use a TF/IDF Vectorizer to convert plain text descriptions into TF/IDF vectors.
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True,
                        binary=False,
                        min_df=3,
                        max_df=0.5,
                        norm='l2',
                        ngram_range=(1, 2),
                        lowercase=True)

# Fit the vectorizer on the training text only, then densify into a DataFrame.
train_tf_features = pd.DataFrame(tfidf_vectorizer.fit_transform(train_tf_features).toarray())

# Only transform (don't fit) the held-out TEST text to emulate real-world predictions.
# Fix: this previously transformed `train_em_features`, so the "test" TF/IDF matrix was
# actually built from training rows and did not line up with `test_labels`.
test_tf_features = pd.DataFrame(tfidf_vectorizer.transform(test_tf_features).toarray())

train_tf_features.shape

(4347, 16675)

In [10]:
# Cast the TF/IDF feature frames and the one-hot label frames to float32.
train_tf_features, test_tf_features = (
    frame.astype('float32') for frame in (train_tf_features, test_tf_features))
train_labels, test_labels = (
    frame.astype('float32') for frame in (train_labels, test_labels))

# The embedding features (train_em_features / test_em_features) are deliberately left
# as bytes literals, since this is what the Google Hub model expects; they are not
# wrapped in DataFrames.

#### Define and build model

In [11]:
def build_entire(embedding_model_url, emfeatures, tffeatures, labels, optimizer, lr, activation, embedding_layer_size, layer1_size, layer2_size=None, layer3_size=None, dropout_rate=0.3, mc=False):
    """Build and compile a two-input, two-output Keras model.

    The model takes a string input that is passed through a frozen
    (``trainable=False``) TensorFlow Hub layer, and an auxiliary numeric input
    for the TF/IDF features. It produces two softmax outputs:

    * ``embedding_output`` - one Dense layer on top of the hub embedding only.
    * ``main_output`` - a Dropout/Dense stack on top of the hub embedding
      concatenated with the auxiliary input.

    Args:
        embedding_model_url: Handle passed to ``hub.KerasLayer``.
        emfeatures: Unused by this function; kept for interface compatibility.
        tffeatures: Object exposing ``.keys()`` (e.g. a DataFrame); the number of
            keys sets the width of the auxiliary input.
        labels: Object exposing ``.keys()``; the number of keys sets the width of
            both softmax outputs.
        optimizer: ``'adam'`` or ``'rmsprop'``.
        lr: Learning rate handed to the optimizer.
        activation: Activation used by every hidden Dense layer.
        embedding_layer_size: Units in the hidden layer of the embedding branch.
        layer1_size: Units in the first hidden layer of the combined branch.
        layer2_size: Optional units for a second hidden layer (skipped if falsy).
        layer3_size: Optional units for a third hidden layer (skipped if falsy).
        dropout_rate: Rate used by every Dropout layer.
        mc: Forwarded as ``trainable`` to every Dropout layer.
            NOTE(review): Keras decides whether dropout is applied through the
            ``training`` call argument, not ``trainable`` - confirm this flag
            has the intended Monte Carlo dropout effect.

    Returns:
        The compiled ``Model``, or ``None`` (after printing an error) when
        ``optimizer`` is not a recognised name.
    """
    # Fail fast: reject an unknown optimizer before the (expensive) hub model is loaded.
    if optimizer not in ('adam', 'rmsprop'):
        print("ERROR: No optimizer passed")
        return None

    n_classes = len(labels.keys())

    main_input = Input(shape=[], dtype=tf.string, name='main_input')
    hub_layer = hub.KerasLayer(embedding_model_url, input_shape=[], dtype=tf.string, trainable=False)
    hub_out = hub_layer(main_input)

    auxiliary_input = Input(shape=(len(tffeatures.keys()),), name='aux_input')

    ## Embedding Output
    emb_x = Dense(embedding_layer_size, activation=activation)(hub_out)
    embedding_output = Dense(n_classes, activation='softmax', name='embedding_output')(emb_x)

    ## Combined Output
    x = keras.layers.concatenate([hub_out, auxiliary_input])

    x = Dropout(dropout_rate, trainable=mc)(x)
    x = Dense(layer1_size, activation=activation)(x)

    # Optional deeper layers, each preceded by its own Dropout.
    for hidden_size in (layer2_size, layer3_size):
        if hidden_size:
            x = Dropout(dropout_rate, trainable=mc)(x)
            x = Dense(hidden_size, activation=activation)(x)

    x = Dropout(dropout_rate, trainable=mc)(x)
    main_output = Dense(n_classes, activation='softmax', name='main_output')(x)

    model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output, embedding_output])

    # Parameters
    if optimizer == 'adam':
        optimizer = tf.keras.optimizers.Adam(lr)
    else:
        # Fix: was `tf.keras.optifmizers`, which raised AttributeError for 'rmsprop'.
        optimizer = tf.keras.optimizers.RMSprop(lr)

    model.compile(loss='kullback_leibler_divergence',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    return model

In [12]:
# Deallocate any model left over from an earlier run of this cell before allocating a
# new one. On a fresh kernel `model` is not defined yet, so fall back to None.
model = globals().get('model', None)
clear_memory(model) # Clear VRAM or RAM

# Hyper-parameters for this run, gathered in one place.
build_kwargs = dict(
    embedding_model_url="https://tfhub.dev/google/universal-sentence-encoder/4",
    emfeatures=train_em_features,
    tffeatures=train_tf_features,
    labels=train_labels,
    optimizer='adam',
    lr=0.0001,
    activation='tanh',
    embedding_layer_size=256,
    layer1_size=2048,
    layer2_size=512,
    layer3_size=None,
    dropout_rate=0.3,
    mc=True,
)
model = build_entire(**build_kwargs)

In [13]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of the model built above.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        (None, 512)          256797824   main_input[0][0]                 
__________________________________________________________________________________________________
aux_input (InputLayer)          [(None, 16675)]      0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 17187)        0           keras_layer[0][0]                
                                                                 aux_input[0][0]              

In [14]:
# Stop once the main output's validation accuracy has not improved for 20 epochs.
early_stop = keras.callbacks.EarlyStopping(monitor='val_main_output_accuracy', patience=20)

# Inputs follow the model's input order: [string text for the hub layer, TF/IDF matrix].
train_inputs = [train_em_features.to_numpy(), train_tf_features.to_numpy()]
# Both outputs (main and embedding-only) are trained against the same one-hot labels.
train_targets = [train_labels.to_numpy(), train_labels.to_numpy()]

history = model.fit(
    train_inputs,
    train_targets,
    epochs=1000,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1,
)

Train on 3912 samples, validate on 435 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
