In [1]:
# Data Processing
import pandas as pd
import re
import sys
# BERT
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from scipy.spatial.distance import cosine

In [2]:
def initialize_tokenizer(pretrained_model):
    """Build the BERT tokenizer (vocabulary) for a pre-trained model.

    Args:
        pretrained_model: Identifier handed straight to
            ``BertTokenizer.from_pretrained``.

    Returns:
        Whatever ``BertTokenizer.from_pretrained`` returns for that identifier.
    """
    return BertTokenizer.from_pretrained(pretrained_model)

def get_index(tokenized_answer, tokenized_context):
    """Locate the first occurrence of a token sequence inside another one.

    Args:
        tokenized_answer: List of tokens to search for.
        tokenized_context: List of tokens to search in.

    Returns:
        ``[first_index, last_index]`` (both inclusive) of the first match in
        ``tokenized_context``, or ``[]`` when there is no match or when
        ``tokenized_answer`` is empty.
    """
    span = len(tokenized_answer)
    if span == 0:
        # An empty needle has no first token to anchor on; previously this
        # raised IndexError as soon as the context was non-empty.
        return []

    first_token = tokenized_answer[0]
    # Stop early: a match cannot start where fewer than `span` tokens remain.
    for start in range(len(tokenized_context) - span + 1):
        if (tokenized_context[start] == first_token
                and tokenized_context[start:start + span] == tokenized_answer):
            return [start, start + span - 1]
    return []

In [3]:
# Name of the pre-trained checkpoint shared by the tokenizer and the model.
pretrained_model = "bert-base-uncased"
tokenizer = initialize_tokenizer(pretrained_model)

# Switch to evaluation mode right after loading; eval() hands back the model
# itself, so the assignment keeps the same object.
model = BertModel.from_pretrained(pretrained_model).eval()

# Last expression of the cell: display the model architecture.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [7]:
def get_contextual_vector(model, tokenizer, phrase, context):
    """Return the contextual embedding of ``phrase`` as it occurs in ``context``.

    The context is wrapped in ``[CLS]`` / ``[SEP]``, tokenized and fed through
    ``model``. The last encoder layer's vectors for the first occurrence of
    the tokenized phrase are averaged into one vector.

    Args:
        model: Callable taking ``(token_ids, segment_ids)`` tensors and
            returning ``(encoded_layers, pooled_output)``, where
            ``encoded_layers`` is indexable by layer and then by batch item.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        phrase: Text whose contextual vector is wanted.
        context: Text in which ``phrase`` is expected to occur.

    Returns:
        The mean (over the phrase's tokens) of the last-layer token vectors,
        or ``None`` when the phrase tokenizes to nothing or is not found in
        the (possibly truncated) context. A message is printed in that case.
    """
    # The model's position-embedding table has 512 entries, so longer inputs
    # cannot be encoded.
    max_tokens = 512

    tokenized_answer = tokenizer.tokenize(phrase)
    if not tokenized_answer:
        # Nothing to locate; avoids searching for an empty token sequence.
        print ("check if question title is present in the question context")
        return None

    tokenized_context = tokenizer.tokenize("[CLS] " + context + " [SEP]")
    if len(tokenized_context) > max_tokens:
        # Truncate the body but keep the closing token; a plain [:512] slice
        # silently dropped the trailing [SEP].
        tokenized_context = tokenized_context[:max_tokens - 1] + tokenized_context[-1:]

    # Locate the phrase before running the model so a miss costs no forward pass.
    indexes = get_index(tokenized_answer, tokenized_context)
    if len(indexes) == 0:
        print ("check if question title is present in the question context")
        return None
    first_index, last_index = indexes[0], indexes[1]

    token_ids = tokenizer.convert_tokens_to_ids(tokenized_context)
    # NOTE(review): single-sentence BERT inputs conventionally use segment id 0;
    # 1 is kept here so previously recorded similarity scores stay comparable.
    # Confirm whether this should change.
    segment_ids = [1] * len(tokenized_context)
    tokens_tensor = torch.tensor([token_ids])
    segments_tensor = torch.tensor([segment_ids])

    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensor)

    # Last layer, first (only) batch item. [-1] equals the former [11] for a
    # 12-layer model and stays correct for deeper ones.
    token_vecs = encoded_layers[-1][0]

    return torch.mean(token_vecs[first_index:last_index + 1], dim=0)

In [137]:
# Two experience descriptions and one job description; each mentions "AWS"
# in a different context.
exp1 = "Machine learning: Customer segmentation using PCA and HDBSCAN (clustering). Ensemble machine learning model to predict churn likelihood. Recommendation system using collaborative filtering (ALS) and Bayesian ranking. Architecture leadership: Designed feature store for Machine Learning operationalization framework using S3, Athena, and Redshift (AWS).Designed experimentation platform to improve the speed, prevalence, and rigor of A/B testing."

exp2 = "Webb app development using AWS resources, particularly Elastic Beanstalk. Employed a Micro-Service architecture to ensure leaner development lifecycle for application running services. Architected a Continuous Integration and Continuous Delivery (CI/CD) Pipeline on AWS that is activated for every commit to project’s Source Control Manager (SCM) to automate customized CloudFormation template aimed at expediting the software delivery process."

# NOTE(review): `job` was previously the empty string, for which
# get_contextual_vector cannot find "aws" and returns None, so the
# job-vs-experience similarity cell could not run on a fresh kernel. The
# description below is restored from the commented-out line; confirm it is the
# text the recorded similarity was computed with.
job = "A successful candidate will be a person who enjoys diving deep into data, doing analysis, discovering root causes, and designing long-term solutions. Person who enjoys using cloud platforms like AWS (Athena and EC2), GCP and/or Azure. Undersdtand machine learning models like linear regression, losgistic regression and ensemble models like Random Forest"

# Alternative inputs kept for experimentation:
#exp2 = "We use, 1. a huge amount of Python 2. AWS 3. Jenkins 4. Data science libraries like Pandas, Numpy, Numba, Dask, etc. Responsible for, 1. Dealing with code complexity and efficient algorithm design 2. Writing and migrating credit valuation models from SASNPV into Python 3. Database automation (Redshift, Snowflake) 4. Handling AWS and developing applications that run in the cloud 5. Ensuring that our code is compliant and adheres to the company’s code quality standards"
#exp1 = "Used Pytorch for research"
#exp2 = "Built deep learning pipeline usig Pytorch"
#job = "Built Machine learning and Deep Learning Models using AWS"

# Lower-case every input the same way so the lower-case phrase "aws" is
# matched consistently across all three texts.
job_vec = get_contextual_vector(model, tokenizer, "aws", job.lower())
vec1 = get_contextual_vector(model, tokenizer, "aws", exp1.lower())
vec2 = get_contextual_vector(model, tokenizer, "aws", exp2.lower())

# Fail here with a clear message rather than later inside the cosine call.
assert all(v is not None for v in (job_vec, vec1, vec2)), \
    'the phrase "aws" was not found in one of the input texts'

In [138]:
# Cosine similarity (1 - cosine distance) between the "aws" vectors of the two
# experience texts, rounded to three decimals.
round(1 - cosine(vec1, vec2), ndigits=3)

0.663

In [139]:
# Cosine similarity (1 - cosine distance) between the "aws" vector of the
# second experience text and that of the job description, rounded to three
# decimals.
round(1 - cosine(vec2, job_vec), ndigits=3)

0.653