This script will handle 4 in the ticket KE144.

INSTRCUTION in KE144.

- We should set up continuous QA for ontology stored in the SQL db that has the following steps:
1. Checking for null skills and definitions in each locale (relative to the main skills table).

2. Running language detection and monitoring for mismatches between locales and ML predictions.

3. Identifying a character set for each locale and checking that no superfluous characters show up in each locale.

4. Vectorizing titles and definitions and checking that similarity thresholds do not fall below a certain threshold.

5. Checking for trailing and leading whitespace.

# Database Connection

In [1]:
import sys
import importlib
sys.path.insert(0, ".")

import sql_functions as sf
from sql_functions import *
importlib.reload(sf)

<module 'sql_functions' from 'C:\\Users\\KeikoGolden\\ComputationalLiguistics\\Annotaion\\bn_nb_s\\PythonTocheckModel\\PullRequests\\.\\sql_functions.py'>

In [2]:
import yaml
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from psycopg2.extras import execute_values

with open("config.yml") as f:
    config = yaml.safe_load(f)

In [3]:
conn = psycopg2.connect(
        database=config['db']['name'], 
        user=config['db']['user'], 
        host=config['db']['host'], 
        password=config['db']['password']
    )

cur = conn.cursor()

# 1. Create a file

In [4]:
import pandas as pd

command = """select skill, skill_definition from skills_en_us"""
df = pd.read_sql(command, conn)
df



Unnamed: 0,skill,skill_definition
0,.NET Assemblies,Defined by Microsoft for use in recent version...
1,.NET Reflector,".NET Reflector is a class browser, decompiler ..."
2,.NET Remoting,.NET Remoting is a Microsoft application progr...
3,2020 Design,2020 Design is a kitchen and bathroom design s...
4,2D Gel Analysis Software,"In quantitative proteomics, these tools primar..."
...,...,...
12638,Quarry Drilling,Drilling and blasting is the controlled use of...
12639,Radio Direction,A radio direction finder is a device for findi...
12640,Abrasive Blasting,"Abrasive blasting, more commonly known as sand..."
12641,Heavy-metal Machine Operation,A heavy equipment operator operates heavy equi...


In [5]:
df = df.dropna()

In [6]:
len(df)

12517

# 2. Using SkyHive vectorizer

In [7]:
import spacy

nlp = spacy.load('en_core_web_lg')
docs = list(nlp.pipe(df['skill'][:350].tolist())) #change list_of_all_skills to the list you have

tokenized_sents = []
for doc in docs: #doc is a skill
    tokens = [i.text for i in doc]
    tokenized_sents.append(tokens)

In [8]:
tokenized_sents

[['.NET', 'Assemblies'],
 ['.NET', 'Reflector'],
 ['.NET', 'Remoting'],
 ['2020', 'Design'],
 ['2D', 'Gel', 'Analysis', 'Software'],
 ['Closed', '-', 'Loop', 'Medication'],
 ['35', 'Mm', 'Films'],
 ['3D', 'Camcorder'],
 ['3D', 'Programming'],
 ['3D', 'Recognition', 'Systems'],
 ['3D', 'Reconstruction'],
 ['Global', 'Command', 'And', 'Control', 'Systems'],
 ['3D', 'Graphics'],
 ['3D', 'Modelling'],
 ['.NET', 'Core'],
 ['.NET', 'Framework'],
 ['3D'],
 ['360', 'Degree', 'Thinking'],
 ['3D', 'Design'],
 ['3D', 'Printing'],
 ['2D', 'Animation'],
 ['2D', 'Motion', 'Graphic'],
 ['3D', 'Motion', 'Graphic'],
 ['2D', 'Printing'],
 ['3D', 'Analyst', 'Extension'],
 ['Twelve', '-', 'Factor', 'App'],
 ['Category', '2', 'Cable'],
 ['Granim.js'],
 ['3D', 'scene', '-', 'graph', 'architecture'],
 ['3D', 'Seismic', 'Interpretation'],
 ['510(k', ')'],
 ['529', 'College', 'Savings', 'Planning'],
 ['Ab', 'Initio', 'Algorithm'],
 ['Cluster', 'Management'],
 ['Firebird', 'Database'],
 ['A', '/', 'B', 'testing

##### Option 1: Vectorize small batches

In [None]:
import boto3
import json
from typing import *

region = "ca-central-1"
vectorizer_name = "vectorizer-cpu-2"
sagemaker_runtime = boto3.client("sagemaker-runtime", region)

def vectorize_remote(
        sents: List[List[str]],
        offsets: List[List[List[int]]]) -> List[List[List[float]]]:
    data = {"sents": sents, "offsets": offsets}
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=vectorizer_name,
        Body=bytes(json.dumps(data), encoding="utf-8"),
        ContentType='application/json'

    )
    output = response['Body'].read().decode('utf-8')
    output = json.loads(json.loads(output)[0])
    return output

In [9]:
offsets = [[[0, len(sent)]] for sent in tokenized_sents]
offsets

[[[0, 2]],
 [[0, 2]],
 [[0, 2]],
 [[0, 2]],
 [[0, 4]],
 [[0, 4]],
 [[0, 3]],
 [[0, 2]],
 [[0, 2]],
 [[0, 3]],
 [[0, 2]],
 [[0, 5]],
 [[0, 2]],
 [[0, 2]],
 [[0, 2]],
 [[0, 2]],
 [[0, 1]],
 [[0, 3]],
 [[0, 2]],
 [[0, 2]],
 [[0, 2]],
 [[0, 3]],
 [[0, 3]],
 [[0, 2]],
 [[0, 3]],
 [[0, 4]],
 [[0, 3]],
 [[0, 1]],
 [[0, 5]],
 [[0, 3]],
 [[0, 2]],
 [[0, 4]],
 [[0, 3]],
 [[0, 2]],
 [[0, 2]],
 [[0, 4]],
 [[0, 1]],
 [[0, 2]],
 [[0, 2]],
 [[0, 3]],
 [[0, 2]],
 [[0, 4]],
 [[0, 2]],
 [[0, 2]],
 [[0, 2]],
 [[0, 3]],
 [[0, 2]],
 [[0, 2]],
 [[0, 2]],
 [[0, 2]],
 [[0, 2]],
 [[0, 6]],
 [[0, 10]],
 [[0, 3]],
 [[0, 1]],
 [[0, 2]],
 [[0, 2]],
 [[0, 4]],
 [[0, 1]],
 [[0, 2]],
 [[0, 2]],
 [[0, 4]],
 [[0, 2]],
 [[0, 1]],
 [[0, 2]],
 [[0, 2]],
 [[0, 4]],
 [[0, 3]],
 [[0, 2]],
 [[0, 2]],
 [[0, 3]],
 [[0, 3]],
 [[0, 1]],
 [[0, 3]],
 [[0, 3]],
 [[0, 2]],
 [[0, 4]],
 [[0, 9]],
 [[0, 2]],
 [[0, 2]],
 [[0, 4]],
 [[0, 3]],
 [[0, 4]],
 [[0, 4]],
 [[0, 2]],
 [[0, 4]],
 [[0, 3]],
 [[0, 5]],
 [[0, 5]],
 [[0, 5]],
 [[0, 2]]

In [None]:
len(offsets)

In [None]:
vectors = vectorize_remote(
    tokenized_sents,
    offsets
)

vectors

In [None]:
len(vectors)

##### Option2 Concurrent vectorization

In [10]:
offsets_df = pd.DataFrame({'offsets': offsets})

In [11]:
tokenized_sents_df = pd.DataFrame({'tokenized_sents': tokenized_sents})

In [12]:
import boto3
import json
from typing import *

region = "ca-central-1"
vectorizer_name = "vectorizer-cpu-2"
sagemaker_runtime = boto3.client("sagemaker-runtime", region)

In [13]:
import concurrent
import multiprocessing as mp

def concurrent_vectorization(
            sent_chunks: List[List[List[str]]],
            offset_chunks: List[List[List[List[int]]]]
    ) -> List[List[List[List[float]]]]:
        """Concurrency wrapper for self.vectorize"""
        results = []
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=mp.cpu_count() * 2) as executor:
            for result in executor.map(vectorize(sent_chunks, offset_chunks), sent_chunks, offset_chunks):
                results.extend(result)
        return results

In [14]:
def vectorize(sents: List[List[str]],
                  offsets: List[List[List[int]]]) -> List[List[List[float]]]:
        data = {"sents": sents, "offsets": offsets}
        response = sagemaker_runtime.invoke_endpoint(
            EndpointName=vectorizer_name,
            Body=bytes(json.dumps(data), encoding="utf-8"),
            ContentType='application/json'

        )
        output = response['Body'].read().decode('utf-8')
        output = json.loads(json.loads(output)[0])
        return output

In [None]:
# vectorization
        tokenized_sents = list(df.token)
        tokenized_batches = [tokenized_sents[i:i + self.chunk_size] for i
                             in range(0, len(tokenized_sents), self.chunk_size)]
        offsets = list(df.combined_spans_shifted)
        offset_batches = [offsets[i:i + self.chunk_size] for i in
                          range(0, len(offsets), self.chunk_size)]
        vectors = self.concurrent_vectorization(tokenized_batches,
                                                offset_batches)
        df['vector'] = vectors
        vector_batches = [vectors[i:i + self.chunk_size] for i in range(0,
                          len(vectors), self.chunk_size)]

In [16]:
tokenized_sents = list(tokenized_sents_df.tokenized_sents)
tokenized_batches = [tokenized_sents[i:i + 50] for i in range(0, len(tokenized_sents), 50)]
offsets = list(offsets_df.offsets)
offset_batches = [offsets[i:i + 50] for i in range(0, len(offsets), 50)]
vectors = concurrent_vectorization(tokenized_batches, offset_batches)
df['vector'] = vectors
vector_batches = [vectors[i:i + 50] for i in range(0, len(vectors), 50)]

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
  "code": 400,
  "type": "InternalServerException",
  "message": "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
}
". See https://ca-central-1.console.aws.amazon.com/cloudwatch/home?region=ca-central-1#logEventViewer:group=/aws/sagemaker/Endpoints/vectorizer-cpu-2 in account 596298976885 for more information.

In [None]:
offset_batches

# 3. Using Transformer vectorizer

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
sentence_embeddings_skill = model.encode(df.skill.tolist())

In [None]:
sentence_embeddings_skill.shape

In [None]:
#Before vectorizing the definition, remove stop words and punctuation from it, so that embedding is fast and that you can make
# a strong vectorising.
import spacy
nlp = spacy.load('en_core_web_lg')

def preprocess(text):
    doc = nlp(text)
    
    no_stop_words = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(no_stop_words)

In [None]:
# Above function call to create a list of preprocessed definitions.
preprocessed_definition = []
for definition in df.skill_definition.tolist():
    results = preprocess(definition)
    preprocessed_definition.append(results)

In [None]:
preprocessed_definition[:3]

In [None]:
sentence_embeddings_definition = model.encode(preprocessed_definition)

In [None]:
sentence_embeddings_definition.shape

In [None]:
sentence_embeddings_definition[:3]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

minor_title = []
normlised_title = []
sim_score = []

for i, job_title in enumerate(sentence_embeddings_job_title):
    for j, title in enumerate(sentence_embeddings_minor_job_title):
        score = cosine_similarity([job_title],[title])
        if score > 0.80:
            minor_title.append(df.final_title_post_processed.iloc[j])
            normlised_title.append(df4.Column1.iloc[i])
            sim_score.append(score)

In [None]:
vectorized_df = pd.DataFrame({'skill_vector': sentence_embeddings_skill.tolist(), 'def_vector':sentence_embeddings_definition.tolist()})
vectorized_df

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

sim_score = []
for i, row in vectorized_df.iterrows():
    score = cosine_similarity(np.array(row[0]).reshape(1,-1), np.array(row[1]).reshape(1,-1))
    sim_score.append(score[0][0])

In [None]:
sim_score[:5]

In [None]:
df['sim_score'] = sim_score
df

In [None]:
df.to_excel('title_vs_def_similarity.xlsx')