In [46]:
import time
import json
from typing import List
import os
from collections import defaultdict
import pandas as pd
from pathlib import Path
import requests
from dotenv import load_dotenv, find_dotenv
import markdown
from bs4 import BeautifulSoup
import random
import spacy
from spacy.util import minibatch, compounding

In [47]:
from src.data.cosmos import GremlinQueryManager, DocumentQueryManager
from src.data.graph.gremlin import GremlinQueryBuilder
from src.data.cloud._constants import Cloud

In [48]:
# Pull the Cosmos DB settings from the nearest .env file into the environment.
load_dotenv(find_dotenv())

account_name = os.getenv('COSMOS_ACCOUNT_NAME')
db_name = os.getenv('COSMOS_DB_NAME')
graph_name = os.getenv('COSMOS_GRAPH_NAME')
master_key = os.getenv('COSMOS_MASTER_KEY')

# The value read from COSMOS_GRAPH_NAME is overridden: this notebook always
# queries the graph named 'main'.
graph_name = 'main'

# One client for Gremlin (graph) queries, one for document queries.
gremlin_qm = GremlinQueryManager(account_name, master_key, db_name, graph_name)
doc_qm = DocumentQueryManager(account_name, master_key, db_name)

In [49]:
gremlin_qm.query('g.V().has("label", "cloud").values("name")')

['Alibaba',
 'Amazon Web Services',
 'Microsoft Azure',
 'DigitalOcean',
 'Google Cloud',
 'IBM Cloud',
 'Oracle Cloud']

In [50]:
# all_cloud_data: every service record from every cloud, in query order.
# services_names: cloud abbreviation -> list of that cloud's service names.
all_cloud_data = []
services_names = {}

cloud_abbreviations = gremlin_qm.query('g.V().has("label", "cloud").values("abbreviation")')
for abbr in cloud_abbreviations:
    # Service vertices are labelled "<abbr>_service"; project each one together
    # with its categories and related services (empty list when it has none).
    service_query = f"""g.V().has("label", "{abbr}_service")
            .project("id", "name", "shortDescription", "longDescription", "uri", "iconUri", "categories", "relatedServices")
            .by("id").by("name").by("short_description").by("long_description").by("uri").by("icon_uri")
            .by(out("belongs_to").project("id", "name").by(values("id")).by(values("name")).fold())
            .by(coalesce(out("related_service").project("id", "name").by(values("id")).by(values("name")).fold(), __.not(identity()).fold()))"""
    cloud_services = gremlin_qm.query(service_query)

    all_cloud_data.extend(cloud_services)
    services_names[abbr] = [service['name'] for service in cloud_services]

services_names['azure'][:5]

['Azure Bot Service',
 'Azure Databricks',
 'Azure Search',
 'Bing Autosuggest',
 'Bing Custom Search']

In [51]:
# Spot-check one record to see the shape of the projected service data.
for service in all_cloud_data:
    if service['name'] != 'Azure Functions':
        continue
    print(service)

{'id': '4ed0abb5-43af-4d8f-927d-2920f2a44d54', 'name': 'Azure Functions', 'shortDescription': 'Process events with serverless code', 'longDescription': 'Improve end-to-end development experience using a serverless compute platform with automated, flexible scaling. Choose programming languages and hosting options that best fit your business.', 'uri': 'https://azure.microsoft.com/en-us/services/functions/', 'iconUri': '', 'categories': [{'id': 'bd05cb77-2b71-4d9f-a97b-c0e6e1343155', 'name': 'Compute'}, {'id': '946a525d-5ee9-40b9-9aaf-9789527a7587', 'name': 'Containers'}, {'id': '3d0e3bbb-5ef0-42db-bbac-a43f8472dc5d', 'name': 'Internet of Things'}], 'relatedServices': [{'id': 'eed8e9c5-2b9d-41ee-a9d1-2e20b53bc058', 'name': 'AWS Lambda'}, {'id': '1644b196-5c0b-4518-81e1-c3e77ca66332', 'name': 'Google Cloud Functions'}]}


## Fetch Training Data from Bing using Apify crawler

In [7]:
def apify_training_data(terms: List[str], suffix: str = '', token: str = None):
    """
    Start one Apify actor run per search term and collect the URLs under
    which each run's OUTPUT record can later be fetched.

    Parameters
    ----------
    terms : List[str]
        Search terms; each one is submitted as the query ``'<term> <suffix>'``.
    suffix : str
        Text appended (after a space) to every term.
    token : str, optional
        Apify API token. Defaults to the ``APIFY_TOKEN`` environment variable
        so that the secret never has to live in the notebook.

    Returns
    -------
    tuple
        ``(output_urls, errors)`` where ``output_urls`` is a list of
        ``{'name': term, 'output_url': url}`` dicts for the runs that were
        started and ``errors`` is the list of terms whose run could not be
        started.

    Raises
    ------
    RuntimeError
        If no token is passed and ``APIFY_TOKEN`` is not set.
    """
    token = token or os.environ.get('APIFY_TOKEN')
    if not token:
        raise RuntimeError(
            'No Apify token: pass token=... or set the APIFY_TOKEN environment variable'
        )

    runs_url = 'https://api.apify.com/v2/acts/G4B5HdLYCXDyiafhy/runs'
    output_urls = []
    errors = []
    for i, s in enumerate(terms):
        # Pause after every 30 submissions and report progress.
        if i > 1 and i % 30 == 0:
            print(f'{i}/{len(terms)} terms searched')
            time.sleep(10)
        try:
            res = requests.post(
                runs_url,
                params={'token': token},
                json={
                    'q': f'{s} {suffix}'
                },
                timeout=60,
            )
            res.raise_for_status()
            kv_store = res.json()['data']['defaultKeyValueStoreId']
            output_url = f"https://api.apify.com/v2/key-value-stores/{kv_store}/records/OUTPUT?disableRedirect=true"
            output_urls.append({'name': s, 'output_url': output_url})

        # Best effort: record the failed term and keep going. `Exception`
        # (not a bare except) so Ctrl-C still interrupts a long crawl.
        except Exception as exc:
            print(f'Error crawling term: {s} ({exc!r})')
            errors.append(s)
    return output_urls, errors

In [None]:
# cloud abbreviation -> (output_urls, errors) as returned by apify_training_data
cloud_output_urls = dict()
for cloud_abbr in services_names:
    print(f'Getting NER training data for: {cloud_abbr}')
    cloud_output_urls[cloud_abbr] = apify_training_data(services_names[cloud_abbr])

In [19]:
apify_training_data(['Amazon MQ'])

([{'name': 'Amazon MQ',
   'output_url': 'https://api.apify.com/v2/key-value-stores/dL3JdN4SPZJtjJcha/records/OUTPUT?disableRedirect=true'}],
 [])

In [15]:

# Retry the terms that failed on the previous pass. The stored error list is
# replaced by the terms that failed again; carrying the old list forward would
# keep recovered terms marked as failed and re-crawl them on every re-run.
for abbr, (previous_urls, previous_errors) in list(cloud_output_urls.items()):
    retried_urls, remaining_errors = apify_training_data(previous_errors)
    cloud_output_urls[abbr] = (retried_urls + previous_urls, remaining_errors)

[]
[]
['Amazon MQ', 'Amazon AppSync', 'AWS Cost Explorer', 'Reserved Instance Reporting', 'AWS Cost and Usage Report', 'Amazon EC2 Auto Scaling', 'AWS DeepLens', 'AWS Ground Station']
[]
['Azure SQL Database', 'Azure Cosmos DB', 'Azure IoT Edge', 'Azure Spatial Anchors', 'Azure Visual Studio App Center', 'Azure DNS', 'Azure Virtual Network', 'Azure Load Balancer', 'Bing Autosuggest', 'Bing Custom Search', 'Bing Entity Search', 'Bing Image Search', 'Azure Video Indexer', 'Azure Kinect DK', 'Azure Stream Analytics', 'Azure SQL Data Warehouse', 'Azure HDInsight']
[]
['Kubernetes', 'Spaces Object Storage', 'Volumes Block Storage', 'Cloud Firewalls', 'Load Balancers', 'Floating IPs', 'DNS', 'DigitalOcean API', 'DigitalOcean CLI', 'Monitoring']
[]
['Compute Engine', 'Cloud Services Platform', 'Apigee Sense', 'Apigee healthcare APIx', 'Apigee Open Banking APIx', 'Cloud Healthcare API', 'Google Data Studio*', 'Cloud SQL', 'Cloud Bigtable', 'Cloud Spanner', 'Profiler (beta)', 'Transparent Servi

In [None]:
cloud_output_urls

In [18]:
import json
# Persist the crawl bookkeeping so the output URLs survive a kernel restart.
# The (output_urls, errors) tuples are serialised as JSON arrays.
with open('./cloud_output_urls_training_data.json', 'w+') as urls_file:
    json.dump(cloud_output_urls, urls_file)

In [28]:
# cloud abbreviation -> list of crawl results, one per successfully started run
cloud_ner_examples = {}
for abbr, output in cloud_output_urls.items():
    print(abbr)
    crawl_results = []
    # output[0] is the list of {'name', 'output_url'} entries for this cloud.
    for entry in output[0]:
        crawl_results.append(requests.get(entry['output_url']).json())
    cloud_ner_examples[abbr] = crawl_results

alibaba
aws
azure
digitalocean
gcp
ibm
oracle


In [None]:
cloud_ner_examples['oracle'][0]

In [35]:
# Write one JSONL file per cloud: one line per crawled paragraph, tagged with
# the URL of the page it came from.
for abbr, data in cloud_ner_examples.items():
    path = f'../data/processed/ner_training_examples/{abbr}'
    os.makedirs(path, exist_ok=True)
    with open(f'{path}/bing_search_training_examples.jsonl', 'w+') as examples_file:
        for svc in data:
            # Only results that carry both web pages and news are used.
            if not ('webPages' in svc and 'news' in svc):
                continue
            for page in svc['webPages'] + svc['news']:
                if 'paragraphs' not in page:
                    continue
                for paragraph in page['paragraphs']:
                    record = {
                        'text': paragraph,
                        'meta': {
                            'source_url': page['url']
                        }
                    }
                    examples_file.write(json.dumps(record) + '\n')

In [9]:
import json
import pandas as pd


# Preview the text of the first ten AWS training examples.
with open('../data/processed/ner_training_examples/aws/bing_search_training_examples.jsonl') as examples_file:
    first_lines = examples_file.readlines()[:10]

texts = [json.loads(raw_line)['text'] for raw_line in first_lines]

df = pd.DataFrame(texts, columns=['text'])
df

Unnamed: 0,text
0,Amazon MQ is a managed message broker service ...
1,With Amazon MQ you can use the AWS Management ...
2,Amazon MQ makes it easy to migrate messaging t...
3,Amazon MQ provides high availability and messa...
4,"Amazon MQ offers low latency messaging, often ..."
5,SkipTheDishes lowered maintenance time and imp...
6,Malmberg improved messaging stability and redu...
7,"Dealer.com migrated messaging to Amazon MQ, ta..."
8,Bench Accounting improved broker resilience wi...
9,Implementing enterprise integration patterns w...


In [10]:
# Tab-separate the export so the content matches the .tsv extension; to_csv
# writes comma-separated values unless `sep` is given.
df.to_csv('../data/processed/ner_training_examples/aws/bing_search_training_examples.tsv', sep='\t', index=False)

---

## Create Pattern files for services

In [90]:
# Entity label assigned to the services of each cloud, keyed by the cloud's
# abbreviation.
labels = dict(
    alibaba='ALIBABA_CLOUD_SERVICE',
    aws='AWS_SERVICE',
    azure='AZURE_SERVICE',
    digitalocean='DIGITAL_OCEAN_SERVICE',
    gcp='GOOGLE_CLOUD_SERVICE',
    ibm='IBM_CLOUD_SERVICE',
    oracle='ORACLE_CLOUD_SERVICE',
)

In [91]:
print(services_names.keys())

dict_keys(['alibaba', 'aws', 'azure', 'digitalocean', 'gcp', 'ibm', 'oracle'])


In [92]:
# Build the EntityRuler patterns for every cloud and write them to disk:
#   - one patterns.jsonl per cloud
#   - one base_patterns.jsonl holding the patterns of all clouds
all_patterns = []

os.makedirs('../data/processed/ner_training_patterns', exist_ok=True)
for abbr, services in services_names.items():
    print(abbr)
    path = f'../data/processed/ner_training_patterns/{abbr}/patterns.jsonl'
    os.makedirs(os.path.dirname(path), exist_ok=True)

    label = labels[abbr]

    # Generic patterns: the cloud abbreviation followed by one, two or three
    # all-uppercase tokens.
    cloud_patterns = [
        {
            'label': label,
            'pattern': [{'LOWER': abbr}] + [{'IS_UPPER': True} for _ in range(n_upper)],
        }
        for n_upper in (1, 2, 3)
    ]

    # One pattern per known service: multi-word names become case-insensitive
    # token patterns, single-word names are kept as exact phrase patterns.
    for s in services:
        split_ = s.split()
        if len(split_) > 1:
            p = [{'LOWER': w.lower()} for w in split_]
        else:
            p = s
        cloud_patterns.append({'label': label, 'pattern': p, 'id': s})

    all_patterns += cloud_patterns

    with open(path, 'w') as abbr_patterns_file:
        for cp in cloud_patterns:
            abbr_patterns_file.write(json.dumps(cp) + '\n')

# The combined file is written from `all_patterns`, the list built above; the
# name `patterns` that was used here is never defined in this notebook.
with open('../data/processed/ner_training_patterns/base_patterns.jsonl', 'w') as base_patterns_file:
    for pattern in all_patterns:
        base_patterns_file.write(json.dumps(pattern) + '\n')

alibaba
aws
azure
digitalocean
gcp
ibm
oracle


In [93]:
cloud_patterns[:4]

[{'label': 'ORACLE_CLOUD_SERVICE',
  'pattern': [{'LOWER': 'oracle'}, {'IS_UPPER': True}]},
 {'label': 'ORACLE_CLOUD_SERVICE',
  'pattern': [{'LOWER': 'oracle'}, {'IS_UPPER': True}, {'IS_UPPER': True}]},
 {'label': 'ORACLE_CLOUD_SERVICE',
  'pattern': [{'LOWER': 'oracle'},
   {'IS_UPPER': True},
   {'IS_UPPER': True},
   {'IS_UPPER': True}]},
 {'label': 'ORACLE_CLOUD_SERVICE',
  'pattern': [{'LOWER': 'oracle'},
   {'LOWER': 'bare'},
   {'LOWER': 'metal'},
   {'LOWER': 'compute'}],
  'id': 'Oracle Bare Metal Compute'}]

In [55]:
import spacy

In [56]:
import spacy
from spacy.pipeline import EntityRuler

In [94]:
nlp = spacy.load('en_ner_cloud_lg')

# Build a fresh ruler from the patterns generated above (`all_patterns`; the
# name `patterns` is never defined in this notebook) and run it right after
# the statistical NER component.
ruler = EntityRuler(nlp)
ruler.add_patterns(all_patterns)
nlp.add_pipe(ruler, name='updated_ruler', after='ner')

LOADING FROM DISK ENTITY RULER:  /mnt/c/users/kakh/Documents/cloud_compete_graph/.venv/lib/python3.6/site-packages/en_ner_cloud_lg/en_ner_cloud_lg-0.2.0/entity_ruler
<generator object read_jsonl at 0x7f5f7f10b728>


In [96]:
# Drop the pipe registered under the name 'entity_ruler'; the displayed value
# is the removed (name, component) pair.
nlp.remove_pipe('entity_ruler')

('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x7f5fce10c518>)

In [68]:
# Alternative to building a new ruler: add the patterns to the pipe that is
# already registered as 'entity_ruler'.
# NOTE(review): read top to bottom, 'entity_ruler' is removed by the preceding
# cell and only re-created by the rename in a later cell, and `patterns` is not
# defined anywhere in the visible notebook. This cell presumably relied on
# out-of-order execution; confirm whether it is still needed.
ruler = nlp.get_pipe('entity_ruler')
ruler.add_patterns(patterns)

In [97]:
nlp.pipe_names

['sentencizer', 'tagger', 'parser', 'ner', 'updated_ruler']

In [100]:
# Give the new ruler the pipe name 'entity_ruler' (the name of the pipe that
# was removed earlier).
nlp.rename_pipe('updated_ruler', 'entity_ruler')

In [101]:
# Serialise the pipeline, including the renamed ruler, to the models directory.
nlp.to_disk('../models/en_ner_cloud_lg')

In [63]:
doc = nlp(u"You can create serverless functions on many different cloud platforms. The primary services are AWS Lambda, Azure Functions, Google Cloud Functions and IBM Cloud Functions.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('functions', 'AZURE_SERVICE'), ('AWS Lambda', 'AWS_SERVICE'), ('Azure Functions', 'AZURE_SERVICE'), ('Google Cloud Functions', 'GOOGLE_CLOUD_SERVICE'), ('IBM Cloud Functions', 'IBM_CLOUD_SERVICE')]


In [65]:
doc = nlp('Azure personalizer uses reinforcement learning to make intelligent recommendations')
print([(ent.text, ent.label_) for ent in doc.ents])

[('Azure personalizer', 'AZURE_SERVICE')]


In [86]:
len(nlp.get_pipe('entity_ruler').patterns)

1604

In [87]:
len(patterns)

834

In [70]:
nlp.get_pipe('entity_ruler').patterns

TypeError: unhashable type: 'dict'

In [26]:
nlp.to_disk('../models/prodigy_1.8-ner_cloud_lower_core_lg')

In [100]:
list(doc.sents)

[Create cloud functions with Google Serverless]

In [101]:
k = 3
# Print only the entities whose sentence mentions the first part of the entity
# label (e.g. 'google' for GOOGLE_CLOUD_SERVICE), compared case-insensitively.
for ent in doc.ents:
    cloud_prefix = ent.label_.split('_')[0].lower()
    sentence_text = ent.sent.text.lower()
    if cloud_prefix in sentence_text:
        print(ent.text, ent.label_)

cloud functions GOOGLE_CLOUD_SERVICE
Serverless GOOGLE_CLOUD_SERVICE


In [None]:
import os
from typing import List
from collections import defaultdict
import spacy
from spacy.tokens import Span

from src.app.exceptions import DocumentParseError
from src.app.labels import LABELS


class CloudServiceExtractor:
    """
    Finds cloud-service entities in text with a spaCy NER model and resolves
    each one against a search index through `search_client`.

    `search_client` must provide awaitable `suggest(text, filter_=...)` and
    `search(text, filter_=...)` methods whose results expose
    `raise_for_status()` and `json()`.
    """

    def __init__(self, search_client, model='en_ner_cloud_lg'):
        """
        Load the spaCy model `model` and append the `merge_entities` pipe.
        """
        self.search_client = search_client

        print("Loading NER model...", end="")
        self.nlp = spacy.load(model)
        # merge_entities turns every entity span into a single token, which is
        # what lets extract() walk the doc token by token.
        self.nlp.add_pipe(self.nlp.create_pipe('merge_entities'))
        print("Done")

    async def resolve_service_name(self, name, ent_label, threshold=0.8):
        """
        Resolve the name of the service from the
        NER model to the search index

        The lookup is restricted to the cloud that `LABELS` maps `ent_label`
        to. The top suggestion for `name` (or `name` itself when there is no
        suggestion) is searched for, and the first search hit is returned.

        `threshold` is accepted but not used by the current implementation.

        Returns the top search hit, or None when the service could not be
        resolved (HTTP error, malformed response, or no hits).
        """
        filter_ = f"cloud eq '{LABELS[ent_label]}'"
        res = await self.search_client.suggest(name, filter_=filter_)

        try:
            res.raise_for_status()
            suggestions = res.json()["value"]  # parse the body only once
            suggestion = suggestions[0] if suggestions else name

            if isinstance(suggestion, str):
                search_res = await self.search_client.search(name, filter_=filter_)
            else:
                search_res = await self.search_client.search(
                    suggestion["@search.text"], filter_=filter_
                )
            search_res.raise_for_status()
            top_res = search_res.json()["value"][0]
            return top_res
        # Best effort: any failure means "unresolved". `Exception` rather than
        # a bare except, so KeyboardInterrupt / SystemExit still propagate.
        except Exception:
            print(f"Could not resolve: {name}")
            return None

    async def extract(self, text, ent_labels=list(LABELS.keys())):
        """
        Extract Named Entity Cloud services and relationships in text

        Returns a list of `(ent_span, service, relation, root_verb)` tuples,
        one per entity token whose type is in `ent_labels` and that could be
        resolved:
          - ent_span: one-token Span over the entity, labelled with its type
          - service: the search hit from resolve_service_name
          - relation: the related token found through the dependency parse,
            or None
          - root_verb: first VERB reached walking up from `relation`, or None

        Raises DocumentParseError when the spaCy pipeline raises ValueError.
        """
        try:
            doc = self.nlp(text)
        except ValueError as err:
            # Keep the original error attached for debugging.
            raise DocumentParseError from err

        res = []
        for ent in filter(lambda w: w.ent_type_ in ent_labels, doc):
            service = await self.resolve_service_name(ent.text, ent.ent_type_)
            if service:
                relation = None
                root_verb = None

                # Entity is an attribute / direct object: relate it to the
                # nominal subject of its head.
                if ent.dep_ in ("attr", "dobj"):
                    subject = [w for w in ent.head.lefts if w.dep_ == "nsubj"]
                    if subject:
                        subject = subject[0]
                        relation = subject

                # Entity is the object of a preposition: relate it to what the
                # preposition attaches to, then climb to the nearest verb.
                elif ent.dep_ == "pobj" and ent.head.dep_ == "prep":
                    relation = ent.head.head
                    cur = relation
                    while cur.head:
                        if cur.pos_ == "VERB":
                            root_verb = cur
                            break
                        # A token that is its own head ends the climb.
                        if cur.head == cur:
                            break
                        cur = cur.head

                ent_span = Span(doc, ent.i, ent.i + 1, label=ent.ent_type)
                res.append((ent_span, service, relation, root_verb))

        return res


## Test public API

In [43]:
import requests
# The query key is read from the environment (e.g. from .env via load_dotenv)
# instead of being committed with the notebook.
res = requests.get(
    'https://microsoft-careers.search.windows.net/indexes/microsoft-careers-3/docs?api-version=2017-11-11&search=*',
    headers={'api-key': os.environ['AZURE_SEARCH_API_KEY']},
)
# Fail here on an HTTP error rather than later on a missing 'value' key.
res.raise_for_status()
data = res.json()

In [45]:
# Wrap each job description (every result except the first) in a
# {'recordId', 'data': {'text'}} record for the request below.
vals = [
    {
        'recordId': f'a{idx}',
        'data': {
            'text': job['description']
        }
    }
    for idx, job in enumerate(data['value'][1:])
]
vals[0:2]

[{'recordId': 'a0',
  'data': {'text': "Are you looking for an opportunity to make an impact delivering a new Dynamics 365 AI product to customers? Do you enjoy partnering and collaborating with multiple disciplines and enabling technology teams to deliver compliant solutions on time, with quality to customers? Do you enjoy planning and managing technology projects, leveraging data to make project level decisions? Microsoft Dynamics AI Center of Excellence group is searching for a Senior Program Manager who is looking for an opportunity to project manage complex and high value projects/releases across the product suite and team.In this role, you will have the opportunity to lead and coordinate the Planning, Execution and Release activities for a new product. You’ll work closely with our engineering, compliance, marketing and field teams to define project plans and ensure milestones are completed on time, with quality. This role involves leveraging data to make informed decisions about 

In [8]:
cloud_res = requests.post('https://cloudcompetegraph.azure-api.net/ner/v1/azure_cognitive_search', json={'values': vals[:10]})
cloud_res.text

