In [1]:
import os
import json
from collections import defaultdict
import pandas as pd
from pathlib import Path
import requests
from dotenv import load_dotenv, find_dotenv
import markdown
from bs4 import BeautifulSoup

In [2]:
from src.data.cosmos import GremlinQueryManager, DocumentQueryManager
from src.data.graph.gremlin import GremlinQueryBuilder

In [3]:
load_dotenv(find_dotenv())

account_name = os.environ.get('COSMOS_ACCOUNT_NAME')
db_name = os.environ.get('COSMOS_DB_NAME')
graph_name = os.environ.get('COSMOS_GRAPH_NAME')
master_key = os.environ.get('COSMOS_MASTER_KEY')

gremlin_qm = GremlinQueryManager(account_name, master_key, db_name, graph_name)
doc_qm = DocumentQueryManager(account_name, master_key, db_name)

In [4]:
azure_services = []

# abbrs = gremlin_qm.query('g.V().has("label", "cloud").values("abbreviation")')
# for abbr in abbrs:
abbr = 'azure'
q = f"""g.V().has("label", "{abbr}_service")
        .project("id", "name", "shortDescription", "longDescription", "uri", "iconUri", "categories", "relatedServices")
        .by("id").by("name").by("short_description").by("long_description").by("uri").by("icon_uri")
        .by(out("belongs_to").values("name").fold())
        .by(coalesce(out("related_service").values("name").fold(), __.not(identity()).fold()))"""
cloud_data = gremlin_qm.query(q)
azure_services += cloud_data
    
len(azure_services)

140

In [10]:
search_terms = [s['name'] for s in azure_services]
search_terms

['Azure Batch AI', 'Azure Bot Service', 'Azure Databricks', 'Azure Search']

In [8]:
patterns = []
for s in azure_services:
    pattern = []
    n = s['name']
    if '(' in n:
        i = n.index('(')
        abbr = n[i+1:n.index(')')]
        patterns.append({"label": "AZURE_SERVICE", 'pattern': [{'LOWER': abbr.lower()}]})
        n = n[:i]
    for word in n.split():
        pattern.append({'LOWER': word.lower()})
    
    patterns.append({'label': 'AZURE_SERVICE', 'pattern': pattern})
    if pattern[0]['LOWER'] == 'azure':
        patterns.append({'label': 'AZURE_SERVICE', 'pattern': pattern[1:]})
    
extra_patterns = [
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "azure"}, {"IS_UPPER": True}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "azure"}, {"IS_UPPER": True}, {"IS_UPPER": True}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "azure"}, {"IS_UPPER": True}, {"IS_UPPER": True}, {"IS_UPPER": True}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "azure"}, {"IS_UPPER": True}, {"IS_UPPER": True}, {"IS_UPPER": True}, {"IS_UPPER": True}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "app"}, {"LOWER": "service"}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "cognitive"}, {"LEMMA": "service"}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "azure"}, {"LEMMA": "database"}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "bing"}, {"IS_ALPHA": True}, {"LOWER": "search"}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "face"}, {"IS_UPPER": True}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "emotion"}, {"IS_UPPER": True}]},
    {"label": "AZURE_SERVICE", "pattern": [{"LOWER": "aks"}]},
]

with open('../data/processed/azure_service_patterns_v2.jsonl', 'w+') as azure_service_patterns_file:
    for p in patterns + extra_patterns:
        azure_service_patterns_file.write(json.dumps(p) + '\n')

In [None]:
p = Path('/mnt/c/users/kakh/Documents/Innovation_Sprints/cookiecutters/skill_extractor/data')
file_paths = list(p.glob('azure-docs/**/*index.md')) + list(p.glob('azure-docs/**/*overview.md'))
file_paths = sorted(file_paths)


In [66]:
def get_examples(file_paths, n_per_service=30):
    def iter_examples(file_paths):
        for path in file_paths:

            with open(path) as md_file:
                html = markdown.markdown(md_file.read())
                soup = BeautifulSoup(html, 'html.parser')
                meta = {
                    'source': 'azure-docs',
                    'file': str(path)[str(path).index('azure-docs'):]
                }
                for p in soup.find_all('p') + soup.find_all('h1') + soup.find_all('h2'):
                    if 'ms.' not in p.text and '|' not in p.text:
                        yield {'text': ' '.join(p.text.split()).strip(), 'meta': meta}


    examples = []
    service_map_counts = defaultdict(int)
    for e in iter_examples(file_paths):
        service_dir = e['meta']['file'].split('/')[2]

        service_map_counts[service_dir] += 1

        if service_map_counts[service_dir] > n_per_service:
            continue
        else:
            examples.append(e)

    print(len(examples))
    return examples

examples = get_examples(file_paths)

2645


In [67]:
with open('../data/processed/azure_examples.jsonl', 'w+') as azure_examples_file:
    for e in examples:
        azure_examples_file.write(json.dumps(e) + '\n')