In [4]:
import os
import pandas as pd
from pathlib import Path
import requests
from dotenv import load_dotenv, find_dotenv

In [5]:
from src.data.cosmos import GremlinQueryManager, DocumentQueryManager
from src.data.graph.gremlin import GremlinQueryBuilder

In [6]:
load_dotenv(find_dotenv())

account_name = os.environ.get('COSMOS_ACCOUNT_NAME')
db_name = os.environ.get('COSMOS_DB_NAME')
graph_name = os.environ.get('COSMOS_GRAPH_NAME')
master_key = os.environ.get('COSMOS_MASTER_KEY')
search_key = os.environ.get('AZURE_SEARCH_KEY')
search_account_name = os.environ.get('AZURE_ACCOUNT_NAME')

gremlin_qm = GremlinQueryManager(account_name, master_key, db_name, graph_name)
doc_qm = DocumentQueryManager(account_name, master_key, db_name)

In [7]:
services_data = []

abbrs = gremlin_qm.query('g.V().has("label", "cloud").values("abbreviation")')
for abbr in abbrs:
# abbr = 'azure'
    q = f"""g.V().has("label", "{abbr}_service")
            .project("id", "name", "shortDescription", "longDescription", "uri", "iconUri", "categories", "relatedServices", "cloud")
            .by("id").by("name").by("short_description").by("long_description").by("uri").by("icon_uri")
            .by(out("belongs_to").values("name").fold())
            .by(coalesce(out("related_service").id().fold(), __.not(identity()).fold()))
            .by(out("belongs_to").out("source_cloud").values("name"))"""
    cloud_data = gremlin_qm.query(q)
    services_data += cloud_data
    
len(services_data)

822

In [94]:
[s for s in services_data if s['name'] == 'Azure Kubernetes Service (AKS)']

[{'id': '1663680c-57b4-4de6-a3c4-aa82852e093a',
  'name': 'Azure Kubernetes Service (AKS)',
  'shortDescription': 'Simplify the deployment, management, and operations of Kubernetes',
  'longDescription': 'Azure Kubernetes Service (AKS) manages your hosted Kubernetes environment, making it quick and easy to deploy and manage containerized applications without container orchestration expertise. It also eliminates the burden of ongoing operations and maintenance by provisioning, upgrading, and scaling resources on demand, without taking your applications offline.',
  'uri': 'https://docs.microsoft.com/en-us/azure/aks/',
  'iconUri': 'https://docs.microsoft.com/en-us/azure/media/index/containerservice.svg',
  'categories': ['Compute', 'Containers'],
  'relatedServices': ['76c946da-b923-49cf-a1e5-24ae089d53c3',
   'f83feec0-f005-470b-a9a9-67a76f108227'],
  'cloud': 'Microsoft Azure',
  '@search.action': 'mergeOrUpload'}]

In [89]:
class AzureSearchClient:
    def __init__(self, account_name, api_key):
        self.account_name = account_name
        self.api_key = api_key
        self.default_headers = {
            'api-key': api_key
        }
        
    def run_search(search_term):
        search_url = f'https://{self.account_name}.search.windows.net/indexes/services-v0/docs/autocomplete'
        params = {
            'api-version': '2017-11-11-preview',
            'search': search_term,
            '$top': 3,
            'scoringProfile': 'boostName',
            'autocompleteMode': 'twoTerms'
        }
        
        res = requests.get(search_url, headers=self.default_headers, params=params)
        return res
        
    def upsert_index(self, index_name, fields_config, suggesters, scoring_profiles):
        kwargs = {
            'headers': self.default_headers,
            'json': {
                'name': index_name,
                'fields': fields_config,
                'suggesters': suggesters,
                'scoringProfiles': scoring_profiles
            }
        }
        delete_res = requests.delete(f"https://{self.account_name}.search.windows.net/indexes/{index_name}?api-version=2017-11-11", **kwargs)
        res = requests.post(
            f"https://{self.account_name}.search.windows.net/indexes/?api-version=2017-11-11",
            **kwargs
        )
        return res
    
    def upload_data(self, index_name, data):
        for i in range(len(data)):
            data[i]['@search.action'] = 'mergeOrUpload'
        res = requests.post(
            f"https://{self.account_name}.search.windows.net/indexes/{index_name}/docs/index?api-version=2017-11-11",
            headers=self.default_headers,
            json={
                'value': data
            }
        )
        return res
    
    def upsert_synonym_map(self, name, synonyms):
        kwargs = {
            'headers': self.default_headers,
            'json': {
                'name': name,
                'format': 'solr',
                'synonyms': synonyms
            }
        }

        res = requests.post(
            f"https://{self.account_name}.search.windows.net/synonymmaps?api-version=2017-11-11",
            **kwargs
        )
        if res.status_code > 299:
            res = requests.put(
                f"https://{self.account_name}.search.windows.net/synonymmaps/{name}?api-version=2017-11-11",
                **kwargs
            )
            
        return res
        
    
search_client = AzureSearchClient(search_account_name, search_key)

In [76]:
azure_synonyms = """
AD, Active Directory, AAD\n
AKS, Azure Kubernetes Service\n,
function, functions
database, databases
"""
search_client.upsert_synonym_map('azure-service-abbreviations', azure_synonyms).text

''

In [91]:
services_v0_field_config = [
    {"name": "id", "type": "Edm.String", "key": True, "searchable": False, "sortable": False, "facetable": False},
    {"name": "name", "type": "Edm.String", "synonymMaps":["azure-service-abbreviations"]},
    {"name": "shortDescription", "type": "Edm.String", "filterable": False, "sortable": False, "facetable": False},
    {"name": "longDescription", "type": "Edm.String", "filterable": False, "sortable": False, "facetable": False},
    {"name": "uri", "type": "Edm.String", "facetable": False},
    {"name": "iconUri", "type": "Edm.String", "facetable": False},
    {"name": "categories", "type": "Collection(Edm.String)"},
    {"name": "relatedServices", "type": "Collection(Edm.String)", "searchable": False, "filterable": False, "sortable": False, "facetable": False},
    {"name": "cloud", "type": "Edm.String", "searchable": False, "sortable": False}
]

suggesters = [  
    {  
        "name": "suggest-name",  
        "searchMode": "analyzingInfixMatching",  
        "sourceFields": ["name"]
    }  
]

scoring_profiles = [  
    {  
      "name": "boostName",  
      "text": {  
        "weights": {  
          "name": 3          
        }  
      }  
    }
]

r = search_client.upsert_index(
    'services-v0', services_v0_field_config, suggesters, scoring_profiles
)
r

<Response [201]>

In [92]:
upload_res = search_client.upload_data('services-v0', services_data)
upload_res.status_code

200

In [38]:
gremlin_qm.query('g.V("7397ee26-10f0-40a7-9e9f-393a53686e42").in("source_cloud")')

[{'id': '1a629f6c-8ca7-43f2-8eb6-76eecd7fe0cf',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '54f59ebf-5a0a-4824-9841-10de80679f2f',
     'value': 'Analytics'}]}},
 {'id': 'fb88345e-f6ea-4508-a191-3c03563b96b0',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '368b14a5-dd14-428c-8dda-c83424ded5e1',
     'value': 'Application Integration'}]}},
 {'id': 'bf2f3feb-82e9-411b-92e5-04df21599a39',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '00a1890d-2057-49e0-b110-ad1c5ae7b341',
     'value': 'AR & VR'}]}},
 {'id': 'c4a12b19-c1b7-4e48-9c15-5785e363b4bd',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '91d95bfd-29af-4713-a9ff-03322ae12094',
     'value': 'AWS Cost Management'}]}},
 {'id': 'f6a2df1a-a3a6-427e-848d-ff325c49957f',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '55974085-532c-4ce8-96dc-5c7cb7fce324',
     'value': 

In [49]:
aws_azure_df = pd.read_csv('../data/processed/aws_azure_data_matching_output.csv')
aws_azure_related = aws_azure_df[aws_azure_df['Link Score'] > 0.6].sort_values(['Cluster ID'])

def get_svc_id(gremlin_qm, svc_name):
    q = f'g.V().has("name", "{svc_name}").values("id")'
    res = gremlin_qm.query(q)
    return res[0]

def build_related_query(from_id, to_id):
    return GremlinQueryBuilder.build_upsert_edge_query(from_id, to_id, {
        'label': 'related_service', 'related_service_score': aws_svc['Link Score']
    })

for i in range(list(aws_azure_related['Cluster ID'])[-1] + 1):
    related_services = aws_azure_related[aws_azure_related['Cluster ID'] == i].reset_index(drop=True)
    aws_svc = related_services.iloc[0]
    azure_svc = related_services.iloc[1]
    
    aws_id = get_svc_id(gremlin_qm, aws_svc['name'])
    azure_id = get_svc_id(gremlin_qm, azure_svc['name'])
    
    print(f'Adding related_service edges between {aws_svc["name"]} and {azure_svc["name"]}')

Adding related_service edges between Amazon Elastic Container Service and Azure Container Instances
Adding related_service edges between AWS Batch and Azure Batch
Adding related_service edges between Azure Kubernetes Service (AKS) and Amazon Elastic Container Service for Kubernetes
Adding related_service edges between Amazon Cloud Directory and Azure Active Directory
Adding related_service edges between Amazon API Gateway and Azure Application Gateway
Adding related_service edges between Azure Database Migration Service and AWS Database Migration Service
Adding related_service edges between Azure IoT Central and AWS IoT Analytics
Adding related_service edges between AWS Key Management Service and Azure Key Vault
Adding related_service edges between Amazon Aurora and Azure SQL Database
Adding related_service edges between Azure IoT Hub Device Provisioning Service and AWS IoT Device Management
Adding related_service edges between AWS IoT Core and Azure IoT Hub
Adding related_service edge

In [28]:
aws_azure_related.head()

Unnamed: 0,Cluster ID,Link Score,source file,category_name,name,short_description,long_description
28,0,0.996641,0,Compute,Amazon Elastic Container Service,Run and Manage Docker Containers,Amazon Elastic Container Service (Amazon ECS) ...
181,0,0.996641,1,Compute,Azure Container Instances,Easily run containers with a single command,Azure Container Instances offers the fastest a...
32,1,0.993355,0,Compute,AWS Batch,Run Batch Jobs at Any Scale,"AWS Batch enables developers, scientists, and ..."
180,1,0.993355,1,Compute,Azure Batch,Cloud-scale job scheduling and compute management,Use Batch to run large-scale parallel and high...
184,2,0.990656,1,Compute,Azure Kubernetes Service (AKS),"Simplify the deployment, management, and opera...",Azure Kubernetes Service (AKS) manages your ho...


In [83]:
azure_gcp_df = pd.read_csv('../data/processed/azure_gcp_data_matching_output.csv')
azure_gcp_related = azure_gcp_df[azure_gcp_df['Link Score'] > 0.6].sort_values(['Cluster ID'])

def get_svc_id(gremlin_qm, svc_name):
    q = f'g.V().has("name", "{svc_name}").values("id")'
    return gremlin_qm.query(q)[0]

def build_related_query(from_id, to_id, score):
    return GremlinQueryBuilder.build_upsert_edge_query(from_id, to_id, {
        'label': 'related_service', 'related_service_score': score
    })

azure_gcp_related

Unnamed: 0,Cluster ID,Link Score,source file,category_name,name,short_description,long_description
3,0,0.999976,0,Compute,Azure Functions,Process events with serverless code,Azure Functions is a serverless compute servic...
179,0,0.999976,1,Compute,Cloud Functions,Event-driven serverless compute platform.,Google Cloud Functions makes it easy for devel...
248,1,0.99978,1,Networking & Content Delivery,Cloud DNS,"Reliable, resilient, low-latency DNS serving.","Reliable, low-latency, authoritative DNS servi..."
16,1,0.99978,0,Networking & Content Delivery,Azure DNS,Host your DNS domain in Azure,"Learn how to use Azure DNS. Quickstarts, tutor..."
79,2,0.998992,0,AI + Machine Learning,Azure Machine Learning Services,"Build, deploy, and manage machine learning and...",Learn how to build intelligent algorithms into...
157,2,0.998992,1,AI + Machine Learning,Cloud Machine Learning Engine,Build superior models and deploy them into pro...,Create your predictive analytics and machine l...
130,3,0.998803,0,Management Tools,Azure Cost Management,"Optimize what you spend on the cloud, while ma...",Azure Cost Management is a cost management sol...
231,3,0.998803,1,Management tools,Cost management,"Tools for monitoring, controlling, and optimiz...",Increase cost predictability. Gain greater vis...
229,4,0.998419,1,Management tools,Cloud Shell,Command-line management from any browser.,Google Cloud Shell is a free admin machine wit...
126,4,0.998419,0,Management Tools,Azure Cloud Shell,Streamline Azure administration with a browser...,"Azure Cloud Shell is an interactive, browser-a..."


In [86]:
def update_related_services(prodigy_data_matching_output_df):
    for i in range(list(prodigy_data_matching_output_df['Cluster ID'])[-1] + 1):
        
        related_services = prodigy_data_matching_output_df[prodigy_data_matching_output_df['Cluster ID'] == i].reset_index(drop=True)
        left_svc = related_services[related_services['source file'] == 0].iloc[0]
        right_svc = related_services[related_services['source file'] == 1].iloc[0]
        
        left_id = get_svc_id(gremlin_qm, left_svc['name'])
        right_id = get_svc_id(gremlin_qm, right_svc['name'])

        
        left_related_edges = gremlin_qm.query(f"g.V('{left_id}').outE('related_service')")
        for rel_svc in left_related_edges:
            if rel_svc['inV'] != right_id:
                score = rel_svc['properties']['related_service_score']

                gremlin_qm.query(build_related_query(rel_svc['inV'], right_id, score))
                gremlin_qm.query(build_related_query(right_id, rel_svc['inV'], score))
        
        print(f'Adding related_service edges between {left_svc["name"]} and {right_svc["name"]}')
        gremlin_qm.query(build_related_query(left_id, right_id, left_svc['Link Score']))
        gremlin_qm.query(build_related_query(right_id, left_id, left_svc['Link Score']))

update_related_services(azure_gcp_related)

Adding related_service edges between Azure Functions and Cloud Functions
Adding related_service edges between Azure DNS and Cloud DNS
Adding related_service edges between Azure Machine Learning Services and Cloud Machine Learning Engine
Adding related_service edges between Azure Cost Management and Cost management
Adding related_service edges between Azure Cloud Shell and Cloud Shell
Adding related_service edges between Azure Dedicated HSM and Cloud HSM
Adding related_service edges between Azure Key Vault and Cloud Key Management Service
Adding related_service edges between Azure Security Center and Cloud Security Command Center (beta)
Adding related_service edges between Azure Visual Studio and Cloud Tools for Visual Studio
Adding related_service edges between Azure App Service and App Engine
Adding related_service edges between Azure HDInsight and Cloud Dataproc
Adding related_service edges between Azure Content Delivery Network and Cloud CDN
Adding related_service edges between Azur