In [21]:
import os
import pandas as pd
from pathlib import Path
import requests
from dotenv import load_dotenv, find_dotenv

In [17]:
from src.data.cosmos import GremlinQueryManager
from src.data.graph.gremlin import GremlinQueryBuilder

In [22]:
# Pull Cosmos DB connection settings from the nearest .env file into the
# process environment, then read them back.
load_dotenv(find_dotenv())

account_name = os.environ.get('COSMOS_ACCOUNT_NAME')
db_name = os.environ.get('COSMOS_DB_NAME')
graph_name = os.environ.get('COSMOS_GRAPH_NAME')
master_key = os.environ.get('COSMOS_MASTER_KEY')

# Fail fast with a readable message instead of handing None to the query
# manager and getting an opaque connection/auth failure later on.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ('COSMOS_ACCOUNT_NAME', account_name),
        ('COSMOS_DB_NAME', db_name),
        ('COSMOS_GRAPH_NAME', graph_name),
        ('COSMOS_MASTER_KEY', master_key),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(_missing_settings)
    )

# Argument order is (account, key, database, graph), as in the original call.
gremlin_qm = GremlinQueryManager(account_name, master_key, db_name, graph_name)

In [38]:
# Ad-hoc check: list the vertices that point at the given vertex through a
# "source_cloud" edge. Kept as the cell's last expression so the result is
# displayed.
source_cloud_query = 'g.V("7397ee26-10f0-40a7-9e9f-393a53686e42").in("source_cloud")'
gremlin_qm.query(source_cloud_query)

[{'id': '1a629f6c-8ca7-43f2-8eb6-76eecd7fe0cf',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '54f59ebf-5a0a-4824-9841-10de80679f2f',
     'value': 'Analytics'}]}},
 {'id': 'fb88345e-f6ea-4508-a191-3c03563b96b0',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '368b14a5-dd14-428c-8dda-c83424ded5e1',
     'value': 'Application Integration'}]}},
 {'id': 'bf2f3feb-82e9-411b-92e5-04df21599a39',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '00a1890d-2057-49e0-b110-ad1c5ae7b341',
     'value': 'AR & VR'}]}},
 {'id': 'c4a12b19-c1b7-4e48-9c15-5785e363b4bd',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '91d95bfd-29af-4713-a9ff-03322ae12094',
     'value': 'AWS Cost Management'}]}},
 {'id': 'f6a2df1a-a3a6-427e-848d-ff325c49957f',
  'label': 'aws_category',
  'type': 'vertex',
  'properties': {'name': [{'id': '55974085-532c-4ce8-96dc-5c7cb7fce324',
     'value': 

In [30]:
# Minimum 'Link Score' a row needs to be treated as a related service.
LINK_SCORE_THRESHOLD = 0.6

aws_azure_df = pd.read_csv('../data/processed/aws_azure_data_matching_output.csv')

# Sort on 'source file' as a secondary key: sorting on 'Cluster ID' alone
# leaves the order of rows inside a cluster undefined, while the loop below
# reads the rows of each cluster by position.
# NOTE(review): 'source file' looks like 0 = AWS, 1 = Azure in the recorded
# output - confirm against the matching step that produced the CSV.
aws_azure_related = (
    aws_azure_df[aws_azure_df['Link Score'] > LINK_SCORE_THRESHOLD]
    .sort_values(['Cluster ID', 'source file'])
)

def get_svc_id(gremlin_qm, svc_name):
    """Return the id of the first vertex whose "name" property equals ``svc_name``.

    Args:
        gremlin_qm: Object exposing ``query(str)`` that returns a sequence of
            results for a Gremlin query string.
        svc_name: Service name to match against the "name" property.

    Returns:
        The first element of the query result.

    Raises:
        IndexError: If the query returns no results (same exception type the
            bare ``[0]`` lookup produced before, now with a useful message).
    """
    # Escape backslashes first, then double quotes, so a name containing either
    # cannot terminate the string literal or alter the traversal.
    escaped_name = str(svc_name).replace('\\', '\\\\').replace('"', '\\"')
    q = f'g.V().has("name", "{escaped_name}").values("id")'
    matches = gremlin_qm.query(q)
    if not matches:
        raise IndexError(f'No vertex found with name {svc_name!r}')
    return matches[0]

def build_related_query(from_id, to_id, score=None):
    """Build the upsert query for a ``related_service`` edge from ``from_id`` to ``to_id``.

    Args:
        from_id: Id of the vertex the edge starts at.
        to_id: Id of the vertex the edge points to.
        score: Value stored in the edge's ``related_service_score`` property.
            When omitted, falls back to ``aws_svc['Link Score']`` from the
            notebook's global scope, which is what this function always read
            before the parameter existed.

    Returns:
        Whatever ``GremlinQueryBuilder.build_upsert_edge_query`` returns for
        the given ids and edge properties.
    """
    if score is None:
        # Backward-compatible path: relies on the notebook-level `aws_svc` row
        # set by the calling loop. Prefer passing `score` explicitly.
        score = aws_svc['Link Score']
    return GremlinQueryBuilder.build_upsert_edge_query(from_id, to_id, {
        'label': 'related_service', 'related_service_score': score
    })

# Walk the clusters that actually survived the score filter. Iterating over
# range(max_id + 1) would hit cluster ids that were filtered out entirely and
# fail on the empty frame.
for cluster_id, related_services in aws_azure_related.groupby('Cluster ID'):
    # Pick the pair by provider rather than by row position, since row order
    # inside a cluster is not guaranteed.
    # NOTE(review): 'source file' looks like 0 = AWS, 1 = Azure in the recorded
    # head() output - confirm against the matching step that produced the CSV.
    aws_rows = related_services[related_services['source file'] == 0]
    azure_rows = related_services[related_services['source file'] == 1]
    if aws_rows.empty or azure_rows.empty:
        print(f'Skipping cluster {cluster_id}: no AWS/Azure pair above the score threshold')
        continue

    # `aws_svc` stays a notebook-level name: build_related_query reads the
    # score from it when no explicit score is passed.
    aws_svc = aws_rows.iloc[0]
    azure_svc = azure_rows.iloc[0]

    aws_id = get_svc_id(gremlin_qm, aws_svc['name'])
    azure_id = get_svc_id(gremlin_qm, azure_svc['name'])

    print(f'Adding related_service edges between {aws_svc["name"]} and {azure_svc["name"]}')
    # Upsert one edge per direction, matching the two edges per pair shown in
    # the recorded output of this cell.
    for from_id, to_id in ((aws_id, azure_id), (azure_id, aws_id)):
        print(gremlin_qm.query(build_related_query(from_id, to_id)))

Adding related_service edges between Amazon Elastic Container Service and Azure Container Instances
[{'id': '3e76213f-a5f6-4b5c-9d78-2bd64b582f81', 'label': 'related_service', 'type': 'edge', 'inVLabel': 'azure_service', 'outVLabel': 'aws_service', 'inV': '3ca3ca43-8f6d-4b6d-85f9-d08415beed2f', 'outV': '9bcaffb7-743b-4c91-aef6-514da1edfff1', 'properties': {'related_service_score': 0.9966404999999999}}]
[{'id': '662144a0-e1c9-4ebd-99d2-64b5ca13ed5d', 'label': 'related_service', 'type': 'edge', 'inVLabel': 'aws_service', 'outVLabel': 'azure_service', 'inV': '9bcaffb7-743b-4c91-aef6-514da1edfff1', 'outV': '3ca3ca43-8f6d-4b6d-85f9-d08415beed2f', 'properties': {'related_service_score': 0.9966404999999999}}]
Adding related_service edges between AWS Batch and Azure Batch
[{'id': '4e42a288-987e-4daf-bf79-b813ae2f055b', 'label': 'related_service', 'type': 'edge', 'inVLabel': 'azure_service', 'outVLabel': 'aws_service', 'inV': 'f7bee802-9bce-4612-b85a-3ec54e5462c3', 'outV': 'ffd93a3f-6d9f-46f8-b

In [28]:
# Preview the first five rows that passed the Link Score filter.
aws_azure_related.head(n=5)

Unnamed: 0,Cluster ID,Link Score,source file,category_name,name,short_description,long_description
28,0,0.996641,0,Compute,Amazon Elastic Container Service,Run and Manage Docker Containers,Amazon Elastic Container Service (Amazon ECS) ...
181,0,0.996641,1,Compute,Azure Container Instances,Easily run containers with a single command,Azure Container Instances offers the fastest a...
32,1,0.993355,0,Compute,AWS Batch,Run Batch Jobs at Any Scale,"AWS Batch enables developers, scientists, and ..."
180,1,0.993355,1,Compute,Azure Batch,Cloud-scale job scheduling and compute management,Use Batch to run large-scale parallel and high...
184,2,0.990656,1,Compute,Azure Kubernetes Service (AKS),"Simplify the deployment, management, and opera...",Azure Kubernetes Service (AKS) manages your ho...


In [8]:
# Hand-verified (AWS service, Azure service) equivalents used to spot-check
# the matching output.
known_pairs = [
    ('AWS Lambda', 'Azure Functions')
]

for aws_name, azure_name in known_pairs:
    # A bare expression inside a loop is discarded by the notebook, so print
    # explicitly; show both members of the pair so their 'Cluster ID' and
    # 'Link Score' can be compared side by side.
    print(aws_azure_df.loc[aws_azure_df['name'].isin([aws_name, azure_name])])

    Cluster ID  Link Score  source file category_name        name  \
36          67         NaN            0       Compute  AWS Lambda   

                      short_description  \
36  Run your Code in Response to Events   

                                     long_description  
36  AWS Lambda lets you run code without provision...  
