In [169]:
from dotenv import find_dotenv, load_dotenv
import os
import pandas as pd
from fuzzywuzzy import fuzz
import configparser
from azure.cosmos.cosmos_client import CosmosClient
from gremlin_python.driver import client, serializer
import requests
import pandas as pd
import re

In [146]:
aws_services = pd.read_csv('../data/raw/aws_services.csv')
aws_services.fillna('', inplace=True)
aws_services.head()

Unnamed: 0,category_id,category_name,icon,link,long_description,name,short_description
0,1,Analytics,,https://aws.amazon.com/athena/?c=1&pt=1,Amazon Athena is an interactive query service ...,Amazon Athena,Query Data in S3 using SQL
1,1,Analytics,,https://aws.amazon.com/cloudsearch/?c=1&pt=2,Related Links\nGet Started for Free\nAmazon Cl...,Amazon CloudSearch,Managed Search Service
2,1,Analytics,,https://aws.amazon.com/elasticsearch-service/?...,Amazon Elasticsearch Service is a fully manage...,Amazon Elasticsearch Service,Run and Scale Elasticsearch Clusters
3,1,Analytics,,https://aws.amazon.com/elasticmapreduce/?c=1&pt=4,Amazon EMR provides a managed Hadoop framework...,Amazon EMR,Hosted Hadoop Framework
4,1,Analytics,,https://aws.amazon.com/kinesis/?c=1&pt=5,"Amazon Kinesis makes it easy to collect, proce...",Amazon Kinesis,Work with Real-time Streaming Data


In [147]:
azure_services = pd.read_csv('../data/raw/azure_services.csv')
azure_services.fillna('', inplace=True)
azure_services.head()

Unnamed: 0,category_id,category_name,icon,link,long_description,name,short_description
0,Compute,Compute,https://docs.microsoft.com/en-us/azure/media/i...,https://docs.microsoft.com/en-us/azure/virtual...,"Provision virtual machines of Ubuntu, Red Hat,...",Azure Linux Virtual Machines,"Provision virtual machines of Ubuntu, Red Hat,..."
1,Compute,Compute,https://docs.microsoft.com/en-us/azure/media/i...,https://docs.microsoft.com/en-us/azure/virtual...,"Provision virtual machines for SQL Server, Sha...",Azure Windows Virtual Machines,"Provision virtual machines for SQL Server, Sha..."
2,Compute,Compute,https://docs.microsoft.com/en-us/azure/media/i...,https://docs.microsoft.com/en-us/azure/app-ser...,Quickly create powerful cloud apps for web and...,Azure App Service,Quickly create powerful cloud apps for web and...
3,Compute,Compute,https://docs.microsoft.com/en-us/azure/media/i...,https://docs.microsoft.com/en-us/azure/azure-f...,Process events with serverless code,Azure Functions,Process events with serverless code
4,Compute,Compute,https://docs.microsoft.com/en-us/azure/media/i...,https://docs.microsoft.com/en-us/azure/batch/,Cloud-scale job scheduling and compute management,Azure Batch,Cloud-scale job scheduling and compute management


In [148]:
google_services = pd.read_csv('../data/raw/google_services.csv')
google_services.fillna('', inplace=True)
google_services.head()

Unnamed: 0,category_name,icon,link,long_description,name,short_description
0,Compute,https://cloud.google.com/products/_static/imag...,https://cloud.google.com/compute/,Google Compute Engine offers high performance ...,Compute Engine,"Scalable, high performance VMs."
1,Compute,https://cloud.google.com/products/_static/imag...,https://cloud.google.com/appengine/,Google App Engine lets developers build scalab...,App Engine,Serverless application platform for apps and b...
2,Compute,https://cloud.google.com/products/_static/imag...,https://cloud.google.com/kubernetes-engine/,Google Kubernetes Engine is a powerful cluster...,Kubernetes Engine,Run containerized applications.
3,Compute,https://cloud.google.com/products/_static/imag...,https://cloud.google.com/gke-on-prem/,GKE On-Prem will provision and manage the unde...,GKE On-Prem,Make apps “cloud-ready” and move them to the c...
4,Compute,https://cloud.google.com/products/_static/imag...,https://cloud.google.com/functions/,Google Cloud Functions makes it easy for devel...,Cloud Functions,Event-driven serverless compute platform.


In [25]:
print('AWS Categories')
aws_categories = list(aws_services['category_name'].unique())
print(aws_categories)
print()
print('Azure Categories')
azure_categories = list(azure_services['category_name'].unique())
print(azure_categories)
print()
print('Google Cloud Categories')
google_categories = list(google_services['category_name'].unique())
print(google_categories)

AWS Categories
['Analytics', 'Application Integration', 'AR & VR', 'AWS Cost Management', 'Blockchain', 'Business Applications', 'Compute', 'Customer Engagement', 'Database', 'Desktop & App Streaming', 'Developer Tools', 'Game Tech', 'Internet of Things', 'Machine Learning', 'Management & Governance', 'Media Services', 'Migration & Transfer', 'Mobile', 'Networking & Content Delivery', 'Robotics', 'Satellite', 'Security, Identity & Compliance', 'Storage']

Azure Categories
['Compute', 'Networking', 'Storage', 'Web', 'Mobile', 'Containers', 'Databases', 'Analytics', 'AI + Machine Learning', 'Internet of Things', 'Integration', 'Identity', 'Security', 'DevOps', 'Developer Tools', 'Management Tools', 'Media', 'Migration', 'Azure Stack', 'Sovereign Clouds']

Google Cloud Categories
['Compute', 'Databases', 'Management tools', 'Apigee API Platform', 'Storage', 'Migration', 'Networking', 'Developer tools', 'Cloud IoT Core', 'Anvato', 'Data analytics', 'AI and machine learning', 'Cloud IAM', '

In [83]:
normalized_cats = {}
for aws_cat in aws_categories:
    for azure_cat in azure_categories:
        if fuzz.ratio(aws_cat, azure_cat) > 75 or aws_cat in azure_cat or azure_cat in aws_cat:
            
            if len(aws_cat) > len(azure_cat):
                norm = azure_cat
            else:
                norm = aws_cat

            normalized_cats[azure_cat] = norm
            normalized_cats[aws_cat] = norm
            
normalized_cats

{'Analytics': 'Analytics',
 'Integration': 'Integration',
 'Application Integration': 'Integration',
 'Compute': 'Compute',
 'Databases': 'Database',
 'Database': 'Database',
 'Developer Tools': 'Developer Tools',
 'Internet of Things': 'Internet of Things',
 'AI + Machine Learning': 'Machine Learning',
 'Machine Learning': 'Machine Learning',
 'Media': 'Media',
 'Media Services': 'Media',
 'Migration': 'Migration',
 'Migration & Transfer': 'Migration',
 'Mobile': 'Mobile',
 'Networking': 'Networking',
 'Networking & Content Delivery': 'Networking',
 'Identity': 'Identity',
 'Security, Identity & Compliance': 'Security',
 'Security': 'Security',
 'Storage': 'Storage'}

In [170]:
class GremlinQueryManager:
    def __init__(self, account_name, master_key, database_name, graph_name):
        self.client = client.Client(
            'wss://{}.gremlin.cosmosdb.azure.com:443/'.format(account_name),
            'g',
            username="/dbs/{}/colls/{}".format(database_name, graph_name),
            password=master_key,
            message_serializer=serializer.GraphSONSerializersV2d0()
        )

    def query(self, query):
        callback = self.client.submitAsync(query)
        if callback.result():
            res = []
            for r in callback.result():
                res += r
        return res


load_dotenv(find_dotenv())

account_name = os.environ.get('COSMOS_ACCOUNT_NAME')
db_name = os.environ.get('COSMOS_DB_NAME')
graph_name = os.environ.get('COSMOS_GRAPH_NAME')
master_key = os.environ.get('COSMOS_MASTER_KEY')

gremlin_qm = GremlinQueryManager(account_name, master_key, db_name, graph_name)

In [160]:
import re
class GremlinQueryBuilder:
    """
    Basic functions to build gremlin queries that add vertices and edges
    """
    @classmethod
    def gremlin_escape(cls, s):
        return s.replace('"', '\\"').replace('$', '\\$')
    
    @classmethod
    def build_upsert_vertex_query(cls, entity_type, properties):
        q = f"""g.V().has("label", "{entity_type}"){cls.get_properties_str(properties, False)}.
                fold().
                coalesce(unfold(),
                         addV("{entity_type}"){cls.get_properties_str(properties)})"""
        return q

    @classmethod
    def build_upsert_edge_query(cls, from_id, to_id, edge_properties):
        """
        g.V().has('person','name','vadas').as('v').
           V().has('software','name','ripple').
           coalesce(__.inE('created').where(outV().as('v')),
                    addE('created').from('v').property('weight',0.5))
        """
        label = edge_properties["label"]
        return f"""g.V("{from_id}").as('v').
                    V("{to_id}").
                    coalesce(__.inE("{label}").where(outV().as('v')),
                             addE("{label}").from('v'){cls.get_properties_str(edge_properties)})"""
    
    @classmethod
    def get_by_id_query(cls, _id):
        return 'g.V("{}")'.format(_id)
    
    @classmethod
    def get_properties_str(cls, properties, create=True):
        if create:
            query_str = 'property'
        else:
            query_str = 'has'
        
    
        properties_lower = {k.lower():v for k,v in properties.items()}
        
        if "label" in properties_lower:
            del properties_lower["label"]

        output = ""
        for k, v in properties_lower.items():
            if isinstance(v, str):
                output += '.{}("{}", "{}")'.format(query_str, k, v)
            else:
                output += '.{}("{}", {})'.format(query_str, k, v)
        return output

In [122]:
test_eq = GremlinQueryBuilder.build_upsert_edge_query('4952feb6-55dc-4d8f-9d63-4fca5c4265e3', 'b8e694c2-4e0b-4c17-b01b-b00670caaa6e', {'label': 'source_cloud'})
print(test_eq)
gremlin_qm.query(test_eq)

g.V("4952feb6-55dc-4d8f-9d63-4fca5c4265e3").as('v').
                    V("b8e694c2-4e0b-4c17-b01b-b00670caaa6e").
                    coalesce(__.inE("source_cloud").where(outV().as('v')),
                             addE("source_cloud").from('v'))


[{'id': 'b0e40429-37b6-4d0b-806e-a64163d8b1ae',
  'label': 'source_cloud',
  'type': 'edge',
  'inVLabel': 'cloud',
  'outVLabel': 'aws_category',
  'inV': 'b8e694c2-4e0b-4c17-b01b-b00670caaa6e',
  'outV': '4952feb6-55dc-4d8f-9d63-4fca5c4265e3'}]

In [157]:
for text, norm in normalized_cats.items():
    print(text, norm)
    category_res = gremlin_qm.query(f"g.V().has('name', '{norm}').has('label', 'category')")
    if category_res:
        norm_category_id = category_res[0]['id']
    else:
        vq = GremlinQueryBuilder.build_add_vertex_query('category', {'name': norm})
        norm_category_id = gremlin_qm.query(vq)[0]['id']

    cloud_source_category_id = gremlin_qm.query(f"g.V().has('name', '{text}')")[0]['id']
    
    eq = GremlinQueryBuilder.build_add_edge_query(cloud_source_category_id, norm_category_id, {'label': 'super_category'})
    gremlin_qm.query(eq)

Analytics Analytics


AttributeError: type object 'GremlinQueryBuilder' has no attribute 'build_add_vertex_query'

## Add clouds

In [128]:
sources = {
    'aws': 'Amazon Web Services', 
    'azure': 'Microsoft Azure', 
    'gcp': 'Google Cloud'
}
for abbreviation, source in sources.items():    
    q = GremlinQueryBuilder.build_upsert_vertex_query('cloud', {'name': source, 'abbreviation': abbreviation})
    gremlin_qm.query(q)

## Add cloud categories

In [127]:
for i, source in enumerate([aws_categories, azure_categories, google_categories]):
    abbr = list(sources.keys())[i]
    r = gremlin_qm.query(f"g.V().has('abbreviation', '{abbr}')")
    cloud_id = r[0]['id']
    
    for cat in source:
        vq = GremlinQueryBuilder.build_upsert_vertex_query(f'{abbr}_category', {'name': cat})
        v_id = gremlin_qm.query(vq)[0]['id']
        eq = GremlinQueryBuilder.build_upsert_edge_query(v_id, cloud_id, {'label': 'source_cloud'})
        gremlin_qm.query(eq)

## Add services for each cloud category

In [162]:
for abbr, df in zip(sources.keys(), [aws_services, azure_services, google_services]):
    source_name = sources[abbr]
    print(f"Adding services for {source_name}")
    def add_service_and_edge(row):
        label = f'{abbr}_service'
        props = {
            'name': row['name'],
            'short_description': row['short_description'],
            'long_description': row['long_description'],
            'uri': row['link'],
            'icon_uri': row['icon']
        }
        for k, v in props.items():
            props[k] = GremlinQueryBuilder.gremlin_escape(v)
        vq = GremlinQueryBuilder.build_upsert_vertex_query(label, props)
        
        v_res = gremlin_qm.query(vq)    

        cat_name = row['category_name']
        cat_id = gremlin_qm.query(f"g.V().has('name', '{cat_name}').has('label', '{abbr}_category')")[0]['id']

        cat_eq = GremlinQueryBuilder.build_upsert_edge_query(v_res[0]['id'], cat_id, {'label': 'belongs_to'})
        gremlin_qm.query(cat_eq)


    df.apply(add_service_and_edge, axis=1)

Adding services for Amazon Web Services
Adding services for Microsoft Azure
Adding services for Google Cloud
