In [1]:
import os
import json
import requests
from bs4 import BeautifulSoup
import markdown
from dotenv import load_dotenv, find_dotenv

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.


In [2]:
from src.data.cosmos import GremlinQueryManager, DocumentQueryManager
from src.data.graph.gremlin import GremlinQueryBuilder

## Patterns

In [3]:
load_dotenv(find_dotenv())

account_name = os.environ.get('COSMOS_ACCOUNT_NAME')
db_name = os.environ.get('COSMOS_DB_NAME')
graph_name = os.environ.get('COSMOS_GRAPH_NAME')
master_key = os.environ.get('COSMOS_MASTER_KEY')

gremlin_qm = GremlinQueryManager(account_name, master_key, db_name, graph_name)
doc_qm = DocumentQueryManager(account_name, master_key, db_name)

In [36]:
services = []

abbr = 'aws'
q = f"""g.V().has("label", "{abbr}_service")
        .project("id", "name", "shortDescription", "longDescription", "uri", "iconUri", "categories", "relatedServices")
        .by("id").by("name").by("short_description").by("long_description").by("uri").by("icon_uri")
        .by(out("belongs_to").values("name").fold())
        .by(coalesce(out("related_service").values("name").fold(), __.not(identity()).fold()))"""
cloud_data = gremlin_qm.query(q)
services += cloud_data
    
len(services)

176

In [41]:
patterns = []
for s in services:
    pattern = []
    n = s['name']
    if '(' in n:
        i = n.index('(')
        abbr = n[i+1:n.index(')')]
        patterns.append({"label": "AWS_SERVICE", 'pattern': [{'LOWER': abbr.lower()}]})
        n = n[:i]
    for word in n.split():
        pattern.append({'LOWER': word.lower()})
    
    patterns.append({'label': 'AWS_SERVICE', 'pattern': pattern})
    
extra_patterns = [
    {"label": "AWS_SERVICE", "pattern": [{"ORTH": "EMR"}]},
    {"label": "AWS_SERVICE", "pattern": [{"LOWER": "lambda"}]},
    {"label": "AWS_SERVICE", "pattern": [{"LOWER": "amazon"}, {"IS_UPPER": True}]}
]

with open('../data/processed/aws_service_patterns.jsonl', 'w+') as service_patterns_file:
    for p in patterns + extra_patterns:
        service_patterns_file.write(json.dumps(p) + '\n')

## Data

In [44]:
repo_urls = []
for i in range(1, 4):
    url = f'https://github.com/awsdocs?language=&page={i}&q=user-guide&type=&utf8=%E2%9C%93'
    res = requests.get(url, headers={"X-Requested-With": "XMLHttpRequest"})
    soup = BeautifulSoup(res.text, 'html.parser')
    repo_soups = soup.find_all('a', {'itemprop': "name codeRepository"})
    
    for rs in repo_soups:
        repo_urls.append(rs['href'])
        

        
print(repo_urls[:10])

['/awsdocs/aws-cloudformation-user-guide', '/awsdocs/amazon-ec2-user-guide', '/awsdocs/iam-user-guide', '/awsdocs/amazon-rds-user-guide', '/awsdocs/amazon-eks-user-guide', '/awsdocs/aws-cli-user-guide', '/awsdocs/aws-systems-manager-user-guide', '/awsdocs/amazon-vpc-user-guide', '/awsdocs/amazon-athena-user-guide', '/awsdocs/amazon-cloudwatch-user-guide']


In [52]:
examples = []
for repo_url in repo_urls:
    
    
    try:
        url = f"https://raw.githubusercontent.com{repo_url}/master/doc_source/index.md"
        index_res = requests.get(url)
        index_res.raise_for_status()
        html = markdown.markdown(index_res.text)
        soup = BeautifulSoup(html, 'html.parser')
        meta = {
            'source': repo_url,
            'file': url
        }
        for a in soup.find_all('a')[:14]:
            url = f"https://raw.githubusercontent.com{repo_url}/master/doc_source/{a['href']}"
            file_res = requests.get(url)
            file_res.raise_for_status()
            html = markdown.markdown(file_res.text)
            soup = BeautifulSoup(html, 'html.parser')
            for p in soup.find_all('p') + soup.find_all('h1') + soup.find_all('h2'):
                if 'ms.' not in p.text and '|' not in p.text:
                    examples.append({'text': ' '.join(p.text.split()).strip(), 'meta': meta})
    except Exception as e:
        print(e)

print(len(examples))

404 Client Error: Not Found for url: https://raw.githubusercontent.com/awsdocs/aws-powershell-user-guide/master/doc_source/index.md
404 Client Error: Not Found for url: https://raw.githubusercontent.com/awsdocs/amazon-sumerian-user-guide/master/doc_source/index.md
404 Client Error: Not Found for url: https://raw.githubusercontent.com/awsdocs/aws-toolkit-eclipse-user-guide/master/doc_source/index.md
404 Client Error: Not Found for url: https://raw.githubusercontent.com/awsdocs/aws-toolkit-visual-studio-user-guide/master/doc_source/index.md
404 Client Error: Not Found for url: https://raw.githubusercontent.com/awsdocs/amazon-pinpoint-user-guide/master/doc_source/index.md
404 Client Error: Not Found for url: https://raw.githubusercontent.com/awsdocs/aws-xray-developer-guide/master/doc_source/index.md
404 Client Error: Not Found for url: https://raw.githubusercontent.com/awsdocs/aws-amplify-console-user-guide/master/doc_source/index.md
404 Client Error: Not Found for url: https://raw.githu

In [53]:
print(len(examples))

26745


In [27]:
with open('../data/processed/aws_examples.jsonl', 'w+') as examples_file:
    for e in examples:
        examples_file.write(json.dumps(e) + '\n')