## Exemplo de obtenção de informação dos repositórios

In [None]:
import requests
import json

# GitHub repository-search URLs, one per (query, language) pair.
# The trailing "{}" placeholder receives the result page number.
search_links = [
    "https://api.github.com/search/repositories?q=serverless+language:python&sort=stars&order=desc&page={}",
    "https://api.github.com/search/repositories?q=serverless+language:javascript&sort=stars&order=desc&page={}",
    "https://api.github.com/search/repositories?q=function+as+a+service+language:python&sort=stars&order=desc&page={}",
    "https://api.github.com/search/repositories?q=function+as+a+service+language:javascript&sort=stars&order=desc&page={}",
    "https://api.github.com/search/repositories?q=faas+language:python&sort=stars&order=desc&page={}",
    "https://api.github.com/search/repositories?q=faas+language:javascript&sort=stars&order=desc&page={}"
]

# One dict of relevant fields per repository found, across all searches
repos_info = []

for search in search_links:

    print(f"Requesting url {search}")

    # GitHub numbers result pages from 1; starting at 0 risks fetching the
    # first page twice and storing duplicated repositories.
    page = 1

    while True:
        print(f"page {page}")

        api_result = requests.get(search.format(page))
        if api_result.status_code != 200:
            # Any non-200 answer ends this search; the body is printed so the
            # reason (e.g. a rate-limit message) is visible.
            print(api_result.text)
            break

        repos = json.loads(api_result.text)
        items = repos.get("items", [])
        if not items:
            # An empty page with status 200 means there are no more results;
            # without this check the loop would keep requesting empty pages.
            break

        # Store the relevant information of each repository in a dict
        for repo in items:
            relevant_repo_info = {
                "description": repo["description"],
                "title": repo["full_name"],
                "url": repo["html_url"],
                "language": repo["language"],
                "owner_url": repo["owner"]["html_url"],
                "clone_url": repo["clone_url"],
                "included": True
            }

            # Heuristic to exclude likely false positives: fetch the README
            # from the master branch and look for tell-tale words.
            readme_url = (
                relevant_repo_info["url"].replace(
                    "https://github.com",
                    "https://raw.githubusercontent.com"
                )
                + "/master/README.md"
            )
            readme_content = requests.get(readme_url).text

            if "framework" in readme_content or "plugin" in readme_content:
                relevant_repo_info["included"] = False

            repos_info.append(relevant_repo_info)
        page += 1
repos_info

## Extração de métricas dos repositórios

In [None]:
import os
import stat
import re
import shutil
from git import Repo
import pprint

# Pretty-printer with 4-space indentation; no use of it is visible in these cells
pp = pprint.PrettyPrinter(indent=4)

def is_config_file(file):
    """Tell whether a file name looks like a configuration file.

    The check is a plain substring test against a fixed list of markers,
    so a marker found anywhere in the name counts as a match.
    """
    markers = (".json", "config.", ".yml", ".conf", ".config", "settings.")
    for marker in markers:
        if marker in file:
            return True
    return False

def get_func_names(file, language):
    """Return the names of the functions declared in a source file.

    Args:
        file: Path of the file to scan.
        language: "Python" (matches ``def name(``) or "JavaScript"
            (matches ``exports.name = ...``).

    Returns:
        A list of names in order of appearance. The list is empty when the
        language is not supported or the file is not valid UTF-8 text.
    """
    patterns = {
        # Capture only the identifier: a greedy ".*" would run up to the
        # LAST "(" on the line (e.g. "foo(x=bar" for "def foo(x=bar()):").
        "Python": r"\bdef\s+(\w+)\s*\(",
        "JavaScript": r"exports\.(.*) = .*",
    }
    regex = patterns.get(language)
    if regex is None:
        return []

    try:
        with open(file, encoding="utf-8") as handle:
            content = handle.read()
    except UnicodeDecodeError:
        # Binary or non-UTF-8 file: nothing to extract
        return []

    return re.findall(regex, content)

def get_func_params(file, language):
    """Return the parameter list of every function declared in a file.

    Args:
        file: Path of the file to scan.
        language: "Python" (``def name(params):``) or "JavaScript"
            (``exports.name = ...(params)``).

    Returns:
        One list of parameter strings per matched function, stripped of
        surrounding whitespace. A function without parameters yields an
        empty list. The result is empty when the language is not supported
        or the file is not valid UTF-8 text.

    Note:
        Parameters are split on every comma, so a default value that itself
        contains a comma (e.g. ``b=(1, 2)``) is still split in two.
    """
    patterns = {
        # Anchor on the function name so the captured group starts at the
        # parenthesis that opens the parameter list.
        "Python": r"\bdef\s+\w+\s*\((.*)\).*:",
        # Lazy prefixes pick the first " = " and the first parenthesis group
        # instead of the last one on the line.
        "JavaScript": r"exports\..*? = .*?\(([^()]*)\)",
    }
    regex = patterns.get(language)
    if regex is None:
        return []

    try:
        with open(file, encoding="utf-8") as handle:
            content = handle.read()
    except UnicodeDecodeError:
        return []

    matches = re.findall(regex, content)
    # "".split(",") is [""], which would count as one parameter; drop blanks
    # so that functions without parameters report zero.
    return [
        [param.strip() for param in match.split(",") if param.strip()]
        for match in matches
    ]
    
def get_file_size(file):
    """Return the number of lines in a text file.

    The file is decoded as UTF-8 explicitly (like ``get_func_names`` does)
    so the result does not depend on the platform's default encoding.

    Returns:
        The line count, or 0 when the file cannot be decoded as UTF-8
        (e.g. a binary file).
    """
    try:
        with open(file, encoding="utf-8") as handle:
            return sum(1 for _ in handle)
    except UnicodeDecodeError:
        return 0

def get_comment_amount(file, language):
    """Count the comment lines of a source file.

    A line counts when its first non-blank characters open a comment:
    ``#`` for Python, ``//`` or ``/*`` for JavaScript. Trailing comments
    after code and the inner lines of a block comment are not counted.

    Args:
        file: Path of the file to scan.
        language: "Python" or "JavaScript".

    Returns:
        The number of comment lines, or 0 when the language is not
        supported or the file is not valid UTF-8 text.
    """
    patterns = {
        # "[ \t]*" (not "\s*") so the match cannot spill over newlines;
        # it lets indented comments count, not only those at column 0.
        "Python": (r"^[ \t]*#.*$",),
        "JavaScript": (r"^[ \t]*/\*.*$", r"^[ \t]*//.*$"),
    }
    regexes = patterns.get(language)
    if regexes is None:
        return 0

    try:
        with open(file, encoding="utf-8") as handle:
            content = handle.read()
    except UnicodeDecodeError:
        return 0

    return sum(len(re.findall(regex, content, flags=re.M)) for regex in regexes)

def try_detect_provider(file, language):
    """Guess the cloud provider a file targets from well-known markers.

    Args:
        file: Path of the file to inspect. The path itself is also checked
            for "function.json".
        language: "Python" or "JavaScript"; selects the marker set.

    Returns:
        "AWS", "AZURE" or "GCP" for the first marker found (checked in that
        order), or None when nothing matches, the language is not supported
        or the file is not valid UTF-8 text.
    """
    try:
        with open(file, encoding="utf-8") as handle:
            content = handle.read()
    except UnicodeDecodeError:
        # Must be None, not []: callers keep probing further files only
        # while the provider is still None.
        return None

    is_function_json = "function.json" in file

    if language == "Python":
        if "boto3" in content:
            return "AWS"
        if "azure.functions" in content or is_function_json:
            return "AZURE"
        if "google.cloud" in content:
            return "GCP"
    elif language == "JavaScript":
        if "aws-sdk" in content or "exports.handler" in content:
            return "AWS"
        if "azure/functions" in content or is_function_json:
            return "AZURE"
        if "google-cloud" in content:
            return "GCP"

    return None
        

def _average(values):
    """Return the arithmetic mean of *values*, or 0 for an empty sequence."""
    return sum(values) / len(values) if values else 0


def get_metrics(target_repo, repo_dir="repo_teste"):
    """Walk a cloned repository and compute its structural metrics.

    Args:
        target_repo: Repository description; only its "language" entry
            ("Python" or "JavaScript") is read.
        repo_dir: Directory holding the clone. Defaults to the directory
            used by the cloning step of this notebook.

    Returns:
        A dict with folder/file counts, function name/parameter statistics,
        line and comment statistics, the detected provider and the language.
        Every average, maximum and minimum is 0 when there is no data for
        it, instead of raising on an empty repository.

    Notes:
        Only the ".git" directory is excluded from the walk (directories
        whose name merely contains ".git", such as ".github", are analysed).
        When several folders contain a dependency manifest, the last one
        visited determines "external_lib_amount".
    """
    code_extensions = {"Python": ".py", "JavaScript": ".js"}
    dependency_manifests = {"Python": "requirements.txt", "JavaScript": "package.json"}

    language = target_repo["language"]
    code_extension = code_extensions.get(language)
    dependency_manifest = dependency_manifests.get(language)

    metricas = {"root_folder_amount": 0, "root_file_amount": 0}
    max_depth = 0
    total_folders = 0
    total_files = 0
    code_files_per_folder = []
    name_size_per_func = []
    params_per_func = []
    amount_external_libs = 0
    amount_lines_per_code_file = []
    amount_lines_per_config_file = []
    amount_funcs_per_file = []
    provider = None
    amount_comments_per_file = []

    for dir_name, subdirs, files in os.walk(repo_dir):
        # Prune git metadata in place so os.walk never descends into it
        subdirs[:] = [subdir for subdir in subdirs if subdir != ".git"]

        relative_dir = os.path.relpath(dir_name, repo_dir)
        is_root = relative_dir == os.curdir
        if is_root:
            metricas["root_folder_amount"] = len(subdirs)
            metricas["root_file_amount"] = len(files)

        # Depth relative to the clone root, independent of the path separator
        depth = 0 if is_root else relative_dir.count(os.sep) + 1
        max_depth = max(max_depth, depth)

        total_files += len(files)
        total_folders += len(subdirs)

        for config_file in files:
            if not is_config_file(config_file):
                continue
            config_path = os.path.join(dir_name, config_file)
            amount_lines_per_config_file.append(get_file_size(config_path))
            # "not provider" also covers an empty result from the detector
            if not provider:
                provider = try_detect_provider(config_path, language)

        if code_extension is None:
            # Unsupported language: only the structural counts apply
            continue

        # endswith() rather than a substring test, so that e.g. "package.json"
        # is not mistaken for a ".js" source file
        code_files = [name for name in files if name.endswith(code_extension)]

        for code_file in code_files:
            code_path = os.path.join(dir_name, code_file)
            func_names = get_func_names(code_path, language)
            func_params = get_func_params(code_path, language)
            name_size_per_func.extend(len(name) for name in func_names)
            params_per_func.extend(len(params) for params in func_params)
            amount_funcs_per_file.append(len(func_names))
            amount_lines_per_code_file.append(get_file_size(code_path))
            amount_comments_per_file.append(get_comment_amount(code_path, language))

            if not provider:
                provider = try_detect_provider(code_path, language)

        code_files_per_folder.append(len(code_files))

        if dependency_manifest in files:
            # The manifest's line count is used as a proxy for the number of
            # external libraries
            amount_external_libs = get_file_size(os.path.join(dir_name, dependency_manifest))

    total_code_files = sum(code_files_per_folder)
    total_funcs = len(name_size_per_func)

    metricas["max_depth"] = max_depth
    metricas["total_files"] = total_files
    metricas["total_folders"] = total_folders
    metricas["total_code_files"] = total_code_files
    metricas["avg_code_files_per_folder"] = total_code_files / total_folders if total_folders else 0
    metricas["total_funcs"] = total_funcs
    metricas["avg_func_name_length"] = _average(name_size_per_func)
    metricas["avg_func_param_amount"] = _average(params_per_func)
    metricas["external_lib_amount"] = amount_external_libs
    metricas["avg_lines_code_files"] = _average(amount_lines_per_code_file)
    metricas["total_code_lines"] = sum(amount_lines_per_code_file)
    metricas["avg_lines_config_files"] = _average(amount_lines_per_config_file)
    metricas["max_code_files_per_folder"] = max(code_files_per_folder, default=0)
    metricas["min_code_files_per_folder"] = min(code_files_per_folder, default=0)
    metricas["max_func_name_length"] = max(name_size_per_func, default=0)
    metricas["min_func_name_length"] = min(name_size_per_func, default=0)
    metricas["max_lines_code_files"] = max(amount_lines_per_code_file, default=0)
    metricas["min_lines_code_files"] = min(amount_lines_per_code_file, default=0)
    metricas["max_func_param_amount"] = max(params_per_func, default=0)
    metricas["min_func_param_amount"] = min(params_per_func, default=0)
    metricas["max_lines_config_files"] = max(amount_lines_per_config_file, default=0)
    metricas["min_lines_config_files"] = min(amount_lines_per_config_file, default=0)
    metricas["avg_funcs_per_code_file"] = total_funcs / total_code_files if total_code_files else 0
    metricas["max_funcs_per_code_file"] = max(amount_funcs_per_file, default=0)
    metricas["min_funcs_per_code_file"] = min(amount_funcs_per_file, default=0)
    metricas["avg_comments_per_file"] = _average(amount_comments_per_file)
    metricas["max_comments_per_file"] = max(amount_comments_per_file, default=0)
    metricas["min_comments_per_file"] = min(amount_comments_per_file, default=0)
    metricas["total_comments"] = sum(amount_comments_per_file)
    metricas["total_config_files"] = len(amount_lines_per_config_file)
    metricas["provider"] = provider or None
    metricas["language"] = language
    return metricas
        
def clone_repo(repo_url):
    """Clone a git repository into the local "./repo_teste" directory.

    Args:
        repo_url: Clone URL of the repository. A one-element list or tuple
            holding the URL is accepted as well, since callers may pass the
            URL wrapped in a list.

    Returns:
        The ``git.Repo`` object of the fresh clone.
    """
    # Use the argument instead of reading a global loop variable, so the
    # function works no matter where it is called from.
    if isinstance(repo_url, (list, tuple)):
        repo_url = repo_url[0]
    return Repo.clone_from(repo_url, "./repo_teste")
    
def onerror(func, path, exc_info):
    """
    Error handler for ``shutil.rmtree``.

    If the error is due to an access error (read only file)
    it attempts to add write permission and then retries.

    If the error is for another reason it re-raises the error.

    Args:
        func: The function that failed; it is retried with ``path``.
        path: The path ``func`` was operating on.
        exc_info: ``(type, value, traceback)`` tuple of the failure.

    Usage : ``shutil.rmtree(path, onerror=onerror)``
    """
    # Is the error an access error?
    if not os.access(path, os.W_OK):
        os.chmod(path, stat.S_IWUSR)
        func(path)
    else:
        # Re-raise the reported exception explicitly: a bare ``raise`` only
        # works while an exception is being handled by the caller.
        raise exc_info[1]
    
def clear_repo():
    """Delete the local clone directory "./repo_teste" and all its contents.

    Removal failures are delegated to ``onerror``.
    """
    clone_dir = "./repo_teste"
    shutil.rmtree(clone_dir, onerror=onerror)

## Execução de extração para repositórios selecionados

In [None]:
import pandas as pd
# Starts empty; the extraction loop adds one row (repository info plus metrics) per repository
df_metricas = pd.DataFrame()

In [None]:
# Repositories selected for the study, in the same data format produced by
# the proposed code that extracts repository information from GitHub.
target_repos = [
    {
        "description": "honeyλ - a simple, serverless application designed to create and monitor fake HTTP endpoints (i.e. URL honeytokens) automatically, on top of AWS Lambda and Amazon API Gateway",
        "title": "0x4D31/honeyLambda",
        "url": "https://github.com/0x4D31/honeyLambda",
        "language": "Python",
        "owner_url": "https://github.com/0x4D31",
        "clone_url": "https://github.com/0x4D31/honeyLambda.git",
        "included": True,
    },
    {
        "description": "Serverless email forwarding using AWS Lambda and SES",
        "title": "arithmetric/aws-lambda-ses-forwarder",
        "url": "https://github.com/arithmetric/aws-lambda-ses-forwarder",
        "language": "JavaScript",
        "owner_url": "https://github.com/arithmetric",
        "clone_url": "https://github.com/arithmetric/aws-lambda-ses-forwarder.git",
        "included": True,
    },
    {
        "description": "A sample authentication service implemented with a server-less architecture, using AWS Lambda to host and execute the code and Amazon DynamoDB as persistent storage. This provides a cost-efficient solution that is scalable and highly available and can be used with Amazon Cognito for Developer Authenticated Identities.",
        "title": "danilop/LambdAuth",
        "url": "https://github.com/danilop/LambdAuth",
        "language": "JavaScript",
        "owner_url": "https://github.com/danilop",
        "clone_url": "https://github.com/danilop/LambdAuth.git",
        "included": True,
    },
    {
        "description": "The Web Application reference architecture is a general-purpose, event-driven, web application back-end that uses AWS Lambda, Amazon API Gateway for its business logic. It also uses Amazon DynamoDB as its database and Amazon Cognito for user management. All static content is hosted using AWS Amplify Console.",
        "title": "aws-samples/lambda-refarch-webapp",
        "url": "https://github.com/aws-samples/lambda-refarch-webapp",
        "language": "JavaScript",
        "owner_url": "https://github.com/aws-samples",
        "clone_url": "https://github.com/aws-samples/lambda-refarch-webapp.git",
        "included": True,
    },
    {
        "description": "A solution to dynamically handle images on the fly, utilizing SharpJS",
        "title": "aws-solutions/serverless-image-handler",
        "url": "https://github.com/aws-solutions/serverless-image-handler",
        "language": "JavaScript",
        "owner_url": "https://github.com/aws-solutions",
        "clone_url": "https://github.com/aws-solutions/serverless-image-handler.git",
        "included": True,
    },
    {
        "description": "A Serverless Blog leveraging GraphQL to offer a REST API with only 1 endpoint using Serverless v0.5",
        "title": "serverless/serverless-graphql-blog",
        "url": "https://github.com/serverless/serverless-graphql-blog",
        "language": "JavaScript",
        "owner_url": "https://github.com/serverless",
        "clone_url": "https://github.com/serverless/serverless-graphql-blog.git",
        "included": True,
    },
    {
        "description": "Repository for BLESS, an SSH Certificate Authority that runs as a AWS Lambda function",
        "title": "Netflix/bless",
        "url": "https://github.com/Netflix/bless",
        "language": "Python",
        "owner_url": "https://github.com/Netflix",
        "clone_url": "https://github.com/Netflix/bless.git",
        "included": True,
    },
    {
        "description": "Repo of AWS Lambda and Azure Functions functions that process streams and send data to Datadog",
        "title": "DataDog/datadog-serverless-functions",
        "url": "https://github.com/DataDog/datadog-serverless-functions",
        "language": "Python",
        "owner_url": "https://github.com/DataDog",
        "clone_url": "https://github.com/DataDog/datadog-serverless-functions.git",
        "included": True,
    },
    {
        "description": "Open source application to instantly remediate common security issues through the use of AWS Config",
        "title": "servian/aws-auto-remediate",
        "url": "https://github.com/servian/aws-auto-remediate",
        "language": "Python",
        "owner_url": "https://github.com/servian",
        "clone_url": "https://github.com/servian/aws-auto-remediate.git",
        "included": True,
    },
    {
        "description": "Serverless pypi",
        "title": "khornberg/elasticpypi",
        "url": "https://github.com/khornberg/elasticpypi",
        "language": "Python",
        "owner_url": "https://github.com/khornberg",
        "clone_url": "https://github.com/khornberg/elasticpypi.git",
        "included": True,
    },
    {
        "description": "Automated serverless logging to S3 via SQS.",
        "title": "ellimilial/sqs-s3-logger",
        "url": "https://github.com/ellimilial/sqs-s3-logger",
        "language": "Python",
        "owner_url": "https://github.com/ellimilial",
        "clone_url": "https://github.com/ellimilial/sqs-s3-logger.git",
        "included": True,
    },
    {
        "description": "FastAI PyTorch Serverless API (w/ AWS Lambda)",
        "title": "alecrubin/pytorch-serverless",
        "url": "https://github.com/alecrubin/pytorch-serverless",
        "language": "Python",
        "owner_url": "https://github.com/alecrubin",
        "clone_url": "https://github.com/alecrubin/pytorch-serverless.git",
        "included": True,
    },
    {
        "description": "Serverless App that publishes CodeBuild build logs to a publicly accessible location",
        "title": "jlhood/github-codebuild-logs",
        "url": "https://github.com/jlhood/github-codebuild-logs",
        "language": "Python",
        "owner_url": "https://github.com/jlhood",
        "clone_url": "https://github.com/jlhood/github-codebuild-logs.git",
        "included": True,
    },
]

In [None]:
# Clone each selected repository, compute its metrics and collect one row
# (repository info merged with its metrics) per repository.
novas_entradas = []
for target_repo in target_repos:
    # Pass the URL itself, not a list wrapping it
    clone_repo(target_repo["clone_url"])
    try:
        metricas = get_metrics(target_repo)
    finally:
        # Always remove the clone, otherwise the next clone into the same
        # directory fails after an error.
        clear_repo()
    novas_entradas.append({**target_repo, **metricas})

# DataFrame.append was removed in pandas 2.0; build the new rows in one go
# and concatenate them to whatever df_metricas already holds.
df_novas = pd.DataFrame(novas_entradas)
if df_metricas.empty:
    df_metricas = df_novas
else:
    df_metricas = pd.concat([df_metricas, df_novas], ignore_index=True)

In [None]:
# Round the metric values to one decimal place
df_metricas = df_metricas.round(decimals=1)
# Optional export as semicolon-separated CSV (e.g. to open in Excel), if needed
# df_metricas.to_csv("metricas_repos.csv", sep=";", index=False)
# print(df_metricas.to_csv(sep=";", index=False))
df_metricas