In [1]:
import dotenv
# Load variables from the backend's .env file before anything reads os.environ.
# NOTE(review): presumably this is where GITHUB_API_KEY (read further down) comes from — confirm.
dotenv.load_dotenv("../backend/.env")

True

We need PyGithub for this script

In [None]:
# Install PyGithub (provides the `github` module imported below) for the running kernel.
%pip install PyGithub

In [2]:
import os
from github import Github
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
from tqdm import tqdm
import time
import json
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
import json
from langchain.docstore.document import Document


  from tqdm.autonotebook import tqdm


### Load all repos depending on FastAPI

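Parse every page of the repo's dependents list. The loop makes one request per page and pauses 2 seconds between pages, so for a repo with many dependents this can take far longer than a couple of minutes.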

In [None]:
# You could change the repo to any other repo that you want to use the extension for
repo = "tiangolo/fastapi"
url = 'https://github.com/{}/network/dependents'.format(repo)

# The scraped "owner/name" identifiers are checkpointed here after every page,
# so make sure the target directory exists before the first write.
dependents_path = "data/_meta/dependents.json"
os.makedirs(os.path.dirname(dependents_path), exist_ok=True)

# Give up after this many failures in a row instead of retrying the same URL forever.
max_consecutive_errors = 5
consecutive_errors = 0

next_exists = True
result = []
while next_exists:
    try:
        # Get the dependents of the repo. Use beautiful soup to parse the html
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # treat rate-limit / error responses as failures
        soup = BeautifulSoup(r.content, "html.parser")

        rows = soup.findAll("div", {"class": "Box-row"})
        pagination = soup.find("div", {"class": "paginate-container"})
        if not rows and pagination is None:
            # Neither dependents nor pagination: not the page we expected, so retry.
            raise RuntimeError(f"No dependents markup found at {url}")

        # Collect this page separately so that a failure further down cannot
        # leave a half-processed page (or duplicates on retry) in `result`.
        page_dependents = []
        for t in rows:
            owner = t.find('a', {"data-repository-hovercards-enabled": ""})
            name = t.find('a', {"data-hovercard-type": "repository"})
            if owner is None or name is None:
                # Row without both links; skip it rather than fail the whole page.
                continue
            page_dependents.append("{}/{}".format(owner.text, name.text))

        # Look for the link to the next page; no "Next" link means we are done.
        next_url = None
        if pagination is not None:
            for u in pagination.findAll('a'):
                if u.text == "Next":
                    next_url = u["href"]
    except Exception as exc:
        # Keep `url` unchanged so the same page is retried, but only a bounded
        # number of times. KeyboardInterrupt is deliberately not caught.
        consecutive_errors += 1
        print(f"Error ({consecutive_errors}/{max_consecutive_errors}): {exc!r}")
        if consecutive_errors >= max_consecutive_errors:
            print("Too many consecutive errors, stopping early")
            break
    else:
        consecutive_errors = 0
        result.extend(page_dependents)

        # Proceed to the next page
        next_exists = next_url is not None
        if next_exists:
            url = next_url

        # Save the result to a file
        with open(dependents_path, "w") as f:
            json.dump(result, f, indent=2)

    print(f"Has {len(result)} dependents")

    # Avoid rate limiting
    time.sleep(2)

# Save the result to a file
with open(dependents_path, "w") as f:
    json.dump(result, f, indent=2)

Load the dependents file (so that we can resume from here if we stopped before)

In [3]:
# Read back the checkpoint written by the scraping cell; the file holds a JSON
# list of "owner/name" strings.
with open("data/_meta/dependents.json", "r") as dependents_file:
    repos_dependent_on_library = json.loads(dependents_file.read())

In [4]:
# Sanity check: how many dependents were loaded, and what the entries look like.
dependents_total = len(repos_dependent_on_library)
dependents_preview = repos_dependent_on_library[:2]
print(dependents_total)
print(dependents_preview)

198744
['HAL9KKK/FC2', 'AndriyDykan/HWW14']


### Get top repos containing "fastapi"

We load the repositories containing the word "fastapi", sorted by stars. We actually 
want the top repos that depend on fastapi, but there is no direct way to fetch that 
using the GitHub API. Iterating over all repos directly and sorting them by stars 
is also not feasible due to rate limits. 

This might be an interesting thing to revisit later on. With more patience, multiple API 
keys, or another approach, we could get a more exact representation of the top repos that
use FastAPI. 

In [5]:
# Read the token from the environment and fail early, with a clear message, if it is missing.
github_api_key = os.environ.get('GITHUB_API_KEY')
if github_api_key is None:
    raise AssertionError("You need to set your GITHUB_API_KEY environment variable.")

# PyGithub client used by the search cells below; retry=None is passed through as in the original setup.
g = Github(github_api_key, retry=None)

In [6]:
# Search for repositories using the specified library
query = f'{repo} in:description,readme'
result = g.search_repositories(query, sort="stars", order='desc')
result.totalCount

1000

Print the found repos as a sanity check

In [7]:
# Materialise the first 100 search hits so later cells can iterate over them repeatedly.
found_repos = [repository for repository in result[:100]]
# Show the first two as a quick check.
found_repos[:2]

[Repository(full_name="tiangolo/fastapi"),
 Repository(full_name="tiangolo/full-stack-fastapi-postgresql")]

Convert the repos into a DataFrame and keep only those that are written in Python and that appear in the dependents list loaded above

In [8]:
# One record per search hit. Columns are declared explicitly so the frame has
# the expected schema (and the filters below still work) if there are no hits.
repo_columns = ["id", "stars", "forks", "watchers", "language"]
repos_df = pd.DataFrame(
    [
        {
            "id": r.full_name,
            "stars": r.stargazers_count,
            "forks": r.forks_count,
            "watchers": r.watchers_count,
            "language": r.language,
        }
        for r in found_repos
    ],
    columns=repo_columns,
)

print(f"Has total of {repos_df.shape[0]} repos")

# Build a set once: testing membership against the raw dependents list would
# scan the whole list for every row.
dependents_lookup = set(repos_dependent_on_library)

# Keep Python repos that are known dependents, most-starred first.
repos_df = repos_df[repos_df["language"] == "Python"]
repos_df = repos_df[repos_df["id"].isin(dependents_lookup)]
repos_df = repos_df.sort_values("stars", ascending=False)
repos_df = repos_df.reset_index(drop=True)
print(f"Has total of {repos_df.shape[0]} repos after filtering")

# Optional: uncomment to keep only the ten most-starred repos.
#repos_df = repos_df.head(10)

Has total of 100 repos
Has total of 46 repos after filtering


In [9]:
# Temporarily lift pandas' row/column display limits so the whole table is rendered.
unlimited_display = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*unlimited_display):  # more options can be specified also
    repos_by_stars = repos_df.sort_values(by='stars', ascending=False)
    display(repos_by_stars)

Unnamed: 0,id,stars,forks,watchers,language
0,tiangolo/full-stack-fastapi-postgresql,13712,2478,13712,Python
1,tiangolo/sqlmodel,11503,534,11503,Python
2,tiangolo/uvicorn-gunicorn-fastapi-docker,2407,312,2407,Python
3,ExpDev07/coronavirus-tracker-api,1600,330,1600,Python
4,ajndkr/lanarky,876,62,876,Python
5,amisadmin/fastapi-amis-admin,735,119,735,Python
6,zhanymkanov/fastapi_production_template,640,82,640,Python
7,developmentseed/titiler,613,129,613,Python
8,microsoft/cookiecutter-spacy-fastapi,467,66,467,Python
9,wxy2077/fastapi-mysql-generator,422,128,422,Python


### Download and unzip these repos

In [10]:
# Cut-off used by the download loop below. 19 was picked from the table above
# (repos with more than 100 stars). You might want to change this number if
# you re-run the notebook.
N_DOWNLOADS = 19

def download_and_unzip(url, extract_to):
    """Fetch a zip archive from ``url`` and extract it into ``extract_to``.

    The archive is read fully into memory before extraction.

    Args:
        url: Location of the zip archive, in any form ``urlopen`` accepts.
        extract_to: Directory the archive members are extracted into.

    Raises:
        Whatever ``urlopen`` or ``ZipFile`` raise (for example
        ``urllib.error.URLError`` or ``zipfile.BadZipFile``); nothing is
        caught here.
    """
    # Context managers close the response and the archive even if reading or
    # extracting fails.
    with urlopen(url) as http_response:
        payload = BytesIO(http_response.read())
    with ZipFile(payload) as archive:
        archive.extractall(path=extract_to)

def download_repo(repo_id, extract_to, branches=("master", "main")):
    """Download a GitHub repository's branch archive and extract it, best effort.

    Each branch in ``branches`` is tried in order and the first one that
    downloads and extracts successfully wins. Failures are reported with
    ``print`` rather than raised, so one bad repository does not abort a batch.

    Args:
        repo_id: Repository identifier in ``owner/name`` form.
        extract_to: Directory the archive is extracted into.
        branches: Branch names to try, in order.

    Returns:
        None.
    """
    failures = []
    for branch in branches:
        archive_url = f"https://github.com/{repo_id}/archive/refs/heads/{branch}.zip"
        try:
            download_and_unzip(archive_url, extract_to=extract_to)
            return
        except Exception as exc:
            # Catch Exception rather than using a bare except so that Ctrl-C
            # still interrupts a long batch; remember the cause for the report.
            failures.append(f"{branch}: {exc!r}")
    print(f"Error downloading repo {repo_id} ({'; '.join(failures)})")

os.makedirs("data/raw", exist_ok=True)
download_repo(repo, extract_to="data/raw")  # Download the library itself

# Take exactly the first N_DOWNLOADS rows by position. The previous
# `if i >= N_DOWNLOADS: break` check ran after the download, so it fetched
# N_DOWNLOADS + 1 repos, and it also relied on the index labels being 0..n-1.
repo_ids_to_download = repos_df["id"].head(N_DOWNLOADS)
for repo_id in tqdm(repo_ids_to_download):
    download_repo(repo_id, extract_to="data/raw/")

19it [00:58,  3.09s/it]
