In [1]:
import os
from github import Github
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
from tqdm import tqdm
import time
import json
from multiprocessing.pool import ThreadPool
from github.GithubException import RateLimitExceededException
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
import datetime
import json
import pinecone    
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone

  from tqdm.autonotebook import tqdm


### Load all repos depending on FastAPI

In [None]:

repo = "tiangolo/fastapi"
url = 'https://github.com/{}/network/dependents'.format(repo)
nextExists = True
result = []
while nextExists:
    try: 
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")

        result = result + [
            "{}/{}".format(
                t.find('a', {"data-repository-hovercards-enabled":""}).text,
                t.find('a', {"data-hovercard-type":"repository"}).text
            )
            for t in soup.findAll("div", {"class": "Box-row"})
        ]
        nextExists = False
        for u in soup.find("div", {"class":"paginate-container"}).findAll('a'):
            if u.text == "Next":
                nextExists = True
                url = u["href"]

        with open("data/.meta/dependents.json", "w") as f:
            json.dump(result, f, indent=2)
    except:
        print("Error")
        nextExists = True

    print(f"Has {len(result)} dependents")

    time.sleep(2)

with open("data/.meta/dependents.json", "w") as f:
    json.dump(result, f, indent=2)

In [None]:
with open("data/.meta/dependents.json", "w") as f:
    json.dump(result, f, indent=2)

In [2]:
with open("data/_meta/dependents.json", "r") as f:
    repos_dependent_on_library = json.load(f)

In [3]:
print(len(repos_dependent_on_library))
print(repos_dependent_on_library[:2])

198744
['HAL9KKK/FC2', 'AndriyDykan/HWW14']


### Get top repos containing "fastapi"

In [9]:
github_api_key = os.environ.get('GITHUB_API_KEY')
assert github_api_key is not None, "You need to set your GITHUB_API_KEY environment variable."
g = Github(github_api_key, retry=None)

In [18]:
library_name = "tiangolo/fastapi"

# Search for repositories using the specified library
query = f'{library_name} in:readme,description'
result = g.search_repositories(query, sort="stars", order='desc')
result.totalCount

1000

In [19]:
found_repos = list(result[:1000])
found_repos[:2]

[Repository(full_name="tiangolo/fastapi"),
 Repository(full_name="tiangolo/full-stack-fastapi-postgresql")]

In [20]:
repos_df = pd.DataFrame([
    {
        "id": r.full_name,
        "stars": r.stargazers_count,
        "forks": r.forks_count,
        "watchers": r.watchers_count,
        "language": r.language,
    }
    for r in found_repos
])

print(f"Has total of {repos_df.shape[0]} repos")

repos_df = repos_df[repos_df.language == "Python"]
repos_df = repos_df[repos_df.id.apply(lambda x: x in repos_dependent_on_library)]
repos_df = repos_df.sort_values("stars", ascending=False)
repos_df = repos_df.reset_index(drop=True)
print(f"Has total of {repos_df.shape[0]} repos after filtering")

#repos_df = repos_df.head(10)

Has total of 1000 repos
Has total of 508 repos after filtering


In [21]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(repos_df.sort_values(by=['stars'], ascending=False))

Unnamed: 0,id,stars,forks,watchers,language
0,tiangolo/full-stack-fastapi-postgresql,13648,2477,13648,Python
1,tiangolo/sqlmodel,11423,531,11423,Python
2,tiangolo/uvicorn-gunicorn-fastapi-docker,2401,312,2401,Python
3,ExpDev07/coronavirus-tracker-api,1599,329,1599,Python
4,ajndkr/lanarky,872,62,872,Python
5,amisadmin/fastapi-amis-admin,732,118,732,Python
6,zhanymkanov/fastapi_production_template,636,80,636,Python
7,developmentseed/titiler,610,129,610,Python
8,microsoft/cookiecutter-spacy-fastapi,466,66,466,Python
9,wxy2077/fastapi-mysql-generator,422,128,422,Python


### Download and unzip these repos

In [22]:
N_DOWNLOADS = 100

def download_and_unzip(url, extract_to):
    http_response = urlopen(url)
    zipfile = ZipFile(BytesIO(http_response.read()))
    zipfile.extractall(path=extract_to)

def download_repo(repo_id, extract_to):
    try: 
        # Automatically downloads main if master doesn't exist
        download_and_unzip(f"https://github.com/{repo_id}/archive/refs/heads/master.zip", extract_to=extract_to)
    except:
        print(f"Error downloading repo {repo_id}")

os.makedirs("data", exist_ok=True)
download_repo("tiangolo/fastapi", extract_to="data")
for i, row in tqdm(repos_df.iterrows()): 
    download_repo(row['id'], extract_to="data")

160it [03:33,  1.30it/s]

Error downloading repo MathisNcl/basketball_trainer


165it [03:45,  1.90s/it]

### Ingest the files

In [2]:
all_files = []

for root, dirs, files in os.walk("data", topdown=True):
   if root in ['data/_meta', 'data']:
      continue

   files = [os.path.join(root, f) for f in files if f.endswith('.py') or f.endswith('.md')]
   all_files += files

len(all_files), all_files[:5]

(22187,
 ['data/docker-celery-flower-master/README.md',
  'data/docker-celery-flower-master/package/README.md',
  'data/docker-celery-flower-master/package/setup.py',
  'data/docker-celery-flower-master/package/myapp/version.py',
  'data/docker-celery-flower-master/package/myapp/__init__.py'])

In [3]:
texts = []
for file in all_files: 
    with open(file) as f:
        try:  
            file_content = f.read()
            if file_content:
                texts.append(Document(page_content=file_content, metadata={"filename": file}))
        except Exception as e:
            print(f"Error reading file {file}: {e}")

Error reading file data/ml_services-main/README.md: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Error reading file data/Topic_and_user_profile_analysis_system-master/code/front_end/node_modules/@hapi/address/README.md: 'utf-8' codec can't decode byte 0x96 in position 522: invalid start byte


In [5]:
print(f"Has {len(texts)} documents")

Has 21127 documents


In [4]:
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=800, chunk_overlap=100
)
chunks = text_splitter.split_documents(texts)

approx_tokens = sum([len(t.page_content) for t in chunks])

print(f"Embedding cost: {approx_tokens / 1000 * 0.0001}$")
print(f"Number of chunks: {len(chunks)}")

Embedding cost: 5.4912068000000005$
Number of chunks: 103192


In [53]:
pinecone_api_key = os.environ.get("PINECONE_API_KEY", None)
assert pinecone_api_key is not None, "Needs PINECONE_API_KEY"
pinecone.init(      
	api_key=pinecone_api_key,      
	environment='gcp-starter'      
)      
index = pinecone.Index('codemate')

embeddings = OpenAIEmbeddings()

index_name = "codemate"
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, metric="cosine", dimension=1536)
    
#index.delete(delete_all=True)

docsearch = Pinecone.from_documents(chunks, embeddings, index_name=index_name)