In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# get repo name
def get_repo_name(doc):
    """Return the repository names found in a parsed listing page.

    Args:
        doc: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        A list with the whitespace-stripped text of every ``<a>`` element
        whose ``itemprop`` attribute is ``"name codeRepository"``, in
        document order.
    """
    name_links = doc.find_all('a', {'itemprop': 'name codeRepository'})
    return [link.text.strip() for link in name_links]

# get stars
def get_stars(doc):
    """Return the star count shown for each repository row.

    Args:
        doc: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        One entry per ``<div class="f6 color-fg-muted mt-2">`` element, in
        document order: the stripped text of the first
        ``<a class="Link--muted mr-3">`` inside that div, or the integer ``0``
        when the div contains no such link. Counts are returned as the page
        shows them (strings), so the list can mix ``str`` and ``int``.
    """
    stars = []
    for meta_div in doc.find_all('div', {'class': 'f6 color-fg-muted mt-2'}):
        # Search the tag directly; serializing it and re-parsing the markup
        # for every row yields the same links at the cost of an extra parse.
        links = meta_div.find_all('a', {'class': 'Link--muted mr-3'})
        # NOTE(review): selection is positional. If a row carries only one
        # matching link and that link is the fork count, it would be reported
        # here as stars -- confirm against the page markup.
        stars.append(links[0].text.strip() if links else 0)
    return stars

# get forks
def get_forks(doc):
    """Return the fork count shown for each repository row.

    Args:
        doc: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        One entry per ``<div class="f6 color-fg-muted mt-2">`` element, in
        document order: the stripped text of the second
        ``<a class="Link--muted mr-3">`` inside that div, or the integer ``0``
        when the div contains fewer than two such links. Counts are returned
        as the page shows them (strings), so the list can mix ``str`` and
        ``int``.
    """
    forks = []
    for meta_div in doc.find_all('div', {'class': 'f6 color-fg-muted mt-2'}):
        # Search the tag directly; serializing it and re-parsing the markup
        # for every row yields the same links at the cost of an extra parse.
        links = meta_div.find_all('a', {'class': 'Link--muted mr-3'})
        # NOTE(review): selection is positional. A row whose only matching
        # link is the fork count is reported as 0 forks -- confirm against
        # the page markup.
        forks.append(links[1].text.strip() if len(links) > 1 else 0)
    return forks

# get repo url
def get_repo_url(doc, base_url):
    """Build a URL for every repository listed in a parsed page.

    Args:
        doc: Parsed page exposing ``find_all`` (a BeautifulSoup document).
        base_url: Prefix that each repository name is appended to; no
            separator is inserted, so it should already end with ``/``.

    Returns:
        A list of ``base_url + name`` strings, one per name returned by
        ``get_repo_name(doc)``, in the same order.
    """
    # get_repo_name already performs the lookup; a second find_all just to
    # obtain a matching length is unnecessary.
    return [base_url + name for name in get_repo_name(doc)]


def scrape_github_id(repo_url, timeout=30):
    """Scrape one repository-listing page into a DataFrame.

    Args:
        repo_url: URL of the listing page. Everything before the first ``?``
            is used as the prefix for the per-repository URLs; a URL without
            a query string is used whole.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so an unresponsive host cannot hang the notebook indefinitely.

    Returns:
        A DataFrame with the columns ``Repository Name``, ``Stars``, ``Forks``
        and ``URL``, or ``None`` (after printing the status code) when the
        response status is not 200.

    Raises:
        ValueError: Propagated from pandas when the four scraped columns do
            not all have the same length.
    """
    # str.partition never fails when '?' is absent, unlike a manual index
    # scan, and rstrip avoids a doubled slash for ".../user/?tab=...".
    profile_url = repo_url.partition('?')[0]
    base_url = profile_url.rstrip('/') + '/'

    response = requests.get(repo_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    doc = BeautifulSoup(response.text, 'html.parser')
    repos = {
        'Repository Name': get_repo_name(doc),
        'Stars': get_stars(doc),
        'Forks': get_forks(doc),
        'URL': get_repo_url(doc, base_url),
    }
    return pd.DataFrame(repos)

def Mega_scrape(repo_url, per_page=30, output_path='repository_info.csv'):
    """Scrape every repository-listing page of a profile and save it as CSV.

    The first ``<span class="Counter">`` on the page at ``repo_url`` is read
    as the total number of repositories (NOTE(review): assumes that counter
    is the repository count -- confirm against the page markup). Each listing
    page is then fetched with ``scrape_github_id`` and the combined table is
    written to ``output_path``.

    Args:
        repo_url: URL of the profile's repository listing. An optional query
            string is preserved and a ``page=N`` parameter is placed in front
            of it.
        per_page: Number of repositories per listing page; must be positive.
        output_path: Destination passed to ``DataFrame.to_csv``.

    Returns:
        None. On a non-200 response, a missing counter, or when no page could
        be scraped, a message is printed and nothing is written.

    Raises:
        ValueError: If the counter text is not an integer after removing
            surrounding whitespace and thousands separators.
    """
    # partition handles URLs with no '?' and keeps the whole query intact, so
    # the page parameter can always be joined with an explicit '&'.
    base, _, query = repo_url.partition('?')

    # A timeout keeps an unresponsive host from hanging the notebook.
    response = requests.get(repo_url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    doc = BeautifulSoup(response.text, 'html.parser')
    counters = doc.find_all('span', {'class': 'Counter'})
    if not counters:
        print('Could not find a repository counter on the page.')
        return None

    total_repos = int(counters[0].text.strip().replace(',', ''))
    # Ceiling division: an exact multiple of per_page must not add an empty
    # trailing page. At least one page is always scraped.
    page_count = max(1, -(-total_repos // per_page))

    dfs = []
    for page in range(1, page_count + 1):
        print('Scraping {} Repository Page'.format(page))
        page_url = '{}?page={}'.format(base, page)
        if query:
            page_url += '&' + query
        df = scrape_github_id(page_url)
        # scrape_github_id returns None for a failed page; skip it instead of
        # handing None to pd.concat.
        if df is not None:
            dfs.append(df)

    if not dfs:
        print('No repository pages could be scraped.')
        return None

    final_df = pd.concat(dfs, ignore_index=True)

    final_df.to_csv(output_path)

In [7]:
# Listing URL to scrape: a GitHub profile with the "repositories" tab selected.
repo_url = 'https://github.com/john-smilga?tab=repositories'

In [8]:
# Scrape every listing page for repo_url; the combined table is saved to repository_info.csv.
Mega_scrape(repo_url)

Scraping 1 Repository Page
Scraping 2 Repository Page
Scraping 3 Repository Page
Scraping 4 Repository Page
Scraping 5 Repository Page
Scraping 6 Repository Page
Scraping 7 Repository Page
Scraping 8 Repository Page
Scraping 9 Repository Page
