# Github Topic Scraping  

Scraping all featured topics from  <a href='https://github.com/topics/'> Github topics</a>.  

In [1]:
# importing all necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd


In [2]:
#base url for featured topic url 
base_url = 'https://github.com/'

In [3]:
def get_topic_page(topic_url):
    # Downlaod individual topic page.
    response = requests.get(topic_url)

    # check sccessful resposne 
    if response.status_code != 200:
        raise Exception(f" Failed to load page{topic_url}")
    # parse the page
    topic_doc =  BeautifulSoup(response.text, 'html.parser')
    return topic_doc

    # extracting username, repository name, stars and repository url
def get_repo_info(h3_tags, star_tag):
    a_tags = h3_tags.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    repo_url = base_url+a_tags[1]['href']
    stars = parse_star_count(star_tag.text)
    return username, repo_name,stars, repo_url
# parsing star counts
def parse_star_count(star_str):
    star_str = star_str.strip()
    if star_str==None:
        return 0
    elif star_str[-1]=='k':
        # star_str = star_str[:-1] gives last item from string
        return int(float(star_str[:-1])*1000)
    else:
        return int(star_str)



def get_topic_repos(topic_doc):
    # get h3 tags containg repo title , url and repo name
    h3_tags = topic_doc.find_all('h3', {'class':'f3 color-fg-muted text-normal lh-condensed'})
    # getting repo stars
    repo_star = topic_doc.find_all('span', {'class':'Counter js-social-count'})
    # empty dictionary 
    topic_repos_dict = {
    'username':[],
    'repo_name':[],
    'stars':[],
    'repo_url':[]
    }
    # looping through get_repo_info() function to get username,repo_name, star and repo_url
    for i in range(len(h3_tags)):
        repo_info = get_repo_info(h3_tags[i],repo_star[i])
        topic_repos_dict['username'].append(repo_info[0])
        topic_repos_dict['repo_name'].append(repo_info[1])
        topic_repos_dict['stars'].append(repo_info[2])
        topic_repos_dict['repo_url'].append(repo_info[3])
        # return data as pandas dataframe.
    return  pd.DataFrame(topic_repos_dict)
        

Above function only gives four fields of github.
1. Username 
2. repository name 
3. repostory stars 
4. repository urls

In [4]:
# returning dataframe containing username, reponame, repourl and stars
get_topic_repos(get_topic_page('https://github.com//topics/3d'))

Unnamed: 0,username,repo_name,stars,repo_url
0,mrdoob,three.js,83800,https://github.com//mrdoob/three.js
1,libgdx,libgdx,20200,https://github.com//libgdx/libgdx
2,pmndrs,react-three-fiber,18800,https://github.com//pmndrs/react-three-fiber
3,BabylonJS,Babylon.js,17900,https://github.com//BabylonJS/Babylon.js
4,aframevr,aframe,14400,https://github.com//aframevr/aframe
5,ssloy,tinyrenderer,14200,https://github.com//ssloy/tinyrenderer
6,lettier,3d-game-shaders-for-beginners,13400,https://github.com//lettier/3d-game-shaders-fo...
7,FreeCAD,FreeCAD,11800,https://github.com//FreeCAD/FreeCAD
8,metafizzy,zdog,9200,https://github.com//metafizzy/zdog
9,CesiumGS,cesium,9000,https://github.com//CesiumGS/cesium


So, scraping more along with above fields: 
1. Topic name
2. Topic url name 

The following function are used.

In [5]:
# geting topic_url from parsed page content. soup= parse page content, i= position of link tag.
def topic_urls(soup,i):
    base_url = 'https://github.com/'
    topic_link_tags = soup.find_all('a',{'class':'no-underline flex-1 d-flex flex-column'})
    topic_url= base_url+topic_link_tags[i]['href']
    return topic_url


In [6]:

#getting topic_name eg. 3D, Ajax, asp.net
def get_topic_name(soup,i):
    selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    topic_title_tags = soup.find_all('p', {'class':selection_class })
    return topic_title_tags[i].text


In [7]:
# getting topic page whcih contains all the topics
response = requests.get('https://github.com/topics')
# parsing the page
soup = BeautifulSoup(response.text, 'html.parser')


In [8]:
# returning topic name
j = 1 # postion of topic
get_topic_name(soup, j)


'Ajax'

In [9]:
# returning topic url 
topic_urls(soup,j)

'https://github.com//topics/ajax'

In [10]:
# this block of code returns all the data 
# 1. featured topic
# 2. topic url
# 3. username of repo 
# 4. stars that got to respective repo 
# 5. repo url
def all_data(topic_doc, soup,j):
    all_data = {
        'topic':[],
        'topic_url':[],
        'username':[],
        'repo_name':[],
        'stars':[],
        'repo_url':[]
    }
    # get h3 tags containg repo title , url and repo name
    h3_tags = topic_doc.find_all('h3', {'class':'f3 color-fg-muted text-normal lh-condensed'})
    # getting repo stars
    repo_star = topic_doc.find_all('span', {'class':'Counter js-social-count'})

    for i in range(len(h3_tags)):
        repo_info = get_repo_info(h3_tags[i],repo_star[i])
        all_data['topic'].append(get_topic_name(soup,j))
        all_data['topic_url'].append(topic_urls(soup,j))
        all_data['username'].append(repo_info[0])
        all_data['repo_name'].append(repo_info[1])
        all_data['stars'].append(repo_info[2])
        all_data['repo_url'].append(repo_info[3])
    return all_data


In [15]:
github_data =[]
# n could be total length of scraped topics. 29 topics are scraped.  
n = 15 
# looping through all_data() function and converting to dataframe and appending returned data to empty list 
for j in range(n):
    github_data.append(pd.DataFrame(all_data(get_topic_page(topic_urls(soup,j)), soup,j)))

In [16]:
# combining multiple dataframe with ignoring index.

result = pd.concat(github_data, ignore_index=True)

In [17]:
# converting data into csv.
result.to_csv('topic_repo_user_star.csv')

In [19]:
result.shape

(450, 6)

In [21]:
result[result['topic']=='3D']

Unnamed: 0,topic,topic_url,username,repo_name,stars,repo_url
0,3D,https://github.com//topics/3d,mrdoob,three.js,83800,https://github.com//mrdoob/three.js
1,3D,https://github.com//topics/3d,libgdx,libgdx,20200,https://github.com//libgdx/libgdx
2,3D,https://github.com//topics/3d,pmndrs,react-three-fiber,18800,https://github.com//pmndrs/react-three-fiber
3,3D,https://github.com//topics/3d,BabylonJS,Babylon.js,17900,https://github.com//BabylonJS/Babylon.js
4,3D,https://github.com//topics/3d,aframevr,aframe,14400,https://github.com//aframevr/aframe
5,3D,https://github.com//topics/3d,ssloy,tinyrenderer,14200,https://github.com//ssloy/tinyrenderer
6,3D,https://github.com//topics/3d,lettier,3d-game-shaders-for-beginners,13400,https://github.com//lettier/3d-game-shaders-fo...
7,3D,https://github.com//topics/3d,FreeCAD,FreeCAD,11800,https://github.com//FreeCAD/FreeCAD
8,3D,https://github.com//topics/3d,metafizzy,zdog,9200,https://github.com//metafizzy/zdog
9,3D,https://github.com//topics/3d,CesiumGS,cesium,9000,https://github.com//CesiumGS/cesium
