# 'Topics on GitHub' Scraper

<img src = 'https://i.imgur.com/Uig9ymG.png'>

# Importing libraries

In [1]:
import os
import time

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service as ChromiumService
from selenium.webdriver.common.by import By
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType

# Generating all featured topics

In [2]:
def call_webdriver():
    """Create and return a new Chrome WebDriver instance.

    The value returned by ``ChromeDriverManager(...).install()`` (requested
    for the Chromium flavour) is handed to a ``Service`` object, which in
    turn is passed to ``webdriver.Chrome``.
    """
    manager = ChromeDriverManager(chrome_type=ChromeType.CHROMIUM)
    chromium_service = ChromiumService(manager.install())
    return webdriver.Chrome(service=chromium_service)

In [3]:
# https://pythonbasics.org/selenium-scroll-down/

def scroll_to_bottom(web_driver=None):
    """Scroll the browser window to the bottom of the current page.

    Parameters
    ----------
    web_driver : optional
        Any object exposing ``execute_script(script)``. When omitted, the
        module-level ``driver`` variable is used, so existing zero-argument
        calls keep working unchanged.

    Raises
    ------
    NameError
        If ``web_driver`` is omitted and no module-level ``driver`` exists.
    """
    # Fall back to the notebook-global browser only when none is passed in,
    # so the helper no longer *requires* hidden kernel state.
    target = driver if web_driver is None else web_driver
    target.execute_script("window.scrollTo(0, document.body.scrollHeight)")

In [4]:
# Root of every URL built in this notebook.
BASE_URL = 'https://github.com'

# Crawl the topics page only when no cached 'data/topics.csv' exists; the
# rendered HTML ends up in `soup` for the parsing cell that follows.
if not os.path.isfile('data/topics.csv'):

    print('Opening selenium crawler')

    # Must stay a module-level name called `driver`: scroll_to_bottom() reads it.
    driver = call_webdriver()
    try:
        driver.get(BASE_URL + '/topics')
        driver.maximize_window()
        scroll_to_bottom()

        # Keep clicking the page's submit button (presumably "Load more" --
        # verify against the live page) until selenium can no longer find or
        # click it; that failure is the loop's normal exit condition.
        try:
            while True:
                time.sleep(2)
                driver.find_element(By.XPATH, "//button[@type = 'submit']").click()
                time.sleep(3)
                scroll_to_bottom()
        except WebDriverException:
            # Only selenium errors end the loop quietly; KeyboardInterrupt and
            # programming errors now propagate instead of being swallowed.
            pass

        soup = bs(driver.page_source, 'lxml')
    finally:
        # Always close the browser, even if navigation or parsing failed.
        driver.quit()
else:
    print("'topics.csv' already exisits, not using crawler")

'topics.csv' already exisits, not using crawler


In [5]:
# Build `topic_df` (columns: title, link, descrip) either by parsing the
# `soup` produced by the crawler cell or by loading the cached CSV.
if not os.path.isfile('data/topics.csv'):

    print("Data file doesn't exists, scrapping data")
    all_topics = soup.find_all('div', {'class' : 'py-4 border-bottom d-flex flex-justify-between'})

    topic_records = []
    for topic in all_topics:
        topic_box = topic.find('a', {'class' : 'no-underline flex-1 d-flex flex-column'})
        if topic_box is None:
            # Container without the expected anchor: skip it rather than
            # abort the whole scrape on a single malformed entry.
            continue

        paragraphs = topic_box.select('p')
        if not paragraphs:
            continue

        topic_records.append({
            'title' : paragraphs[0].text.strip(),
            'link' : BASE_URL + topic_box['href'],
            # The second <p> holds the description; tolerate its absence.
            'descrip' : paragraphs[1].text.strip() if len(paragraphs) > 1 else '',
        })

    if not topic_records:
        # Do not cache an empty result: an empty CSV would make every later
        # run believe the data had already been collected.
        raise RuntimeError('No topics were parsed from the page; the CSS selectors may be outdated.')

    topic_df = pd.DataFrame(topic_records, columns = ['title', 'link', 'descrip'])

    # to_csv does not create missing directories.
    os.makedirs('data', exist_ok = True)
    topic_df.to_csv('data/topics.csv', index = False)
else:
    print("Loading saved 'topics.csv'")
    topic_df = pd.read_csv('data/topics.csv')

print(f'Number of topic titles collected : {len(topic_df)}\n')

topic_df.head()

Loading saved 'topics.csv'
Number of topic titles collected : 180



Unnamed: 0,title,link,descrip
0,3D,https://github.com/topics/3d,3D modeling is the process of virtually develo...
1,Ajax,https://github.com/topics/ajax,Ajax is a technique for creating interactive w...
2,Algorithm,https://github.com/topics/algorithm,Algorithms are self-contained sequences that c...
3,Amp,https://github.com/topics/amphp,Amp is a non-blocking concurrency library for ...
4,Android,https://github.com/topics/android,Android is an operating system built by Google...


# Generating repository details

In [None]:
# Build `topic_repo_df` with one row per repository listed on each topic page,
# either by crawling every link in `topic_df` or by loading the cached CSV.
if not os.path.isfile('data/repo_details.csv'):

    # NOTE(review): 'start_count' looks like a typo for 'star_count'; it is
    # kept as-is so the CSV schema stays compatible with existing files and
    # any downstream readers.
    repo_columns = ['topic', 'user_name', 'repo_name', 'repo_link', 'start_count']

    repo_rows = []
    # Topic links whose page could not be expanded the full five times.
    # The name suggests 120 repos were expected per topic -- not verifiable here.
    not_120 = []

    # Must stay a module-level name called `driver`: scroll_to_bottom() reads it.
    driver = call_webdriver()
    try:
        driver.maximize_window()

        for t_link in tqdm(topic_df.link):
            driver.get(t_link)
            try:
                # Try to expand the repository list five times.
                for _ in range(5):
                    scroll_to_bottom()
                    time.sleep(2)
                    driver.find_element(By.XPATH, "//button[@type = 'submit']").click()
                    time.sleep(2)
                    scroll_to_bottom()
                    time.sleep(2)
            except WebDriverException:
                # Button missing / not clickable: keep whatever was loaded.
                not_120.append(t_link)

            soup = bs(driver.page_source, 'lxml')
            all_div_box = soup.find_all('div', {'class' : 'd-flex flex-justify-between flex-items-start flex-wrap gap-2 my-3'})

            # Same for every repository on this page, so compute it once.
            topic = t_link.split("/")[-1]

            for div_box in all_div_box:
                anchors = div_box.select('h3 a')
                star_tag = div_box.find('span', class_ = 'Counter js-social-count')
                if len(anchors) < 2 or star_tag is None:
                    # Box without the expected owner/repo anchors or star
                    # counter: skip it instead of crashing the whole crawl.
                    continue

                user_anchor, repo_anchor = anchors[0], anchors[1]
                repo_rows.append([
                    topic,
                    user_anchor.text.strip(),
                    repo_anchor.text.strip(),
                    BASE_URL + repo_anchor['href'].strip(),
                    star_tag.text.strip(),
                ])
    finally:
        # Always close the browser, even if the crawl is interrupted.
        driver.quit()

    if not repo_rows:
        # Do not cache an empty result: an empty CSV would make every later
        # run believe the data had already been collected.
        raise RuntimeError('No repositories were parsed; the CSS selectors may be outdated.')

    # Build the frame once instead of appending row by row with .loc.
    topic_repo_df = pd.DataFrame(repo_rows, columns = repo_columns)

    # to_csv does not create missing directories.
    os.makedirs('data', exist_ok = True)
    topic_repo_df.to_csv('data/repo_details.csv', index = False)

else:
    print("Loading saved 'repo_details.csv'")
    topic_repo_df = pd.read_csv('data/repo_details.csv')

print(f'Number of repo details collected : {len(topic_repo_df)}\n')

topic_repo_df.head()