In [None]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [None]:
def get_pages(page):
    """Build the URL of one page of the GitHub topics listing.

    Args:
        page: Page number to interpolate into the ``page`` query parameter.

    Returns:
        str: The listing URL for that page.
    """
    return f"https://github.com/topics?page={page}"


In [None]:
def get_page_content(page_url, timeout=30):
    """Download a page and parse it with BeautifulSoup.

    Args:
        page_url: URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole notebook run.

    Returns:
        BeautifulSoup: The parsed document, or ``None`` when the request
        fails (connection error, timeout, invalid URL or a non-2xx status).
        The failure is reported on stdout.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they are reported below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'An error occurred while requesting the page: {e}')
        return None

    return BeautifulSoup(response.text, 'html.parser')


In [None]:
def get_topic_title(doc):
    """Collect the topic titles found in a parsed topics listing page.

    Args:
        doc: Parsed page exposing ``find_all`` (the other cells pass the
            BeautifulSoup document returned by ``get_page_content``).

    Returns:
        list[str]: Whitespace-stripped text of every matching ``<p>`` tag,
        in document order.

    Raises:
        Exception: If no title tag matches.
    """
    title_selector = {'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'}
    title_tags = doc.find_all('p', title_selector)
    if not title_tags:
        raise Exception('Could not find topic title tags')
    return [tag.text.strip() for tag in title_tags]


def get_topic_desc(doc):
    """Collect the topic descriptions found in a parsed topics listing page.

    Args:
        doc: Parsed page exposing ``find_all`` (the other cells pass the
            BeautifulSoup document returned by ``get_page_content``).

    Returns:
        list[str]: Whitespace-stripped text of every matching ``<p>`` tag,
        in document order.

    Raises:
        Exception: If no description tag matches.
    """
    desc_selector = {'class': 'f5 color-fg-muted mb-0 mt-1'}
    desc_tags = doc.find_all('p', desc_selector)
    if not desc_tags:
        raise Exception('Could not find topic description tags')

    descriptions = [paragraph.text.strip() for paragraph in desc_tags]
    return descriptions


def get_topic_url(doc):
    """Collect absolute topic URLs from a parsed topics listing page.

    Args:
        doc: Parsed page exposing ``find_all`` (the other cells pass the
            BeautifulSoup document returned by ``get_page_content``).

    Returns:
        list[str]: ``https://github.com`` followed by the ``href`` of every
        matching ``<a>`` tag, in document order.

    Raises:
        Exception: If no link tag matches.
    """
    anchors = doc.find_all(
        'a', {'class': 'no-underline flex-1 d-flex flex-column'})
    if not anchors:
        raise Exception('Could not find topic URL tags')

    site_root = 'https://github.com'
    # The href values are concatenated directly onto the site root.
    return [site_root + anchor['href'] for anchor in anchors]


def scrape_topics_info(page_doc):
    """Assemble titles, descriptions and URLs of one topics page into a table.

    Args:
        page_doc: Parsed topics listing page, handed unchanged to
            ``get_topic_title``, ``get_topic_desc`` and ``get_topic_url``.

    Returns:
        pandas.DataFrame: Columns ``topic_title``, ``topic_desc`` and
        ``topic_url``. ``None`` if any extractor raises; the error is
        printed instead of propagated.
    """
    try:
        # Extractors run in column order; the first failure aborts the page.
        extractors = (
            ('topic_title', get_topic_title),
            ('topic_desc', get_topic_desc),
            ('topic_url', get_topic_url),
        )
        columns = {name: extract(page_doc) for name, extract in extractors}
    except Exception as e:
        print(f'An error occurred while extracting topic information: {e}')
        return None

    return pd.DataFrame(columns)


def import_to_csv(dataFrame):
    """Write the topics table to ``./topics/topics.csv`` without the index.

    The target directory is created when missing, so the notebook also works
    on a fresh checkout where ``./topics`` does not exist yet.

    Args:
        dataFrame: pandas DataFrame to persist.

    Returns:
        None. Any failure while creating the directory or writing the file
        is printed instead of raised.
    """
    output_path = './topics/topics.csv'
    try:
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # index=False is the documented way to omit the index column.
        dataFrame.to_csv(output_path, index=False)
    except Exception as e:
        print(f'An error occurred while writing to the CSV file: {e}')


def update_and_filter_data(new_df, max_rows=100):
    """Keep only the first ``max_rows`` topics and write them to the topics CSV.

    Rows are selected by position, so the result does not depend on the
    frame's index labels (dropping labels ``100..len-1`` only works when the
    index is a default 0..n-1 range).

    Args:
        new_df: pandas DataFrame holding the scraped topics.
        max_rows: Maximum number of leading rows to keep. Defaults to 100.

    Returns:
        None. The truncated frame is passed to ``import_to_csv``. If the
        truncation fails, the error is printed and nothing is written.
    """
    try:
        update_df = new_df.iloc[:max_rows]
    except Exception as e:
        print(f'An error occurred while filtering data: {e}')
        return

    import_to_csv(update_df)


In [None]:
def access_topic_page_url(df, topic_num):
    """Look up the URL stored for one topic.

    Args:
        df: DataFrame with a ``topic_url`` column.
        topic_num: Index label of the topic row.

    Returns:
        The ``topic_url`` value for ``topic_num``, or ``None`` (after
        printing a message) when the lookup raises ``KeyError``.
    """
    try:
        topic_url = df['topic_url'][topic_num]
    except KeyError:
        print(
            f"Error: The topic_num '{topic_num}' is not found in the dataframe.")
        return None
    else:
        return topic_url


def parse_star_count(stars_str):
    """Convert a displayed star count such as ``'32.3k'`` into an integer.

    Accepts plain integers (``'987'``), thousands separators (``'1,234'``)
    and the abbreviations ``k`` (thousand) and ``m`` (million) in either
    case. Surrounding whitespace is ignored.

    Args:
        stars_str: Text of the star counter.

    Returns:
        int: The parsed count, or ``None`` (after printing a message) when
        the text is empty or is not a number.
    """
    multipliers = {'k': 1_000, 'm': 1_000_000}
    try:
        stars_str = stars_str.strip()
        digits = stars_str.replace(',', '')
        if not digits:
            # Indexing an empty string would raise IndexError, not ValueError.
            raise ValueError('empty star count')

        factor = multipliers.get(digits[-1].lower())
        if factor is None:
            return int(digits)
        # Round instead of truncating: float('32.3') * 1000 evaluates to
        # 32299.999999999996, which int() would cut down to 32299.
        return round(float(digits[:-1]) * factor)
    except (ValueError, OverflowError):
        print(f"Error: Unable to parse the string '{stars_str}' as a number.")
        return None


def get_repo_info(repo_tag, star_tag):
    """Extract owner, repository name, star count and URL for one repository.

    Args:
        repo_tag: Heading tag of the repository; its first ``<a>`` is read
            as the owner and its second ``<a>`` as the repository (text and
            ``href``).
        star_tag: Tag whose text is the displayed star count.

    Returns:
        tuple: ``(username, repo_name, stars, repo_url)``. All four entries
        are ``None`` when the expected links or attributes are missing; the
        reason is printed. ``stars`` alone is ``None`` when the counter text
        cannot be parsed.
    """
    base_url = 'https://github.com'
    try:
        links = repo_tag.find_all('a')
        owner_link, repo_link = links[0], links[1]
        username = owner_link.text.strip()
        repo_name = repo_link.text.strip()
        stars = parse_star_count(star_tag.text.strip())
        repo_url = base_url + repo_link['href']
        return username, repo_name, stars, repo_url
    except (AttributeError, IndexError, KeyError) as e:
        # KeyError covers a link without an href attribute, which is the
        # same kind of malformed markup as a missing link.
        print(f"Error: {str(e)}")
        return None, None, None, None


def get_topic_repos(topic_doc):
    """Build a table of the repositories listed on a parsed topic page.

    Repository headings and star counters are located independently and
    paired by position, so the page must contain at least one counter per
    heading.

    Args:
        topic_doc: Parsed topic page exposing ``find_all``.

    Returns:
        pandas.DataFrame: Columns ``username``, ``repo_name``, ``stars`` and
        ``repo_url``, one row per repository heading. ``None`` (after
        printing a message) when ``topic_doc`` cannot be searched or when
        there are fewer star counters than repository headings.
    """
    try:
        repo_tag = topic_doc.find_all(
            'h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
        star_tag = topic_doc.find_all(
            'span', {'class': 'Counter js-social-count'})
    except AttributeError as e:
        print(f"Error: {str(e)}")
        return None

    if len(star_tag) < len(repo_tag):
        # Indexing past the counters used to raise IndexError mid-loop.
        # Report it the way the other failures in this function are reported.
        print(
            f"Error: found {len(repo_tag)} repositories but only "
            f"{len(star_tag)} star counters.")
        return None

    topic_repos_dict = {
        'username': [],
        'repo_name': [],
        'stars': [],
        'repo_url': []
    }

    # zip stops at the last repository heading; surplus counters are ignored.
    for repo, star in zip(repo_tag, star_tag):
        username, repo_name, stars, repo_url = get_repo_info(repo, star)
        topic_repos_dict['username'].append(username)
        topic_repos_dict['repo_name'].append(repo_name)
        topic_repos_dict['stars'].append(stars)
        topic_repos_dict['repo_url'].append(repo_url)
    return pd.DataFrame(topic_repos_dict)


def import_topic_repos_to_csv(df, topic):
    """Write one topic's repository table to ``./topics/repos/<topic>.csv``.

    The target directory is created when missing, so the first write on a
    fresh checkout does not fail.

    Args:
        df: pandas DataFrame with the topic's repositories.
        topic: Topic title, used verbatim as the file name stem.

    Returns:
        None. Any failure while creating the directory or writing the file
        is printed instead of raised.
    """
    output_dir = './topics/repos'
    try:
        os.makedirs(output_dir, exist_ok=True)
        # index=False is the documented way to omit the index column.
        df.to_csv(f'{output_dir}/{topic}.csv', index=False)
    except Exception as e:
        print(f"Error: Unable to write to file. {str(e)}")


In [None]:
# Scrape the paginated topics listing until enough topics are collected,
# then persist the first 100 of them through update_and_filter_data.
MAX_TOPIC_PAGES = 5        # listing pages 1..5 are visited at most
TARGET_TOPIC_COUNT = 100   # stop requesting pages once this many rows exist

page_frames = []
collected_rows = 0
for page in range(1, MAX_TOPIC_PAGES + 1):
    if collected_rows >= TARGET_TOPIC_COUNT:
        break
    try:
        url = get_pages(page)
        page_doc = get_page_content(url)
        df = scrape_topics_info(page_doc)
    except Exception as e:
        print(f"Error in page {page}: {e}")
        continue
    if df is None:
        # The helper already printed why this page failed; skip it.
        continue
    page_frames.append(df)
    collected_rows += len(df)

# DataFrame.append was removed in pandas 2.0. Concatenate once instead;
# ignore_index gives the combined frame a clean 0..n-1 index.
if page_frames:
    temp_df = pd.concat(page_frames, ignore_index=True)
else:
    temp_df = pd.DataFrame()

try:
    if temp_df.empty:
        print("No topics were scraped; nothing was written to the CSV file.")
    else:
        # Truncates to the first 100 topics and writes ./topics/topics.csv.
        update_and_filter_data(temp_df)
except Exception as e:
    print(f"Error importing to CSV: {e}")


In [None]:
# For every topic saved in ./topics/topics.csv, download its page and write
# the repositories listed there to ./topics/repos/<topic title>.csv.
try:
    new_df = pd.read_csv('./topics/topics.csv')
except Exception as e:
    print(f"Error reading from CSV: {e}")
else:
    for page in range(len(new_df)):
        try:
            # Each helper prints its own error and returns None on failure.
            # Skip the topic at the first failed stage so the None is not
            # passed on to produce a second, misleading error further down.
            url = access_topic_page_url(new_df, page)
            if url is None:
                continue
            doc = get_page_content(url)
            if doc is None:
                continue
            df = get_topic_repos(doc)
            if df is None:
                continue
            import_topic_repos_to_csv(df, new_df['topic_title'][page])
        except Exception as e:
            print(f"Error in page {page}: {e}")
            continue
