In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import sqlite3
from sqlalchemy import create_engine
import numpy as np

In [2]:
# GitHub Trending landing page; it is downloaded below and its language menu drives the per-language scrape
url = "https://github.com/trending"

In [3]:
# Download the trending landing page.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails fast instead of silently parsing an HTTP error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

In [4]:
def get_data(data):
    """Extract one row per repository card from a parsed GitHub trending page.

    Parameters
    ----------
    data : tag-like object (e.g. ``bs4.BeautifulSoup``)
        Parsed page exposing ``find_all``; every ``<article class="Box-row">``
        inside it is treated as one repository card.

    Returns
    -------
    list of list
        One ``[author, repo, description, programming_language, star, fork,
        users, stars_today]`` row per card. ``description`` and
        ``programming_language`` are ``None`` when the card does not show them.
        ``star``, ``fork`` and ``stars_today`` are the raw strings from the page
        (thousands separators included) and fall back to ``"0"`` when the
        element is missing, so later integer parsing keeps working. ``users``
        is a comma-separated string of contributor logins (empty when the card
        lists none). Cards without a title link are skipped.
    """
    df_data = []

    for article in data.find_all("article", {"class": "Box-row"}):
        heading = article.find("h2", {"class": "h3"})
        link = heading.a if heading is not None else None
        if link is None:
            # Without the title link there is no author/repo to record.
            continue

        # The owner span reads "author /"; drop the trailing slash and padding.
        owner_span = link.span
        author = owner_span.text.strip().rstrip("/").strip() if owner_span is not None else ""
        # The link text reads "author / repo"; the last whitespace-separated
        # token is the repository name regardless of newlines or indentation.
        name_tokens = link.text.split()
        repo = name_tokens[-1] if name_tokens else ""

        description = article.find("p", {"class": "col-9 color-fg-muted my-1 pr-4"})
        if description is not None:
            description = description.text.strip()

        programming_language = article.find("span", {"itemprop": "programmingLanguage"})
        if programming_language is not None:
            programming_language = programming_language.text.strip()

        # First matching link is the star count, second is the fork count.
        counters = article.find_all("a", {"class": "Link Link--muted d-inline-block mr-3"})
        star = counters[0].text.strip() if len(counters) > 0 else "0"
        fork = counters[1].text.strip() if len(counters) > 1 else "0"

        # Contributor avatars link to "/<login>"; keep only the login part.
        built_by = article.find("span", {"class": "d-inline-block mr-3"})
        contributor_links = built_by.find_all("a") if built_by is not None else []
        users = ", ".join(
            anchor.get("href").split("/")[-1]
            for anchor in contributor_links
            if anchor.get("href")
        )

        # Text looks like "335 stars today"; keep the leading number only.
        today_span = article.find("span", {"class": "d-inline-block float-sm-right"})
        today_text = today_span.text.strip() if today_span is not None else ""
        stars_today = today_text.split(" ")[0] if today_text else "0"

        df_data.append([author, repo, description, programming_language, star, fork, users, stars_today])

    return df_data

In [5]:
# Scraping data of every language listed in the trending page's language menu
languages_menu = soup.find("div", {"id": "languages-menuitems"})
if languages_menu is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError("Language menu not found on the trending page; the page markup may have changed")
languages = languages_menu.find_all("a")

data = []
for language in languages:
    new_url = language.get("href")
    if not new_url:
        continue
    # Menu hrefs are expected to be site-relative; an absolute one is used as-is.
    language_url = new_url if new_url.startswith("http") else f"https://github.com{new_url}"

    try:
        language_r = requests.get(language_url, timeout=30)
        language_r.raise_for_status()
    except requests.RequestException as error:
        # Report the failure and move on, so one bad page (timeout, rate limit,
        # 5xx) neither aborts the whole scrape nor silently yields zero rows.
        print(f"Skipping {language_url}: {error}")
        continue

    language_soup = BeautifulSoup(language_r.content, "html.parser")

    language_data = get_data(language_soup)
    data.extend(language_data)

In [6]:
# Assemble the scraped rows into a DataFrame; the column order mirrors the row layout produced by get_data
df = pd.DataFrame(data, columns=["Author", "Repo Name", "Description", "Programming Language", "Stars", "Fork", "Top Contributors", "Stars Today"])
display(df)
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that only appends a stray "None" to the output).
df.info()

Unnamed: 0,Author,Repo Name,Description,Programming Language,Stars,Fork,Top Contributors,Stars Today
0,jwasham,coding-interview-university,A complete computer science study plan to beco...,,315614,78548,"jwasham, Anri-Lombard, avizmarlon, esaucedof, ...",335
1,cloudcommunity,Free-Certifications,A curated list of free courses with certificat...,,29885,2193,"karentamrazyan, CloudStudyNet, diogomurta, syk...",66
2,jujumilk3,leaked-system-prompts,Collection of leaked system prompts,,2519,347,"jujumilk3, accupham, OneTop4458, JakubKontra, ...",61
3,khangich,machine-learning-interview,"Machine Learning Interviews from FAANG, Snapch...",,10702,1734,"khangich, liuvivian11, aragorn87, cuongvng, vi...",38
4,weeeBox,mobile-system-design,A simple framework for mobile system design in...,,4666,494,"weeeBox, AndreiVidrasco, husaynhakeem",5
...,...,...,...,...,...,...,...,...
1108,zml,zml,Any model. Any hardware. Zero compromise. Buil...,Zig,2226,80,"gwenzek, steeve, Corendos, hugomano, renerocksai",5
1109,kristoff-it,zine,"Fast, Scalable, Flexible Static Site Generator...",Zig,892,56,"kristoff-it, MFAshby, veggiemonk, Arnau478, kl...",5
1110,hexops,mach,zig game engine & graphics toolkit,Zig,3891,178,"alichraghi, iddev5, InKryption, wrench-bot, Pi...",4
1111,fairyglade,ly,display manager with console UI,Zig,5994,318,"AnErrupTion, nullgemm, stlaz, liweitianux, mor...",6


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113 entries, 0 to 1112
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Author                1113 non-null   object
 1   Repo Name             1113 non-null   object
 2   Description           1068 non-null   object
 3   Programming Language  1104 non-null   object
 4   Stars                 1113 non-null   object
 5   Fork                  1113 non-null   object
 6   Top Contributors      1113 non-null   object
 7   Stars Today           1113 non-null   object
dtypes: object(8)
memory usage: 69.7+ KB
None


In [7]:
# Clean data: parse the scraped count strings into integers and label missing languages
# Casting to str first keeps this cell re-runnable: on a second run the columns
# are already int and a bare .str accessor would raise AttributeError.
for count_column in ("Stars", "Fork", "Stars Today"):
    df[count_column] = df[count_column].astype(str).str.replace(",", "").astype(int)
# Normalise missing values (None) to NaN before filling the language column
df = df.fillna(value=np.nan)
df["Programming Language"] = df["Programming Language"].fillna("Unknown Language")
display(df)
# info() prints its own report and returns None; wrapping it in print() adds a stray "None"
df.info()

Unnamed: 0,Author,Repo Name,Description,Programming Language,Stars,Fork,Top Contributors,Stars Today
0,jwasham,coding-interview-university,A complete computer science study plan to beco...,Unknown Language,315614,78548,"jwasham, Anri-Lombard, avizmarlon, esaucedof, ...",335
1,cloudcommunity,Free-Certifications,A curated list of free courses with certificat...,Unknown Language,29885,2193,"karentamrazyan, CloudStudyNet, diogomurta, syk...",66
2,jujumilk3,leaked-system-prompts,Collection of leaked system prompts,Unknown Language,2519,347,"jujumilk3, accupham, OneTop4458, JakubKontra, ...",61
3,khangich,machine-learning-interview,"Machine Learning Interviews from FAANG, Snapch...",Unknown Language,10702,1734,"khangich, liuvivian11, aragorn87, cuongvng, vi...",38
4,weeeBox,mobile-system-design,A simple framework for mobile system design in...,Unknown Language,4666,494,"weeeBox, AndreiVidrasco, husaynhakeem",5
...,...,...,...,...,...,...,...,...
1108,zml,zml,Any model. Any hardware. Zero compromise. Buil...,Zig,2226,80,"gwenzek, steeve, Corendos, hugomano, renerocksai",5
1109,kristoff-it,zine,"Fast, Scalable, Flexible Static Site Generator...",Zig,892,56,"kristoff-it, MFAshby, veggiemonk, Arnau478, kl...",5
1110,hexops,mach,zig game engine & graphics toolkit,Zig,3891,178,"alichraghi, iddev5, InKryption, wrench-bot, Pi...",4
1111,fairyglade,ly,display manager with console UI,Zig,5994,318,"AnErrupTion, nullgemm, stlaz, liweitianux, mor...",6


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113 entries, 0 to 1112
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Author                1113 non-null   object
 1   Repo Name             1113 non-null   object
 2   Description           1068 non-null   object
 3   Programming Language  1113 non-null   object
 4   Stars                 1113 non-null   int64 
 5   Fork                  1113 non-null   int64 
 6   Top Contributors      1113 non-null   object
 7   Stars Today           1113 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 69.7+ KB
None


In [8]:
# Saving the dataframe to a CSV file (index=False leaves the row index out of the file)
df.to_csv("github_table.csv", index=False)

In [9]:
# Uploading the dataframe to the SQLite database through a SQLAlchemy engine.
# The previous stray `on = sqlite3.connect("github.db")` opened a second
# connection that was never used or closed, so it is dropped here.
# if_exists='replace' overwrites any existing 'githubtable' table.
engine = create_engine('sqlite:///github.db', echo=False)
df.to_sql('githubtable', con=engine, if_exists='replace', index=False)

1113

In [10]:
# Read the stored table back to confirm the upload.
# pd.read_sql_query works directly on the connection, so no cursor is created here.
con = sqlite3.connect("github.db")
query = """
        SELECT *
        FROM githubtable
        """
df = pd.read_sql_query(query, con)
display(df)

Unnamed: 0,Author,Repo Name,Description,Programming Language,Stars,Fork,Top Contributors,Stars Today
0,jwasham,coding-interview-university,A complete computer science study plan to beco...,Unknown Language,315614,78548,"jwasham, Anri-Lombard, avizmarlon, esaucedof, ...",335
1,cloudcommunity,Free-Certifications,A curated list of free courses with certificat...,Unknown Language,29885,2193,"karentamrazyan, CloudStudyNet, diogomurta, syk...",66
2,jujumilk3,leaked-system-prompts,Collection of leaked system prompts,Unknown Language,2519,347,"jujumilk3, accupham, OneTop4458, JakubKontra, ...",61
3,khangich,machine-learning-interview,"Machine Learning Interviews from FAANG, Snapch...",Unknown Language,10702,1734,"khangich, liuvivian11, aragorn87, cuongvng, vi...",38
4,weeeBox,mobile-system-design,A simple framework for mobile system design in...,Unknown Language,4666,494,"weeeBox, AndreiVidrasco, husaynhakeem",5
...,...,...,...,...,...,...,...,...
1108,zml,zml,Any model. Any hardware. Zero compromise. Buil...,Zig,2226,80,"gwenzek, steeve, Corendos, hugomano, renerocksai",5
1109,kristoff-it,zine,"Fast, Scalable, Flexible Static Site Generator...",Zig,892,56,"kristoff-it, MFAshby, veggiemonk, Arnau478, kl...",5
1110,hexops,mach,zig game engine & graphics toolkit,Zig,3891,178,"alichraghi, iddev5, InKryption, wrench-bot, Pi...",4
1111,fairyglade,ly,display manager with console UI,Zig,5994,318,"AnErrupTion, nullgemm, stlaz, liweitianux, mor...",6


In [11]:
# Close the connection opened for the read-back query above
con.close()

In [12]:
# Open a fresh connection for the analysis queries that follow
con = sqlite3.connect("github.db")
# NOTE(review): `cur` is not referenced by the visible query cells, which call
# pd.read_sql_query(query, con) directly - confirm nothing else needs it.
cur = con.cursor()

In [13]:
# Top 10 repositories by total star count (ordered by the Stars column, not by stars gained today)
query = """
        SELECT *
        FROM githubtable
        ORDER BY Stars DESC
        LIMIT 10
        """
df = pd.read_sql_query(query, con)
display(df)

Unnamed: 0,Author,Repo Name,Description,Programming Language,Stars,Fork,Top Contributors,Stars Today
0,jwasham,coding-interview-university,A complete computer science study plan to beco...,Unknown Language,315614,78548,"jwasham, Anri-Lombard, avizmarlon, esaucedof, ...",335
1,kamranahmedse,developer-roadmap,"Interactive roadmaps, guides and other educati...",TypeScript,315258,40951,"kamranahmedse, dansholds, arikchakma, github-a...",337
2,ossu,computer-science,🎓 Path to a free self-taught education in Comp...,HTML,178456,22606,"ericdouglas, waciumawanjohi, joshmhanson, mkgh...",162
3,twbs,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",MDX,172233,79069,"mdo, cvrebert, XhmikosR, fat, dependabot",16
4,flutter,flutter,Flutter makes it easy and fast to build beauti...,Dart,169896,28401,"engine-flutter-autoroll, jonahwilliams, abarth...",35
5,jackfrued,Python-100-Days,Python - 100天从新手到大师,Jupyter Notebook,164470,53531,"jackfrued, MathDance, geekya215, jankeromnes, ...",74
6,f,awesome-chatgpt-prompts,This repo includes ChatGPT prompt curation to ...,JavaScript,123371,16518,"f, iuzn, fengkiej, JonathanDn, developer-acc",70
7,angular,angular,Deliver web apps with confidence 🚀,TypeScript,97578,26138,"gkalpak, petebacondarwin, devversion, crisbeto...",34
8,open-webui,open-webui,"User-friendly AI Interface (Supports Ollama, O...",JavaScript,90923,11470,"tjbck, cheahjs, jannikstdl, michaelpoluektov, ...",197
9,neovim,neovim,Vim-fork focused on extensibility and usability,Vim Script,88887,6045,"zeertzjq, justinmk, janlazo, bfredl, ZyX-I",41


In [14]:
# Top 10 repositories by stars gained today
# The column name contains a space, so it is double-quoted inside the SQL.
query = """
        SELECT *
        FROM githubtable
        ORDER BY "Stars Today" DESC
        LIMIT 10
        """
df = pd.read_sql_query(query, con)
display(df)

Unnamed: 0,Author,Repo Name,Description,Programming Language,Stars,Fork,Top Contributors,Stars Today
0,microsoft,markitdown,Python tool for converting files and office do...,Python,52723,2604,"afourney, gagb, sugatoray, PetrAPConsulting, l...",980
1,microsoft,BitNet,Official inference framework for 1-bit LLMs,C++,16486,1186,"potassiummmm, younesbelkada, tsong-ms, dawnmsg...",730
2,anthropics,courses,Anthropic's educational courses,Jupyter Notebook,10472,952,"Colt, alexalbertt, rainlee, elie, maggie-vo",383
3,kamranahmedse,developer-roadmap,"Interactive roadmaps, guides and other educati...",TypeScript,315258,40951,"kamranahmedse, dansholds, arikchakma, github-a...",337
4,jwasham,coding-interview-university,A complete computer science study plan to beco...,Unknown Language,315614,78548,"jwasham, Anri-Lombard, avizmarlon, esaucedof, ...",335
5,jumpserver,jumpserver,JumpServer is an open-source Privileged Access...,Python,27213,5459,"ibuler, BaiJiangJie, feng626, fit2bot, xuxinwen",285
6,Byaidu,PDFMathTranslate,PDF scientific paper translation with preserve...,Python,21624,1832,"Byaidu, awwaawwa, hellofinch, reycn, tastelike...",272
7,drawdb-io,drawdb,"Free, simple, and intuitive online database di...",JavaScript,27706,1953,"1ilit, dependabot, haecheonlee, FelixZY, picimako",256
8,microsoft,ai-agents-for-beginners,10 Lessons to Get Started Building AI Agents,Jupyter Notebook,16627,4140,"koreyspace, leestott, ShivamGoyal03, jt-216, a...",234
9,open-webui,open-webui,"User-friendly AI Interface (Supports Ollama, O...",JavaScript,90923,11470,"tjbck, cheahjs, jannikstdl, michaelpoluektov, ...",197


In [15]:
# Top 10 programming languages by summed stars of their scraped repositories
# Stars and forks are totalled per language; the result is ordered by the
# "Total Stars" alias, highest first.
query = """
        SELECT "Programming Language",
            SUM(Stars) AS "Total Stars",
            SUM(Fork) AS "Total Forks"
        FROM githubtable
        GROUP BY "Programming Language"
        ORDER BY "Total Stars" DESC
        LIMIT 10
        """
df = pd.read_sql_query(query, con)
display(df)

Unnamed: 0,Programming Language,Total Stars,Total Forks
0,TypeScript,534627,80027
1,Java,515181,163765
2,Go,482644,55969
3,JavaScript,451351,79680
4,C++,418626,69526
5,Unknown Language,404413,95927
6,Rust,396316,27135
7,Jupyter Notebook,382267,119697
8,HTML,350302,86798
9,MDX,336030,113811


In [None]:
# Closing the connection now that the analysis queries are done
con.close()