# Most popular programming languages in Bioinformatics and Computational Biology -- Github


In [1]:
import requests
from datetime import datetime
from tabulate import tabulate
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

## Input

In [2]:
# Primary input
topic='bioinformatics'
github_token = ''


In [3]:
# Secondary input
# Parameters
min_stars = 10
max_stars = 5000 # why? repo with the most stars and associated with programming language is biopython with 4.8k stars
list_years=list(range(2008,2026)) # For each year the 100 most starred repo are retrieved; The limit of results per search is 100; retrieving all results require pagination management
keywords=''


## Output

In [4]:
stats_repo_pl_vs_topic_df_path='../data/programming_language_x_'+topic+'.csv'
stats_repo_topics_vs_topic_df_path='../data/topics_x_'+topic+'.csv'
list_of_repos_path='../data/list_of_repos.csv'

## Get and Parse Data

### Functions

In [5]:


def search_github_repos(keywords, topic,min_stars, max_stars, start_date, end_date, token=None):
    
    # Build the query
    keywords_query = ' '.join(keywords)
    stars_query = f"stars:{min_stars}..{max_stars}"
    date_query = f"pushed:{start_date}..{end_date}"
    topic_query = f"topic:{topic}"
    
    query = f"{keywords_query} {stars_query} {date_query} {topic_query} "
    
    # GitHub API endpoint
    url = "https://api.github.com/search/repositories"
    
    # Headers
    headers = {
        "Accept": "application/vnd.github.v3+json"
    }
    
    if token:
        headers["Authorization"] = f"token {token}"
    
    # Parameters
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": 100  # Max results per page
    }
    
    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        
        data = response.json()
        
        # Extract repository information
        repos = []
        for item in data.get('items', []):
            repo_info = {
                'name': item['full_name'],
                'stars': int(item['stargazers_count']),
                'created': datetime.strptime(item['created_at'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d'),
                'forks': int(item['forks_count']),
                'topics': item['topics'],
                'language': item['language'],
                #'languages_url': item['languages_url'],
                'selected_year': int(start_date.split('-')[0])

            }
            repos.append(repo_info)
                    
        return repos, data.get('total_count', 0)
    
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return [], 0


### Get Stats From Github

In [6]:
all_selected_repos=[]
list_total_results=[]

for year in list_years:
    
    start_date = str(year)+'-01-01'
    end_date = str(year)+'-12-31'


    print(f"Searching for repositories")
    print(f"Stars range: {min_stars} - {max_stars}")
    print(f"Date range: {start_date} to {end_date}")
    print(f"Topic: {topic}")

    repos, total = search_github_repos(
        keywords=keywords,
        topic=topic,
        min_stars=min_stars,
        max_stars=max_stars,
        start_date=start_date,
        end_date=end_date,
        token=github_token
    )

    print(f"Total: {total}")
    
    list_total_results.append(total)

    all_selected_repos=all_selected_repos+repos


Searching for repositories
Stars range: 10 - 5000
Date range: 2008-01-01 to 2008-12-31
Topic: bioinformatics
Total: 0
Searching for repositories
Stars range: 10 - 5000
Date range: 2009-01-01 to 2009-12-31
Topic: bioinformatics
Total: 0
Searching for repositories
Stars range: 10 - 5000
Date range: 2010-01-01 to 2010-12-31
Topic: bioinformatics
Total: 0
Searching for repositories
Stars range: 10 - 5000
Date range: 2011-01-01 to 2011-12-31
Topic: bioinformatics
Total: 0
Searching for repositories
Stars range: 10 - 5000
Date range: 2012-01-01 to 2012-12-31
Topic: bioinformatics
Total: 0
Searching for repositories
Stars range: 10 - 5000
Date range: 2013-01-01 to 2013-12-31
Topic: bioinformatics
Total: 2
Searching for repositories
Stars range: 10 - 5000
Date range: 2014-01-01 to 2014-12-31
Topic: bioinformatics
Total: 0
Searching for repositories
Stars range: 10 - 5000
Date range: 2015-01-01 to 2015-12-31
Topic: bioinformatics
Total: 0
Searching for repositories
Stars range: 10 - 5000
Date r

In [7]:
print(len(all_selected_repos))
df = pd.DataFrame(all_selected_repos).reset_index(drop=True)
print(df)

797
                                   name  stars     created  forks  \
0                             lh3/fermi     75  2012-01-06     15   
1    mpieva/mapping-iterative-assembler     19  2012-07-02      7   
2                           bio4j/bio4j    120  2011-01-31     19   
3                               lh3/bfc     74  2014-12-30     12   
4                     shenwei356/go4bio     31  2016-08-15      5   
..                                  ...    ...         ...    ...   
792                   theislab/cellrank    411  2020-03-12     50   
793                   stuart-lab/signac    398  2019-05-09    102   
794                   ACEnglish/truvari    393  2018-04-13     58   
795                      brentp/vcfanno    392  2015-04-29     56   
796                        lh3/miniprot    390  2022-08-04     21   

                                                topics language  selected_year  
0          [bioinformatics, denovo-assembly, genomics]        C           2013  
1    

In [8]:
df_na_removed=df.dropna().reset_index(drop=True)

df_na_removed.to_csv(list_of_repos_path,index=False,sep=';') 


### Format stars and forks count per year per language

In [9]:
df_na_removed=df.dropna().reset_index(drop=True)

list_selected_year=list(np.unique(df_na_removed['selected_year']))
list_language=list(np.unique(df_na_removed['language']))

In [10]:
stats_raw=[]

for lang in list_language:

    total_count_start_per_year_per_language=0
    
    for year in list_selected_year:

        count_start_per_year_per_language=df_na_removed[(df_na_removed['selected_year'] == year) & (df_na_removed['language'] == lang)]['stars'].sum()
        count_fork_per_year_per_language=df_na_removed[(df_na_removed['selected_year'] == year) & (df_na_removed['language'] == lang)]['forks'].sum()
        total_count_start_per_year_per_language=total_count_start_per_year_per_language+count_start_per_year_per_language
        stat_info = {
            'year': year,
            'stars': count_start_per_year_per_language,
            'forks':count_fork_per_year_per_language,
            'language': lang
        }
        stats_raw.append(stat_info)
        
df_stats_raw = pd.DataFrame(stats_raw).reset_index(drop=True)
df_stats_raw

Unnamed: 0,year,stars,forks,language
0,2013,0,0,AMPL
1,2016,0,0,AMPL
2,2017,0,0,AMPL
3,2018,0,0,AMPL
4,2019,0,0,AMPL
...,...,...,...,...
490,2021,52,20,wdl
491,2022,0,0,wdl
492,2023,0,0,wdl
493,2024,0,0,wdl


In [11]:
df_stats_raw.to_csv(stats_repo_pl_vs_topic_df_path,index=False,sep=';') 

### Format stars and forks count per year per topic coocurring with bioinformatics

In [12]:

df_na_removed=df.dropna().reset_index(drop=True)

list_selected_year=list(np.unique(df_na_removed['selected_year']))
list_topics = [item for sublist in list(df_na_removed['topics']) for item in sublist]
list_topics=list(np.unique(list_topics))
# remove topic of refence
list_topics.remove(topic)
list_topics.remove('python')

stats_topic_raw=[]

for topic_current in list_topics:
    total_count_start_per_year_per_topic=0
    
    for year in list_selected_year:
        
        list_tf_matching_topic_current = [True if topic_current in current_list_topic else False for current_list_topic in df_na_removed['topics']]

        count_start_per_year_per_topic=df_na_removed[(df_na_removed['selected_year'] == year) & list_tf_matching_topic_current]['stars'].sum()
        count_forks_per_year_per_topic=df_na_removed[(df_na_removed['selected_year'] == year) & list_tf_matching_topic_current]['forks'].sum()
        total_count_start_per_year_per_topic=total_count_start_per_year_per_topic+count_start_per_year_per_topic
        stat_info = {
            'year': year,
            'stars': total_count_start_per_year_per_topic,
            'forks': count_forks_per_year_per_topic,
            'topic': topic_current
        }
        stats_topic_raw.append(stat_info)

df_stats_topic_raw = pd.DataFrame(stats_topic_raw).reset_index(drop=True)
print(df_stats_topic_raw)



       year  stars  forks        topic
0      2013      0      0  1000genomes
1      2016      0      0  1000genomes
2      2017      0      0  1000genomes
3      2018      0      0  1000genomes
4      2019      0      0  1000genomes
...     ...    ...    ...          ...
20565  2021      0      0          zsh
20566  2022      0      0          zsh
20567  2023     33      4          zsh
20568  2024     33      0          zsh
20569  2025     33      0          zsh

[20570 rows x 4 columns]


In [13]:
df_stats_topic_raw.to_csv(stats_repo_topics_vs_topic_df_path,index=False,sep=';') 