# Most popular programming languages in Bioinformatics and Computational Biology -- GitHub


In [None]:
import requests
from datetime import datetime
from tabulate import tabulate
import pandas as pd
import plotly.express as px
import numpy as np

## Input

In [None]:
# Primary input
import os

# GitHub topic whose repositories are analysed.
topic = 'bioinformatics'
# Personal access token for the GitHub API. It is read from the GITHUB_TOKEN
# environment variable so the secret does not have to be stored in the
# notebook; when the variable is unset the token stays empty, as before.
github_token = os.environ.get('GITHUB_TOKEN', '')


In [None]:
# Secondary input
# Search parameters
min_stars = 10
# Upper bound on stars. Rationale: the most-starred repository associated with
# a programming language is biopython, with 4.8k stars.
max_stars = 5000
# One search is run per year and keeps the 100 most-starred repositories,
# because a search returns at most 100 results per page; retrieving every
# result would require pagination handling.
list_years = [*range(2008, 2026)]
keywords = ''


## Output

In [None]:

import os

# Directory receiving every generated file. It is created up front because
# DataFrame.to_csv raises an error when the target directory does not exist.
results_dir = './results'
os.makedirs(results_dir, exist_ok=True)

# Cumulative stars per programming language: CSV table and bar chart race video.
stats_repo_pl_vs_topic_df = f'{results_dir}/programming_language_x_{topic}.csv'
bar_chat_pl_vs_topic_video = f'{results_dir}/programming_language_x_{topic}.mp4'

# Cumulative stars per co-occurring topic: CSV table and bar chart race video.
stats_repo_topics_vs_topic_df = f'{results_dir}/topics_x_{topic}.csv'
bar_chat_topics_vs_topic_video = f'{results_dir}/topics_x_{topic}.mp4'


## Analysis

### Functions

In [None]:


def search_github_repos(keywords, topic, min_stars, max_stars, start_date, end_date, token=None):
    """Search GitHub for the most-starred repositories tagged with a topic.

    Sends a single request to the GitHub repository search endpoint, sorted
    by stars in descending order with 100 results per page, and returns the
    repositories of that first page only (no pagination).

    Args:
        keywords: Free-text search terms, either one string or an iterable
            of strings. An empty string, empty iterable or None adds no term.
        topic: Value of the `topic:` search qualifier.
        min_stars: Lower bound of the `stars:min..max` qualifier.
        max_stars: Upper bound of the `stars:min..max` qualifier.
        start_date: Lower bound of the `pushed:start..end` qualifier, as a
            'YYYY-MM-DD' string. Its year is stored as `selected_year`.
        end_date: Upper bound of the `pushed:start..end` qualifier.
        token: Optional GitHub token; when falsy the request is sent
            without an Authorization header.

    Returns:
        A tuple `(repos, total_count)`. `repos` is a list of dicts with the
        keys 'name', 'stars', 'created', 'forks', 'topics', 'language',
        'languages_url' and 'selected_year'. `total_count` is the value
        reported by the API (0 when absent). If the request fails, the error
        is printed and `([], 0)` is returned.
    """
    # A bare string must be used as-is: joining it would insert a space
    # between every character and corrupt the search term.
    if isinstance(keywords, str):
        keywords_query = keywords.strip()
    else:
        keywords_query = ' '.join(keywords or ())

    # Empty parts are skipped so the query has no stray leading/trailing spaces.
    query_parts = [
        keywords_query,
        f"stars:{min_stars}..{max_stars}",
        f"pushed:{start_date}..{end_date}",
        f"topic:{topic}",
    ]
    query = ' '.join(part for part in query_parts if part)

    # GitHub API endpoint
    url = "https://api.github.com/search/repositories"

    headers = {
        "Accept": "application/vnd.github.v3+json"
    }
    if token:
        headers["Authorization"] = f"token {token}"

    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": 100,  # Max results per page
    }

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # error is not a RequestException subclass.
        print(f"Error: {e}")
        return [], 0

    # Extract the fields of interest from every returned repository.
    repos = [
        {
            'name': item['full_name'],
            'stars': int(item['stargazers_count']),
            'created': datetime.strptime(item['created_at'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d'),
            'forks': int(item['forks_count']),
            # Fall back to an empty list when the field is missing or null.
            'topics': item.get('topics') or [],
            'language': item['language'],
            'languages_url': item['languages_url'],
            'selected_year': int(start_date.split('-')[0]),
        }
        for item in data.get('items', [])
    ]

    return repos, data.get('total_count', 0)


### Get Stats From Github

In [None]:
import time

all_selected_repos = []
list_total_results = []

# GitHub rate-limits the search endpoints per minute (30 requests when
# authenticated, 10 otherwise). Firing one search per year back-to-back can
# exceed that allowance, and a rejected search is reported by
# search_github_repos as an empty result, silently dropping that year.
# Spacing the requests keeps the loop under the limit.
search_requests_per_minute = 30 if github_token else 10
pause_between_searches = 60 / search_requests_per_minute

for position, year in enumerate(list_years):

    # No need to wait before the very first request.
    if position:
        time.sleep(pause_between_searches)

    start_date = f'{year}-01-01'
    end_date = f'{year}-12-31'

    print("Searching for repositories")
    print(f"Stars range: {min_stars} - {max_stars}")
    print(f"Date range: {start_date} to {end_date}")
    print(f"Topic: {topic}")

    repos, total = search_github_repos(
        keywords=keywords,
        topic=topic,
        min_stars=min_stars,
        max_stars=max_stars,
        start_date=start_date,
        end_date=end_date,
        token=github_token
    )

    print(f"Total: {total}")

    # Total number of matches reported by the API for this year (may exceed
    # the number of repositories actually returned).
    list_total_results.append(total)

    all_selected_repos.extend(repos)


In [None]:
# Number of repositories collected over all the yearly searches.
print(len(all_selected_repos))

# One row per collected repository, indexed 0..n-1.
df = pd.DataFrame(all_selected_repos)
df = df.reset_index(drop=True)
print(df)

### Parse and format stats


In [None]:
# Keep only the rows that have no missing value in any column.
df_na_removed = df.dropna().reset_index(drop=True)

list_selected_year = list(np.unique(df_na_removed['selected_year']))
list_language = list(np.unique(df_na_removed['language']))

# Long-format table: one row per (language, year) holding the stars that the
# language has accumulated up to and including that year.
stats_raw = []

for lang in list_language:
    is_lang = df_na_removed['language'] == lang
    running_total = 0

    for year in list_selected_year:
        is_year = df_na_removed['selected_year'] == year
        stars_this_year = df_na_removed.loc[is_year & is_lang, 'stars'].sum()
        running_total = running_total + stars_this_year
        stats_raw.append({
            'year': year,
            'stars': running_total,
            'language': lang
        })

df_stats_raw = pd.DataFrame(stats_raw)
df_stats_raw = df_stats_raw.reset_index(drop=True)
print(df_stats_raw)

df_stats_raw.to_csv(stats_repo_pl_vs_topic_df)


In [None]:
# Notebook echo of the CSV path used for the per-language stats.
stats_repo_pl_vs_topic_df

### Compute and Save Video

In [None]:
import shutil

import bar_chart_race as bcr
import matplotlib

# Locate ffmpeg on the PATH so the notebook also runs where it is not
# installed under /usr/bin; fall back to the previous hard-coded location
# when it cannot be found (edit that path if needed, e.g.
# 'C:\\ffmpeg\\bin\\ffmpeg.exe' on Windows).
matplotlib.rcParams['animation.ffmpeg_path'] = shutil.which('ffmpeg') or '/usr/bin/ffmpeg'

df_stats_raw = df_stats_raw.sort_values(by='year')

# bar_chart_race takes a wide table: one row per period (datetime index) and
# one column per category, here one column per language.
df_stats_raw['year_string'] = [datetime(int(item), 1, 1) for item in df_stats_raw['year']]
df_wide = df_stats_raw.pivot(index='year_string', columns='language', values='stars')
df_wide.index.name = 'date'

# The period shown in the title is taken from the data so that it always
# matches the years actually present in the chart.
first_year = int(df_stats_raw['year'].min())
last_year = int(df_stats_raw['year'].max())

bcr_fig = bcr.bar_chart_race(
    df_wide,
    filename=bar_chat_pl_vs_topic_video,
    perpendicular_bar_func='mean',
    n_bars=10,
    title={
        'label': f'Most popular (most starred) programming languages in {topic} from {first_year} to {last_year}',
        'size': 12,
    },
    shared_fontdict={
        'family': 'Helvetica',
        'weight': 'bold',
        'color': 'rebeccapurple',
    },
    interpolate_period=False,
    period_template='%Y',
    period_length=1100,
    fig_kwargs={
        'figsize': (10, 5),
        'dpi': 120,
    },
)


## Topics related to the reference topic


In [None]:

# Keep only the rows that have no missing value in any column.
df_na_removed = df.dropna().reset_index(drop=True)

list_selected_year = list(np.unique(df_na_removed['selected_year']))

# Topics left out of the chart: the reference topic itself (it was the search
# criterion) and 'python' (presumably because it is a programming language
# rather than a subject -- confirm). Filtering with a set, instead of
# list.remove, avoids a ValueError when one of them is absent from the data.
excluded_topics = {topic, 'python'}
all_topics = {
    item
    for sublist in df_na_removed['topics']
    for item in sublist
}
list_topics = sorted(all_topics - excluded_topics)

# Long-format table: one row per (topic, year) holding the stars that the
# repositories tagged with the topic have accumulated up to that year.
stats_topic_raw = []

for topic_current in list_topics:
    # Mask of the repositories tagged with the current topic. It does not
    # depend on the year, so it is computed once per topic; it is a boolean
    # Series because combining a Series with a plain list through `&` is
    # deprecated in recent pandas versions.
    has_topic = df_na_removed['topics'].apply(
        lambda repo_topics: topic_current in repo_topics
    )
    total_count_start_per_year_per_topic = 0

    for year in list_selected_year:
        in_year = df_na_removed['selected_year'] == year
        count_start_per_year_per_topic = df_na_removed.loc[in_year & has_topic, 'stars'].sum()
        total_count_start_per_year_per_topic += count_start_per_year_per_topic
        stats_topic_raw.append({
            'year': year,
            'stars': total_count_start_per_year_per_topic,
            'topic': topic_current
        })

df_stats_topic_raw = pd.DataFrame(stats_topic_raw).reset_index(drop=True)
print(df_stats_topic_raw)

df_stats_topic_raw.to_csv(stats_repo_topics_vs_topic_df)

In [None]:
import shutil

import bar_chart_race as bcr
import matplotlib

# Locate ffmpeg on the PATH so the notebook also runs where it is not
# installed under /usr/bin; fall back to the previous hard-coded location
# when it cannot be found (edit that path if needed, e.g.
# 'C:\\ffmpeg\\bin\\ffmpeg.exe' on Windows).
matplotlib.rcParams['animation.ffmpeg_path'] = shutil.which('ffmpeg') or '/usr/bin/ffmpeg'

df_stats_topic_raw = df_stats_topic_raw.sort_values(by='year')

# bar_chart_race takes a wide table: one row per period (datetime index) and
# one column per category, here one column per topic.
df_stats_topic_raw['year_string'] = [datetime(int(item), 1, 1) for item in df_stats_topic_raw['year']]
df_stats_topic_wide = df_stats_topic_raw.pivot(index='year_string', columns='topic', values='stars')
df_stats_topic_wide.index.name = 'date'

# The period shown in the title is taken from the data so that it always
# matches the years actually present in the chart.
first_year = int(df_stats_topic_raw['year'].min())
last_year = int(df_stats_topic_raw['year'].max())

bcr_fig = bcr.bar_chart_race(
    df_stats_topic_wide,
    filename=bar_chat_topics_vs_topic_video,
    perpendicular_bar_func='mean',
    n_bars=10,
    title={
        'label': f'Topics strongly associated with {topic} from {first_year} to {last_year}',
        'size': 12,
    },
    shared_fontdict={
        'family': 'Helvetica',
        'weight': 'bold',
        'color': 'rebeccapurple',
    },
    interpolate_period=False,
    period_template='%Y',
    period_length=1100,
)

