In [27]:
# Import des bibliothèques nécessaires
import requests
import pandas as pd
import json
from datetime import datetime, timedelta
import os
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import numpy as np
from dataclasses import dataclass


load_dotenv()

True

In [28]:
# Configuration de l'API GitHub
GITHUB_API_URL = "https://api.github.com"

# Optionnel : Token d'authentification pour augmenter les limites de l'API
TOKEN = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"token {TOKEN}"}

In [46]:


SUPPORTED_LANGUAGES = [
    "js", "html", "css", "java", "python", "cpp", "rb", "php", "go", "rs",
    "swift", "kt", "ts", "cs", "dart", "lua", "sh", "pl", "r", "scala",
    "hs", "vb", "m", "jl", "fs", "ex", "sql", "h"
]
OPTIONAL_RELEASE_FEATURES = []
FEATURES = ["date"] + SUPPORTED_LANGUAGES + OPTIONAL_RELEASE_FEATURES
REPOSITORY_FEATURES = [
        "name", "file_counts", "release_count", "size", "stars", "forks", 
        "contributor_count", "created_at", "updated_at",
        "total_commits","topics"
    ]


In [30]:
def get_repositories(query="", sort="stars", per_page=30):
    """
    Récupère les repositories depuis l'API GitHub
    
    Args:
        query (str): Terme de recherche (ex: "python", "machine learning")
        sort (str): Tri par 'stars', 'forks', 'updated'
        per_page (int): Nombre de résultats par page (max 100)
    
    Returns:
        list: Liste des repositories
    """
    url = f"{GITHUB_API_URL}/search/repositories"
    params = {
        "q": query if query else "stars:>1000",  # Par défaut, projets avec plus de 1000 étoiles
        "sort": sort,
        "order": "desc",
        "per_page": per_page
    }
    
    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        return response.json()["items"]
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la requête : {e}")
        return []

In [31]:
repositories = get_repositories(query="machine learning", per_page=50)

In [47]:
@dataclass
class RepositoryStatistics:
    name: str
    file_counts: int
    release_count: int
    size: int
    stars: int
    forks: int
    contributor_count: int
    created_at: str  # Nouvelle colonne
    updated_at: str  # Nouvelle colonne
    # Activité
    total_commits: int
    topics: list  # Changer de int à list pour stocker les topics


    def add_to_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Adds the current dataclass to the last line of the given dataframe.
        Changes are done in place so that we don't necessarily need to use the returned dataframe.

        Args:
            df (pd.DataFrame): The dataframe to which the dataclass will be added

        Returns:
            pd.DataFrame: The updated dataframe
        """
        df.loc[len(df)] = pd.Series(
            self.__dict__
        )
        return df

def create_empty_repository_statistics_dataframe() -> pd.DataFrame:
    """
    Creates an empty DataFrame with the appropriate columns for RepositoryStatistics.

    Returns:
        pd.DataFrame: An empty DataFrame with RepositoryStatistics columns
    """
    return pd.DataFrame(columns=REPOSITORY_FEATURES)




In [33]:
def get_file_count_from_tree(owner, repo_name):
    """
    Récupère le nombre de fichiers via l'API Git Tree
    
    Args:
        owner (str): Propriétaire du repository
        repo_name (str): Nom du repository
    
    Returns:
        int: Nombre total de fichiers
    """
    # D'abord, récupérer la branche par défaut
    repo_url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}"
    
    try:
        repo_response = requests.get(repo_url, headers=headers)
        repo_response.raise_for_status()
        default_branch = repo_response.json()["default_branch"]
        
        # Ensuite, récupérer l'arbre complet
        tree_url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/git/trees/{default_branch}"
        tree_response = requests.get(tree_url, headers=headers, params={"recursive": "1"})
        tree_response.raise_for_status()
        
        tree_data = tree_response.json()
        
        # Compter seulement les fichiers (pas les dossiers)
        file_count = sum(1 for item in tree_data["tree"] if item["type"] == "blob")
        
        return file_count
    
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la récupération de l'arbre pour {owner}/{repo_name}: {e}")
        return 0

In [34]:
def get_release_count(owner, repo_name):
    """
    Récupère le nombre de releases d'un repository
    
    Args:
        owner (str): Propriétaire du repository
        repo_name (str): Nom du repository
    
    Returns:
        int: Nombre de releases
    """
    url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/releases"
    
    try:
        # Première requête pour obtenir le nombre total
        response = requests.get(url, headers=headers, params={"per_page": 1})
        response.raise_for_status()
        
        # Vérifier s'il y a un en-tête Link pour la pagination
        if 'Link' in response.headers:
            links = response.headers['Link']
            # Extraire le numéro de la dernière page
            for link in links.split(','):
                if 'rel="last"' in link:
                    last_page_url = link.split(';')[0].strip('<> ')
                    last_page = int(last_page_url.split('page=')[-1].split('&')[0])
                    
                    # Récupérer la dernière page pour compter les éléments restants
                    last_response = requests.get(last_page_url, headers=headers)
                    last_response.raise_for_status()
                    last_page_items = len(last_response.json())
                    
                    return (last_page - 1) * 30 + last_page_items  # 30 par défaut par page
        
        # Si pas de pagination, retourner la longueur directe
        return len(response.json())
        
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la récupération des releases pour {owner}/{repo_name}: {e}")
        return 0

In [35]:
def get_contributor_count(owner, repo_name):
    """
    Récupère le nombre de contributeurs d'un repository
    
    Args:
        owner (str): Propriétaire du repository
        repo_name (str): Nom du repository
    
    Returns:
        int: Nombre de contributeurs
    """
    url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/contributors"
    
    try:
        # Première requête pour obtenir le nombre total
        response = requests.get(url, headers=headers, params={"per_page": 1})
        response.raise_for_status()
        
        # Vérifier s'il y a un en-tête Link pour la pagination
        if 'Link' in response.headers:
            links = response.headers['Link']
            # Extraire le numéro de la dernière page
            for link in links.split(','):
                if 'rel="last"' in link:
                    last_page_url = link.split(';')[0].strip('<> ')
                    last_page = int(last_page_url.split('page=')[-1].split('&')[0])
                    
                    # Récupérer la dernière page pour compter les éléments restants
                    last_response = requests.get(last_page_url, headers=headers)
                    last_response.raise_for_status()
                    last_page_items = len(last_response.json())
                    
                    return (last_page - 1) * 30 + last_page_items  # 30 par défaut par page
        
        # Si pas de pagination, retourner la longueur directe
        return len(response.json())
        
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la récupération des contributeurs pour {owner}/{repo_name}: {e}")
        return 0

In [36]:
def clean_location_to_country(location):
    """
    Nettoie et normalise une location pour extraire le pays
    
    Args:
        location (str): Location brute de GitHub
    
    Returns:
        str: Pays normalisé
    """
    if not location:
        return ""
    
    location = location.lower().strip()
    
    # Dictionnaire de mapping des locations vers les pays
    country_mapping = {
        "usa": "United States",
        "us": "United States",
        "united states": "United States",
        "america": "United States",
        "uk": "United Kingdom",
        "england": "United Kingdom",
        "britain": "United Kingdom",
        "france": "France",
        "germany": "Germany",
        "deutschland": "Germany",
        "canada": "Canada",
        "japan": "Japan",
        "china": "China",
        "india": "India",
        "brazil": "Brazil",
        "russia": "Russia",
        "australia": "Australia",
        "netherlands": "Netherlands",
        "sweden": "Sweden",
        "norway": "Norway",
        "italy": "Italy",
        "spain": "Spain",
        "south korea": "South Korea",
        "singapore": "Singapore",
        "switzerland": "Switzerland",
    }
    
    # Chercher des mots-clés dans la location
    for key, country in country_mapping.items():
        if key in location:
            return country
    
    # Si pas de match, retourner la location originale (première partie avant la virgule)
    return location.split(',')[0].title()

In [37]:
def get_owner_country(owner):
    """
    Récupère le pays du propriétaire du repository (si disponible)
    
    Args:
        owner (str): Propriétaire du repository
    
    Returns:
        str: Pays du propriétaire ou chaîne vide
    """
    url = f"{GITHUB_API_URL}/users/{owner}"
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        user_data = response.json()
        
        # GitHub stocke parfois la localisation dans le champ "location"
        location = user_data.get("location", "")
        return location if location else ""
        
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la récupération des infos utilisateur pour {owner}: {e}")
        return ""

In [38]:
def get_issues_stats(owner, repo_name):
    """
    Récupère les statistiques des issues (ouvertes/fermées)
    
    Args:
        owner (str): Propriétaire du repository
        repo_name (str): Nom du repository
    
    Returns:
        dict: Statistiques des issues
    """
    # Issues ouvertes
    open_url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/issues"
    closed_url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/issues"
    
    try:
        # Issues ouvertes
        open_response = requests.get(open_url, headers=headers, 
                                   params={"state": "open", "per_page": 1})
        open_count = len(open_response.json()) if open_response.status_code == 200 else 0
        
        # Issues fermées (approximation)
        closed_response = requests.get(closed_url, headers=headers, 
                                     params={"state": "closed", "per_page": 100})
        closed_count = len(closed_response.json()) if closed_response.status_code == 200 else 0
        
        return {
            "open_issues": open_count,
            "closed_issues": closed_count,
            "total_issues": open_count + closed_count
        }
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la récupération des issues pour {owner}/{repo_name}: {e}")
        return {"open_issues": 0, "closed_issues": 0, "total_issues": 0}

In [39]:
def get_commit_stats(owner, repo_name):
    """
    Récupère les statistiques de commits
    
    Args:
        owner (str): Propriétaire du repository
        repo_name (str): Nom du repository
    
    Returns:
        dict: Statistiques des commits
    """
    url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/stats/participation"
    
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            all_commits = data.get("all", [])
            recent_commits = sum(all_commits[-4:])  # 4 dernières semaines
            total_commits = sum(all_commits)
            return {
                "total_commits": total_commits,
                "recent_commits": recent_commits
            }
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la récupération des commits pour {owner}/{repo_name}: {e}")
    
    return {"total_commits": 0, "recent_commits": 0}

In [40]:
def get_repository_metadata(owner, repo_name):
    """
    Récupère les métadonnées du repository (licence, README, etc.)
    
    Args:
        owner (str): Propriétaire du repository
        repo_name (str): Nom du repository
    
    Returns:
        dict: Métadonnées du repository
    """
    repo_url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}"
    
    try:
        response = requests.get(repo_url, headers=headers)
        response.raise_for_status()
        repo_data = response.json()
        
        return {
            "license": repo_data.get("license", {}).get("name", "") if repo_data.get("license") else "",
            "has_wiki": repo_data.get("has_wiki", False),
            "has_pages": repo_data.get("has_pages", False),
            "has_projects": repo_data.get("has_projects", False),
            "archived": repo_data.get("archived", False),
            "disabled": repo_data.get("disabled", False),
            "private": repo_data.get("private", False),
            "default_branch": repo_data.get("default_branch", ""),
            "topics": repo_data.get("topics", [])
        }
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la récupération des métadonnées pour {owner}/{repo_name}: {e}")
        return {}

In [16]:
def format_date(date_string):
    """
    Formate une date ISO 8601 en format plus lisible
    
    Args:
        date_string (str): Date au format ISO 8601
    
    Returns:
        str: Date formatée (YYYY-MM-DD)
    """
    if not date_string:
        return ""
    
    try:
        # Convertir la date ISO vers un format plus simple
        date_obj = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
        return date_obj.strftime('%Y-%m-%d')
    except (ValueError, AttributeError):
        return date_string

# Code principal modifié
general = create_empty_repository_statistics_dataframe()

for repo in repositories:
    owner, repo_name = repo["full_name"].split("/")
    
    # Extraire et formater les dates
    created_date = format_date(repo.get("created_at", ""))
    updated_date = format_date(repo.get("updated_at", ""))
    
    stats = RepositoryStatistics(
        name=repo["full_name"],
        file_counts=0,  # À corriger avec la fonction de comptage de fichiers
        release_count=0,  # Utiliser la valeur récupérée
        size=repo["size"],
        stars=repo["stargazers_count"],
        forks=repo["forks_count"],
        countributor_count=0,
        clone_count=0,
        country="",
        created_at=created_date,  # Nouvelle donnée
        updated_at=updated_date   # Nouvelle donnée
    )
    general = stats.add_to_dataframe(general)

general

Unnamed: 0,name,file_counts,release_count,size,stars,forks,countributor_count,clone_count,country,created_at,updated_at
0,tensorflow/tensorflow,0,0,1235373,192765,75051,0,0,,2015-11-07,2025-12-12
1,huggingface/transformers,0,0,428326,153781,31403,0,0,,2018-10-29,2025-12-12
2,microsoft/ML-For-Beginners,0,0,1628865,81656,19165,0,0,,2021-03-03,2025-12-12
3,fighting41love/funNLP,0,0,174188,77796,15093,0,0,,2018-08-21,2025-12-12
4,josephmisiti/awesome-machine-learning,0,0,2679,70952,15198,0,0,,2014-07-15,2025-12-12
5,scikit-learn/scikit-learn,0,0,178601,64272,26506,0,0,,2010-08-17,2025-12-12
6,gradio-app/gradio,0,0,312994,40907,3189,0,0,,2018-12-19,2025-12-12
7,TheAlgorithms/C-Plus-Plus,0,0,140614,33506,7662,0,0,,2016-07-16,2025-12-12
8,lutzroeder/netron,0,0,69879,32004,3049,0,0,,2010-12-26,2025-12-12
9,eriklindernoren/ML-From-Scratch,0,0,553,29843,5064,0,0,,2017-02-05,2025-12-12


In [None]:
import time
from collections import Counter

def collect_extended_repository_data():
    """Collecte des données étendues sur les repositories"""
    
    
    general = create_empty_repository_statistics_dataframe()
    
    for i, repo in enumerate(repositories):
        owner, repo_name = repo["full_name"].split("/")
        
        print(f"Traitement de {repo['full_name']} ({i+1}/{len(repositories)})...")
        
        # Données de base
        file_count = get_file_count_from_tree(owner, repo_name)
        release_count = get_release_count(owner, repo_name)
        contributor_count = get_contributor_count(owner, repo_name)
        

        
        # Issues et commits
        commit_stats = get_commit_stats(owner, repo_name)
        
        # Métadonnées
        metadata = get_repository_metadata(owner, repo_name)
        
        # Pays et dates
        created_date = format_date(repo.get("created_at", ""))
        updated_date = format_date(repo.get("updated_at", ""))
        
        topics_list = metadata.get("topics", [])

        # Créer l'objet de statistiques étendues
        stats = RepositoryStatistics(
            name=repo["full_name"],
            file_counts=file_count,
            release_count=release_count,
            size=repo["size"],
            stars=repo["stargazers_count"],
            forks=repo["forks_count"],
            contributor_count=contributor_count,
            created_at=created_date,
            updated_at=updated_date,
            total_commits=commit_stats["total_commits"],
            topics=topics_list  # Stocker la liste complète des topics
        )
        
        general = stats.add_to_dataframe(general)
        
        # Délai pour éviter les limites de l'API
    
    return general

# Exécution
extended_data = collect_extended_repository_data()
extended_data

Traitement de tensorflow/tensorflow (1/50)...
Traitement de huggingface/transformers (2/50)...
Traitement de microsoft/ML-For-Beginners (3/50)...
Traitement de fighting41love/funNLP (4/50)...
Traitement de josephmisiti/awesome-machine-learning (5/50)...
Traitement de scikit-learn/scikit-learn (6/50)...
Traitement de gradio-app/gradio (7/50)...
Traitement de TheAlgorithms/C-Plus-Plus (8/50)...
Traitement de lutzroeder/netron (9/50)...
Traitement de eriklindernoren/ML-From-Scratch (10/50)...
Traitement de ashishpatel26/500-AI-Machine-learning-Deep-learning-Computer-vision-NLP-Projects-with-code (11/50)...
Traitement de ageron/handson-ml2 (12/50)...
Traitement de ZuzooVn/machine-learning-for-software-engineers (13/50)...
Traitement de Ebazhanov/linkedin-skill-assessments-quizzes (14/50)...
Traitement de eugeneyan/applied-ml (15/50)...
Traitement de shap/shap (16/50)...
Traitement de trekhleb/homemade-machine-learning (17/50)...
Traitement de PaddlePaddle/Paddle (18/50)...
Traitement de lu

Unnamed: 0,name,file_counts,release_count,size,stars,forks,contributor_count,created_at,updated_at,total_commits,topics
0,tensorflow/tensorflow,35350,6541,1235292,192765,75051,12301,2015-11-07,2025-12-12,14636,"[deep-learning, deep-neural-networks, distribu..."
1,huggingface/transformers,5174,6991,428121,153781,31403,13141,2018-10-29,2025-12-12,3894,"[audio, deep-learning, deepseek, gemma, glm, h..."
2,microsoft/ML-For-Beginners,16092,0,1628865,81656,19165,3931,2021-03-03,2025-12-12,159,"[data-science, education, machine-learning, ma..."
3,fighting41love/funNLP,111,0,174188,77796,15093,301,2018-08-21,2025-12-12,0,[]
4,josephmisiti/awesome-machine-learning,10,0,2679,70952,15198,13261,2014-07-15,2025-12-12,84,[]
5,scikit-learn/scikit-learn,1721,1441,178601,64272,26506,12301,2010-08-17,2025-12-12,1231,"[data-analysis, data-science, machine-learning..."
6,gradio-app/gradio,3199,129511,312994,40908,3189,13291,2018-12-19,2025-12-12,882,"[data-analysis, data-science, data-visualizati..."
7,TheAlgorithms/C-Plus-Plus,431,0,140614,33506,7662,9511,2016-07-16,2025-12-12,38,"[algorithm, algorithm-competitions, algorithms..."
8,lutzroeder/netron,252,61,69879,32004,3049,1,2010-12-26,2025-12-12,856,"[ai, coreml, deep-learning, deeplearning, kera..."
9,eriklindernoren/ML-From-Scratch,89,0,553,29843,5064,151,2017-02-05,2025-12-12,0,"[data-mining, data-science, deep-learning, dee..."
