Minimal code needed to collect repository data from the GitHub API

In [None]:
import os
from dotenv import load_dotenv
from src.mlProject.entity import RepositoriesAnalytics, GitHubClient

# Read the .env file so GITHUB_TOKEN becomes available through the environment.
load_dotenv()
github_token = os.getenv("GITHUB_TOKEN")  # None when the variable is not set

client = GitHubClient(token=github_token)
analytics = RepositoriesAnalytics(client=client)

# Run one search, sorted by stars, and collect the repositories it returns.
analytics.collect_repository_data_for_search(
    client,
    query="machine learning",
    sort="stars",
)

Minimal code needed to load previously collected repositories from CSV

In [None]:
import os
from dotenv import load_dotenv
from src.mlProject.entity import RepositoriesAnalytics, GitHubClient

# Read the .env file, then build the API client from the GITHUB_TOKEN variable.
load_dotenv()
github_token = os.getenv("GITHUB_TOKEN")  # None when the variable is not set
client = GitHubClient(token=github_token)

# Rebuild the analytics object from what was previously saved under "data".
analytics_from_csv = RepositoriesAnalytics.from_csv("data", client=client)

Collect data from the GitHub API, timing the run and saving the results to CSV. Previously collected repositories are loaded from CSV first.

In [None]:
from src.mlProject.entity import RepositoriesAnalytics, GitHubClient, RepositorySummary
from src.mlProject.constants import REPOSITORY_FEATURES, SUPPORTED_LANGUAGES
from time import time
from dotenv import load_dotenv
import os

# perf_counter is a monotonic clock intended for measuring elapsed time; it is
# imported here so this cell stays runnable on its own.
from time import perf_counter

load_dotenv()
client = GitHubClient(token=os.getenv("GITHUB_TOKEN"))
analytics = RepositoriesAnalytics.from_csv(folder_path="data", client=client)
queries = [
    "machine learning",
    "gaming",
    "biology",
    "cybersecurity",
    "web development",
    "data science",
    "mobile development",
    "devops",
    "blockchain",
    "internet of things",
]

# Tunable knobs for this run.
MAX_QUERIES = 2   # only the first queries are processed while testing
N_RELEASES = 12   # value forwarded as `n_releases` to every search

start = perf_counter()
processed_count = 0
try:
    for query in queries[:MAX_QUERIES]:
        # Collect repositories sorted by stars
        processed_count += analytics.collect_repository_data_for_search(
            client,
            query=query,
            sort="stars",
            n_releases=N_RELEASES,
        )
        # Collect repositories sorted by best matches (no explicit `sort`)
        processed_count += analytics.collect_repository_data_for_search(
            client,
            query=query,
            n_releases=N_RELEASES,
        )
    end = perf_counter()

    print(f"Temps écoulé pour collecter {processed_count} repositories : {int((end - start)//60)} minutes")
finally:
    # Always persist, even when a search raises or the kernel is interrupted,
    # so a failure late in a long run does not skip the save.
    # NOTE(review): assumes `analytics` stays in a saveable state after a
    # failed search - confirm against RepositoriesAnalytics.
    analytics.to_csv("data")

[2026-01-12 19:37:17,812:INFO: utils: Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.]
[2026-01-12 19:37:17,813:INFO: utils: NumExpr defaulting to 16 threads.]


Processing Repositories:   0%|          | 0/35 [00:00<?, ?it/s]

Temps écoulé pour collecter 18 repositories : 8 minutes


Measure how long it takes to initialize `RepositoriesAnalytics` from previously saved CSV data

In [2]:
from src.mlProject.entity import RepositoriesAnalytics, GitHubClient
from time import time
from dotenv import load_dotenv
import os

# perf_counter is the high-resolution, monotonic clock meant for timing short
# operations; time() is wall-clock time and can be adjusted while the code runs.
# It is imported here so this cell stays runnable on its own.
from time import perf_counter

load_dotenv()
client = GitHubClient(token=os.getenv("GITHUB_TOKEN"))

# Time only the CSV load itself; client construction is excluded.
start = perf_counter()
analytics_from_csv = RepositoriesAnalytics.from_csv("data", client=client)
end = perf_counter()

print(f"Temps écoulé pour initialiser {len(analytics_from_csv.existing_repositories)} repositories : {end - start} secondes")

Temps écoulé pour initialiser 18 repositories : 0.08975458145141602 secondes
