Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion docs/quickstart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"import githubcontribs\n",
"\n",
"fetcher = githubcontribs.Fetcher(\"laminlabs\")\n",
"df = fetcher.run(\"lamindb\")\n",
"df = fetcher.fetch_contribs(\"lamindb\")\n",
"df.head()"
]
},
Expand All @@ -33,6 +33,16 @@
"plotter.plot_total_number_by_author_by_type()\n",
"plotter.plot_number_by_month_by_author()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63424e9a",
"metadata": {},
"outputs": [],
"source": [
"fetcher.fetch_repos(2025)"
]
}
],
"metadata": {
Expand Down
99 changes: 96 additions & 3 deletions githubcontribs/_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import requests
import urllib3
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util.retry import Retry

warnings.filterwarnings("ignore")
Expand Down Expand Up @@ -57,7 +58,7 @@ def __init__(self, org_name: str, token: str = None):
self.session.mount("https://", adapter)
self.session.headers.update(self.headers)

def _fetch_contribs_as_dicts(
def _fetch_contribs_per_repo_as_dicts(
self, repo_name: str, start_date: str | datetime | None = None
) -> tuple[list[dict], list[dict], list[dict]]:
"""Get commits, issues, and PRs for a specific repository since start_date as dicts."""
Expand Down Expand Up @@ -112,7 +113,9 @@ def _fetch_contribs_per_repo(
self, repo_name: str, start_date: str | None = None
) -> pd.DataFrame:
"""Get commits, issues, and PRs for a specific repository since start_date as dataframes."""
commits, issues, prs = self._fetch_contribs_as_dicts(repo_name, start_date)
commits, issues, prs = self._fetch_contribs_per_repo_as_dicts(
repo_name, start_date
)

# Convert start_date for filtering issues and PRs
if isinstance(start_date, str):
Expand Down Expand Up @@ -176,7 +179,7 @@ def _fetch_contribs_per_repo(

return pd.DataFrame(data)

def run(
def fetch_contribs(
self, repo_names: str | list[str], *, start_date: str | None = None
) -> pd.DataFrame:
"""Get commits, issues, and PRs for all or specific repositories since start_date as a dataframe.
Expand All @@ -194,3 +197,93 @@ def run(
contribs = pd.concat([contribs, repo_contribs], ignore_index=True)

return contribs

def fetch_repos(self, year: int) -> list[str]:
    """Get repositories in the organization that had any activity in the specified year.

    A repository counts as active when it has at least one commit dated within
    ``year`` or, failing that, at least one issue or pull request that was
    created within ``year``.

    Args:
        year: Calendar year to check, e.g. ``2025``.

    Returns:
        Names of the active repositories, in the order the API listed them.
        Errors while listing repositories stop the scan early and errors while
        probing a single repository skip that repository; both are reported
        on stdout rather than raised.
    """
    active_repos = []
    all_repos = []
    page = 1
    start_date = f"{year}-01-01T00:00:00Z"
    end_date = f"{year}-12-31T23:59:59Z"

    def has_commits_in_year(repo_name: str) -> bool:
        """Return True if the repository has at least one commit within the year."""
        response = self.session.get(
            f"{self.base_url}/repos/{self.org_name}/{repo_name}/commits",
            params={  # type: ignore
                "since": start_date,
                "until": end_date,
                "per_page": 1,
            },
        )
        # Non-200 answers are treated as "no commits" (best effort).
        return response.status_code == 200 and bool(response.json())

    def has_issues_in_year(repo_name: str) -> bool:
        """Return True if any issue/PR of the repository was created within the year."""
        issues_page = 1
        while True:
            response = self.session.get(
                f"{self.base_url}/repos/{self.org_name}/{repo_name}/issues",
                params={  # type: ignore
                    # Per the GitHub REST docs `since` filters on the *update*
                    # time, so it only narrows the candidates; the creation
                    # date still has to be checked below.
                    "since": start_date,
                    "state": "all",
                    "sort": "created",
                    "direction": "desc",
                    "per_page": 100,
                    "page": issues_page,
                },
            )
            if response.status_code != 200:
                return False
            issues = response.json()
            if not issues:
                return False
            for issue in issues:
                issue_created = datetime.fromisoformat(
                    issue["created_at"].replace("Z", "+00:00")
                )
                if issue_created.year == year:
                    return True
                if issue_created.year < year:
                    # Results are newest-first, so everything that follows is
                    # older still and cannot fall into the target year.
                    return False
            # Every issue on this page was created after the target year;
            # looking only at the newest issue (as a single-item query would)
            # misses older ones, so keep paging.
            issues_page += 1

    print(f"Fetching repositories with activity in {year}...")

    with tqdm(desc="Fetching repositories", unit="page") as pbar:
        while True:
            try:
                response = self.session.get(
                    f"{self.base_url}/orgs/{self.org_name}/repos",
                    params={  # type: ignore
                        "page": page,
                        "per_page": 100,
                        "sort": "updated",
                        "direction": "desc",
                    },
                )
                response.raise_for_status()
                repos = response.json()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching repositories page {page}: {e}")
                break

            # An empty page marks the end of the listing.
            if not repos:
                break

            all_repos.extend(repo["name"] for repo in repos)

            for repo in repos:
                repo_name = repo["name"]
                try:
                    # Commits are the cheap check; only fall back to
                    # issues/PRs when no commit is found.
                    if has_commits_in_year(repo_name):
                        active_repos.append(repo_name)
                        print(f"✓ {repo_name} - Active (has commits in {year})")
                    elif has_issues_in_year(repo_name):
                        active_repos.append(repo_name)
                        print(f"✓ {repo_name} - Active (has issues/PRs in {year})")
                except requests.exceptions.RequestException as e:
                    print(f"Warning: Error checking activity for {repo_name}: {e}")

            # Tick once the whole page of repositories has been checked.
            pbar.update(1)
            page += 1

    print(f"\nFound {len(all_repos)} total repositories")
    print(f"Found {len(active_repos)} active repositories in {year}")
    return active_repos
12 changes: 5 additions & 7 deletions githubcontribs/_plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def __init__(self, df: pd.DataFrame):
def plot_total_number_by_author_by_type(
self,
top_n: int = 10,
exclude_author: str = "github-actions[bot]",
start_date: str = None,
):
"""Plot total contributions by author, grouped by contribution type.
Expand All @@ -49,14 +48,12 @@ def plot_total_number_by_author_by_type(
x="author",
hue="type",
top_n=top_n,
exclude_author=exclude_author,
start_date=start_date,
)

def plot_number_by_month_by_author(
self,
top_n: int = 10,
exclude_author: str = "github-actions[bot]",
type_filter: str = "pr",
start_date: str = None,
):
Expand All @@ -77,7 +74,6 @@ def plot_number_by_month_by_author(
x="time",
hue="author",
top_n=top_n,
exclude_author=exclude_author,
type_filter=type_filter,
start_date=start_date,
)
Expand All @@ -87,7 +83,7 @@ def _plot_contributions(
x: str = "author",
hue: str = "type",
top_n: int = 10,
exclude_author: str = "github-actions[bot]",
exclude_authors: list[str] = None,
time_aggregation: str = "month",
type_filter: str = None,
start_date: str = None,
Expand All @@ -98,12 +94,14 @@ def _plot_contributions(
x: Variable to plot on x-axis. Options: "author", "time". Defaults to "author".
hue: Variable to use for color grouping. Options: "type", "author". Defaults to "type".
top_n: Number of top items to show (authors or time periods). Defaults to 10.
exclude_author: Author to exclude from the plot. Defaults to "github-actions[bot]".
exclude_authors: Authors to exclude from the plot. Defaults to None, which excludes "github-actions[bot]" and "invalid-email-address".
time_aggregation: Time aggregation level when x="time". Options: "day", "week", "month", "year". Defaults to "month".
type_filter: Filter to specific contribution type. Options: "commit", "issue", "pr", or None for all types.
start_date: Filter contributions to only include those on or after this date. Format: "YYYY-MM-DD". Defaults to None (no filter).
"""
df = self.df[self.df.author != exclude_author].copy()
if exclude_authors is None:
exclude_authors = ["github-actions[bot]", "invalid-email-address"]
df = self.df[~self.df.author.isin(exclude_authors)].copy()

# Convert date column to datetime
df["date"] = pd.to_datetime(df["date"])
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies = [
"requests",
"dotenv",
"seaborn",
"tqdm",
]

[project.urls]
Expand Down