In [1]:
# Show which Python interpreter backs this kernel, so it is easy to confirm
# that the notebook is running inside the intended environment.
import sys

interpreter_path = sys.executable
print(interpreter_path)

e:\TA\github-analytics-ml\venv\Scripts\python.exe


In [2]:
import requests
import pandas as pd
import numpy as np
import time
import os
from dotenv import load_dotenv

# Read the GitHub token from the environment (populated by load_dotenv) rather
# than hard-coding it in the notebook.
load_dotenv()
TOKEN = os.getenv("GITHUB_TOKEN")
if not TOKEN:
    # Fail early with a clear message instead of sending "token None" and
    # hitting an opaque KeyError on the response further down.
    raise RuntimeError("GITHUB_TOKEN is not set; define it in the environment or in a .env file")
HEADERS = {"Authorization": f"token {TOKEN}"}

# Sanity check: confirm the token is accepted before starting the long crawl.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get("https://api.github.com/user", headers=HEADERS, timeout=30)
r.raise_for_status()  # surfaces 401/403 as an HTTPError with the real status
print("Login sebagai:", r.json()["login"])

# Report the remaining API quota so the crawl size can be judged against it.
rl_response = requests.get("https://api.github.com/rate_limit", headers=HEADERS, timeout=30)
rl_response.raise_for_status()
rl = rl_response.json()
print(f"Rate limit sisa: {rl['rate']['remaining']} / {rl['rate']['limit']}")

Login sebagai: latifasalsab
Rate limit sisa: 4998 / 5000


In [3]:
# Draw a reproducible sample of repositories whose activity will be fetched.
SAMPLE_SIZE  = 600  # upper bound on the number of repositories to sample
RANDOM_STATE = 42   # fixed seed so every run picks the same repositories

df_kaggle = pd.read_csv("../data/kaggle_filtered.csv")

# DataFrame.sample (without replacement) raises when asked for more rows than
# the frame holds, so clamp the request to the available row count.
n_sample  = min(SAMPLE_SIZE, len(df_kaggle))
df_sample = df_kaggle.sample(n_sample, random_state=RANDOM_STATE).reset_index(drop=True)
repo_list = df_sample["full_name"].tolist()

print(f"Total repo: {len(repo_list)}")
print("Contoh:", repo_list[:3])

Total repo: 600
Contoh: ['BradyFU/Awesome-Multimodal-Large-Language-Models', 'huginn/huginn', 'MobSF/Mobile-Security-Framework-MobSF']


In [4]:
def fetch_commit_timeseries(full_name, max_retries=3, retry_delay=3, timeout=30):
    """Fetch the weekly commit counts of a repository from the GitHub API.

    Calls ``/repos/{full_name}/stats/participation`` using the module-level
    ``HEADERS`` and returns the ``"all"`` series of the JSON response.

    Args:
        full_name: Repository identifier in ``owner/name`` form.
        max_retries: How many additional attempts to make while the API keeps
            answering 202.
        retry_delay: Seconds to wait between attempts.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The list stored under ``"all"`` (``[]`` if the key is absent) on a 200
        response; ``None`` for any other final status, including a 202 that
        never resolved within the retry budget.

    Raises:
        requests.exceptions.RequestException: On network errors or timeouts.
    """
    url = f"https://api.github.com/repos/{full_name}/stats/participation"

    for attempt in range(max_retries + 1):
        r = requests.get(url, headers=HEADERS, timeout=timeout)

        if r.status_code == 200:
            return r.json().get("all", [])

        if r.status_code != 202:
            # Not found, forbidden, rate limited, ...: retrying will not help.
            return None

        # 202 is treated as "not ready yet": wait and ask again, but do not
        # sleep after the final attempt.
        if attempt < max_retries:
            time.sleep(retry_delay)

    return None

def _count_or_zero(value):
    """Convert a metadata count to ``int``, mapping missing values (None/NaN) to 0."""
    # When the metadata dict holds NaN/None under an existing key,
    # ``dict.get(key, 0)`` returns that value instead of the default and a
    # plain ``int(...)`` would raise.
    if pd.isna(value):
        return 0
    return int(value)


def build_features(full_name, commits_per_week, meta_row):
    """Build one feature row from a repository's weekly commit series.

    Args:
        full_name: Repository identifier, copied into the output row.
        commits_per_week: Non-empty sequence of commit counts, one per week,
            oldest first.
        meta_row: Mapping with the optional keys ``description``, ``license``,
            ``forks``, ``issues``, ``stars`` and ``defaultBranchCommitCount``.
            Missing keys and None/NaN values count as 0 / "absent".

    Returns:
        dict with the activity statistics and metadata-derived fields.
        Note that ``active_days_ratio`` is the share of *weeks* with at least
        one commit (the key name is kept for compatibility).

    Raises:
        ValueError: If ``commits_per_week`` is empty.
    """
    commits = np.array(commits_per_week, dtype=float)
    if commits.size == 0:
        # Make the failure explicit instead of an obscure numpy reduction error.
        raise ValueError("commits_per_week must contain at least one week")

    weeks        = np.arange(len(commits))
    mean_c       = commits.mean()
    std_c        = commits.std()
    active_weeks = np.sum(commits > 0)

    # A line fit needs at least two points; an all-zero series has slope 0.
    if commits.size > 1 and commits.sum() > 0:
        slope = np.polyfit(weeks, commits, 1)[0]
    else:
        slope = 0.0

    # The 1e-9 terms only guard against division by zero. When the
    # denominator is exactly 0 and the numerator is not, the ratio becomes
    # very large rather than undefined.
    return {
        "full_name":             full_name,
        "commit_frequency":      round(mean_c, 4),
        "activity_consistency":  round(std_c, 4),
        "commit_trend":          round(float(slope), 6),
        "active_days_ratio":     round(active_weeks / len(commits), 4),
        "peak_to_average_ratio": round(commits.max() / (mean_c + 1e-9), 4),
        # Mean of the last two weeks relative to the two weeks before them.
        "recent_vs_past_ratio":  round(commits[-2:].mean() / (commits[-4:-2].mean() + 1e-9), 4),
        "velocity_stability":    round(std_c / (mean_c + 1e-9), 4),
        "has_description":       1 if pd.notna(meta_row.get("description")) else 0,
        "has_license":           1 if pd.notna(meta_row.get("license")) else 0,
        "forks_count":           _count_or_zero(meta_row.get("forks", 0)),
        "open_issues_count":     _count_or_zero(meta_row.get("issues", 0)),
        "stars":                 _count_or_zero(meta_row.get("stars", 0)),
        "commit_count_total":    _count_or_zero(meta_row.get("defaultBranchCommitCount", 0)),
    }

In [5]:
# Crawl every sampled repository and build one feature row per success.
# Failures are recorded in `errors` so the loop never aborts half-way.
results = []
errors  = []
total   = len(repo_list)  # used in the progress line instead of a hard-coded count

# Index the metadata once so each iteration is a dict lookup instead of a
# full DataFrame scan. Keeping the first row per name matches the previous
# "first match" behaviour; drop=False keeps `full_name` inside each record.
meta_by_name = (
    df_sample.drop_duplicates(subset="full_name")
    .set_index("full_name", drop=False)
    .to_dict("index")
)

for i, full_name in enumerate(repo_list):
    try:
        meta    = meta_by_name[full_name]
        commits = fetch_commit_timeseries(full_name)

        # Only accept a complete 52-entry series; anything else counts as a failure.
        if commits and len(commits) == 52:
            results.append(build_features(full_name, commits, meta))
        else:
            errors.append(full_name)

    except Exception as e:
        # Keep the reason next to the repository name for later inspection.
        errors.append(f"{full_name} → {e}")

    if (i + 1) % 50 == 0:
        print(f"[{i+1}/{total}] ✅ Berhasil: {len(results)} | ❌ Gagal: {len(errors)}")

    # Throttle requests to stay well inside the API rate limit.
    time.sleep(0.8)

print(f"\nSelesai! ✅ {len(results)} berhasil | ❌ {len(errors)} gagal")

[50/600] ✅ Berhasil: 50 | ❌ Gagal: 0
[100/600] ✅ Berhasil: 100 | ❌ Gagal: 0
[150/600] ✅ Berhasil: 150 | ❌ Gagal: 0
[200/600] ✅ Berhasil: 200 | ❌ Gagal: 0
[250/600] ✅ Berhasil: 250 | ❌ Gagal: 0
[300/600] ✅ Berhasil: 300 | ❌ Gagal: 0
[350/600] ✅ Berhasil: 350 | ❌ Gagal: 0
[400/600] ✅ Berhasil: 400 | ❌ Gagal: 0
[450/600] ✅ Berhasil: 450 | ❌ Gagal: 0
[500/600] ✅ Berhasil: 499 | ❌ Gagal: 1
[550/600] ✅ Berhasil: 549 | ❌ Gagal: 1
[600/600] ✅ Berhasil: 599 | ❌ Gagal: 1

Selesai! ✅ 599 berhasil | ❌ 1 gagal


In [6]:
# Turn the collected feature dicts into a table and write it to disk
# (without the index column), then preview the first rows.
raw_features_csv = "../data/github_raw_features.csv"

df_raw = pd.DataFrame(results)
df_raw.to_csv(raw_features_csv, index=False)

print(f"Tersimpan! Shape: {df_raw.shape}")
df_raw.head()

Tersimpan! Shape: (599, 14)


Unnamed: 0,full_name,commit_frequency,activity_consistency,commit_trend,active_days_ratio,peak_to_average_ratio,recent_vs_past_ratio,velocity_stability,has_description,has_license,forks_count,open_issues_count,stars,commit_count_total
0,BradyFU/Awesome-Multimodal-Large-Language-Models,2.0192,4.2403,-0.053061,0.6346,11.8857,2.0,2.1,1,0,1050,148,16144,859
1,huginn/huginn,0.5769,1.498,0.019551,0.2115,12.1333,0.1667,2.5966,1,1,4058,2170,47164,3819
2,MobSF/Mobile-Security-Framework-MobSF,0.4038,0.9043,-0.005592,0.25,12.381,0.0,2.2391,1,1,3462,1534,19340,2015
3,StevenBlack/hosts,6.1154,3.5608,0.049774,0.9808,2.2893,1.5833,0.5823,1,1,2341,2115,28772,4222
4,uber-go/zap,0.1346,0.4398,0.003031,0.0962,14.8571,0.0,3.267,1,1,1484,651,23580,698
