In [1]:
import pandas as pd
import numpy as np

# ============================
# 1. Load all required tables
# ============================

# Read the five AIDev tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
pr, repo, comments, reviews, commits = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "hf://datasets/hao-li/AIDev/pull_request.parquet",
        "hf://datasets/hao-li/AIDev/repository.parquet",
        "hf://datasets/hao-li/AIDev/pr_comments.parquet",
        "hf://datasets/hao-li/AIDev/pr_reviews.parquet",
        "hf://datasets/hao-li/AIDev/pr_commits.parquet",
    )
)

# ============================
# 2. Attach language to PRs
# ============================

# Lookup of repository id -> language, keyed the way the PR table names it.
repo_lang = repo.loc[:, ["id", "language"]].rename(columns={"id": "repo_id"})

# Left join keeps every PR; PRs with no language after the join are then
# filtered out.
pr = pd.merge(pr, repo_lang, how="left", on="repo_id")
pr = pr[pr["language"].notna()]

# ============================
# 3. Clean agent column
# ============================

def convert_agent(x):
    """Map a raw ``agent`` cell value to 1 (agent) or 0 (anything else).

    Numeric inputs count as agent only when they equal 1. This makes a float
    ``1.0`` recognised; previously it was stringified to ``"1.0"``, matched
    none of the tokens and was silently classified as 0. Every other value is
    compared case-insensitively, after trimming surrounding whitespace,
    against a fixed set of truthy tokens.

    Args:
        x: A single cell value of any type (flag, number, string, or missing).

    Returns:
        1 if the value is recognised as marking an agent, otherwise 0.
    """
    # bool is a subclass of int, so True/False are covered here as well.
    # NaN compares unequal to 1 and therefore yields 0.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return 1 if x == 1 else 0

    # NOTE(review): the output captured below this script lists only "human"
    # groups, even for languages with hundreds of PRs. If the `agent` column
    # stores tool names rather than truthy flags, nothing matches these
    # tokens and every PR ends up as 0 -- confirm the column's actual values.
    token = str(x).lower().strip()
    return 1 if token in {"true", "t", "yes", "y", "agent", "ai", "1"} else 0

# Numeric flag first, then a readable label derived from that same flag.
agent_flags = pr["agent"].map(convert_agent)
author_labels = {1: "agent", 0: "human"}

pr["agent_num"] = agent_flags
pr["author_type"] = agent_flags.map(author_labels)

# ============================
# 4. Merge activity-level metrics
# ============================

def _activity_per_pr(events, count_name):
    """Count rows of ``events`` per ``pr_id``.

    Returns a frame with two columns: ``id`` (the former ``pr_id``) and
    ``count_name`` (the number of rows seen for that PR).
    """
    rows_per_pr = events.groupby("pr_id").size()
    return rows_per_pr.rename(count_name).rename_axis("id").reset_index()


# One count table per activity type, all keyed on "id" for the joins below.
comment_count = _activity_per_pr(comments, "n_comments")
review_count = _activity_per_pr(reviews, "n_reviews")
commit_count = _activity_per_pr(commits, "n_commits")

# Left joins keep every PR; a PR with no rows in an activity table gets NaN.
for activity_counts in (comment_count, review_count, commit_count):
    pr = pr.merge(activity_counts, how="left", on="id")

# A missing count means the PR had no such activity, so record it as zero.
for col in ("n_comments", "n_reviews", "n_commits"):
    pr[col] = pr[col].fillna(0)

# ============================
# 5. PR size proxy: description length
# ============================

# Missing titles/bodies become empty strings so they contribute length 0;
# desc_len is then the combined character count of both fields.
pr = pr.assign(
    title=pr["title"].fillna(""),
    body=pr["body"].fillna(""),
).assign(
    desc_len=lambda frame: frame["title"].str.len() + frame["body"].str.len()
)

# ============================
# 6. Determine merge outcome categories
# ============================

def categorize_outcome(row):
    """Classify one PR row by which of its timestamps are present.

    Args:
        row: Mapping-like object exposing ``merged_at`` and ``closed_at``.

    Returns:
        ``"merged"`` when ``merged_at`` is set; otherwise
        ``"closed_unmerged"`` when ``closed_at`` is set; otherwise
        ``"abandoned"``.
    """
    # merged_at takes precedence: closed_at is only looked at when it is missing.
    if pd.notna(row["merged_at"]):
        return "merged"
    if pd.notna(row["closed_at"]):
        return "closed_unmerged"
    return "abandoned"

# Vectorized form of the merged / closed_unmerged / abandoned rule, replacing
# a Python-level call per row. Conditions are tried in order, so a PR with
# both timestamps set is labelled "merged"; rows matching neither condition
# get the default. This form also works when `pr` has no rows, where the
# row-wise apply does not yield a Series that can be assigned as a column.
pr["merge_outcome"] = np.select(
    [pr["merged_at"].notna(), pr["closed_at"].notna()],
    ["merged", "closed_unmerged"],
    default="abandoned",
)

# ============================
# 7. Group statistics by language × agent
# ============================

# Named-aggregation spec: output column -> (source column, reducer).
# Each rate is the share of PRs in the group carrying that outcome label.
_rq3_spec = {
    "n_prs": ("id", "count"),
    "merged_rate": (
        "merge_outcome",
        lambda outcomes: outcomes.eq("merged").mean(),
    ),
    "failure_rate": (
        "merge_outcome",
        lambda outcomes: outcomes.eq("closed_unmerged").mean(),
    ),
    "abandonment_rate": (
        "merge_outcome",
        lambda outcomes: outcomes.eq("abandoned").mean(),
    ),
    "avg_desc_len": ("desc_len", "mean"),
    "avg_commits": ("n_commits", "mean"),
    "avg_comments": ("n_comments", "mean"),
    "avg_reviews": ("n_reviews", "mean"),
}

# One output row per (language, author_type) pair, with the keys as columns.
_by_language_and_author = pr.groupby(["language", "author_type"])
rq3 = _by_language_and_author.agg(**_rq3_spec).reset_index()

print("\n=== RQ3: Merge Outcomes Across Languages (with PR size/activity controls) ===")
print(rq3.head(40))


=== RQ3: Merge Outcomes Across Languages (with PR size/activity controls) ===
            language author_type  n_prs  merged_rate  failure_rate  \
0      1C Enterprise       human      5     0.400000      0.200000   
1                 AL       human      1     0.000000      0.000000   
2       ActionScript       human      1     0.000000      0.000000   
3        AngelScript       human      5     0.800000      0.200000   
4               Apex       human      2     1.000000      0.000000   
5        AppleScript       human      1     1.000000      0.000000   
6           Assembly       human      3     0.666667      0.333333   
7              Astro       human      3     0.666667      0.333333   
8              Bicep       human     23     0.304348      0.304348   
9                  C       human    259     0.498069      0.320463   
10                C#       human   1985     0.577330      0.310831   
11               C++       human    422     0.495261      0.334123   
12         