In [1]:
import os
os.chdir('../')

In [3]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval
from pandarallel import pandarallel
from collections import defaultdict
#from source.lib.helpers import *

In [4]:
pandarallel.initialize(progress_bar = True, nb_workers = 8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [5]:
repo_list = []
repo_filter = [("repo_name", "in", repo_list)]

% linked pr, 13.343554
% same repo not linked pr, 11.876934
% other repo discussion, 2.837078


% linked issue, 11.107339
% same repo not linked issue, 46.632518
% other repo discussion, 7.808827


In [18]:
total_issues = df_issue_full_links.shape[0]
linked_issues = df_issue_full_links[df_issue_full_links['linked_pr'].apply(len)>0].shape[0]
referencing_issues = df_issue_full_links[df_issue_full_links['same_repo'].apply(len)>0].shape[0]

print(f"{total_issues} total issues")
print(f"{linked_issues} linked to a PR ({linked_issues/total_issues:.2f})")
print(f"{referencing_issues} reference another discussion in the same repo ({referencing_issues/total_issues:.2f})")

total_prs = df_pr_full_links.shape[0]
linked_prs = df_pr_full_links[df_pr_full_links['linked_issue'].apply(len)>0].shape[0]
referencing_prs = df_pr_full_links[df_pr_full_links['same_repo'].apply(len)>0].shape[0]

print(f"\n{total_prs} total PRs")
print(f"{linked_prs} linked to an issue ({linked_prs/total_prs:.2f})")
print(f"{referencing_prs} reference another discussion in the same repo ({referencing_prs/total_prs:.2f})")

181983 total issues
24283 linked to a PR (0.13)
55322 reference another discussion in the same repo (0.30)

234119 total PRs
25848 linked to an issue (0.11)
119409 reference another discussion in the same repo (0.51)


In [19]:
def BuildStrictLinks(df_issue_full_links, df_pr_full_links):
    def FlattenColumnFast(series_of_lists):
        return sorted({x for lst in series_of_lists.dropna() for x in lst if pd.notna(x)})

    def AggregateRefs(row_refs, base_refs):
        if not row_refs:
            return base_refs
        flat = FlattenColumnFast(pd.Series(row_refs))
        return sorted(set(base_refs).union(flat))

    def MakeNumeric(list):
        return [int(ele) for ele in list]
        
    pr_lookup = df_pr_full_links.set_index(['repo_name', 'pr_number'])
    issue_lookup = df_issue_full_links.set_index(['repo_name', 'issue_number'])
    reverse_issue_map = (
        df_pr_full_links
        .explode('linked_issue')
        .dropna(subset=['linked_issue'])
        .assign(linked_issue=lambda df: df['linked_issue'].astype(int))
        .groupby(['repo_name', 'linked_issue'])
        .apply(lambda g: g.to_dict(orient='records'))
        .to_dict()
    )
    
    records = []

    # --- Process Issues ---
    for _, row in df_issue_full_links.sort_values(['repo_name', 'issue_number']).iterrows():
        repo = row['repo_name']
        issue = MakeNumeric([row['issue_number']])[0]
        linked_prs = MakeNumeric(row['linked_pr'])

        record = {
            'repo_name': repo,
            'issues': [issue],
            'prs': linked_prs if linked_prs else [],
            'same_repo': row['same_repo'],
            'other_repo': row['other_repo']
        }

        # Expand from directly linked PRs
        if linked_prs:
            pr_rows = [pr_lookup.loc[(repo, pr)] for pr in linked_prs if (repo, pr) in pr_lookup.index]
            if pr_rows:
                record['same_repo'] = AggregateRefs([r['same_repo'] for r in pr_rows], record['same_repo'])
                record['other_repo'] = AggregateRefs([r['other_repo'] for r in pr_rows], record['other_repo'])

        # Also check reverse linkage: if any PR links to this issue
        reverse_prs_data = reverse_issue_map.get((repo, issue), [])
        if reverse_prs_data:
            reverse_prs = MakeNumeric([r['pr_number'] for r in reverse_prs_data])
            record['prs'] = AggregateRefs([reverse_prs], record['prs'])
            record['same_repo'] = AggregateRefs([r['same_repo'] for r in reverse_prs_data], record['same_repo'])
            record['other_repo'] = AggregateRefs([r['other_repo'] for r in reverse_prs_data], record['other_repo'])
            
        records.append(record)

    # Track known linked issue sets to avoid duplicate linking
    seen_linked_issues = {(tuple(r['issues']), r['repo_name']) for r in records if r['prs']}

    # --- Process PRs ---
    for _, row in df_pr_full_links.sort_values(['repo_name', 'pr_number']).iterrows():
        repo = row['repo_name']
        pr = MakeNumeric([row['pr_number']])
        linked_issues = MakeNumeric(row['linked_issue'])

        if linked_issues:
            key = (tuple(linked_issues), repo)
            if key in seen_linked_issues:
                continue

        record = {
            'repo_name': repo,
            'prs': pr,
            'issues': linked_issues if linked_issues else [],
            'same_repo': row['same_repo'],
            'other_repo': row['other_repo']
        }

        if linked_issues:
            issue_rows = [issue_lookup.loc[(repo, iss)] for iss in linked_issues if (repo, iss) in issue_lookup.index]
            if issue_rows:
                record['same_repo'] = AggregateRefs([r['same_repo'] for r in issue_rows], record['same_repo'])
                record['other_repo'] = AggregateRefs([r['other_repo'] for r in issue_rows], record['other_repo'])
            seen_linked_issues.add(key)

        records.append(record)

    df_strict_links = pd.DataFrame.from_records(records)
    df_strict_links['problem_id'] = df_strict_links.apply(lambda x: f"{x['repo_name']}/{int(min(x['issues']+x['prs']))}", axis = 1)
    df_strict_links['problem_id_num'] = df_strict_links.apply(lambda x: int(min(x['issues']+x['prs'])), axis = 1)
    
    df_strict_links = df_strict_links.groupby(['repo_name','problem_id','problem_id_num'])[['issues', 'prs', 'same_repo','other_repo']].agg(RemoveDuplicatesFlattened).reset_index()
    df_strict_links['type'] = df_strict_links.apply(lambda x: 'linked' if len(x['issues'])>0 and len(x['prs'])>0 else 'unlinked pr' if len(x['prs'])>0 else 'unlinked issue', axis = 1)
    df_strict_links['same_repo'] = df_strict_links.apply(lambda x: [ref for ref in x['same_repo'] if ref not in x['issues'] and ref not in x['prs']], axis = 1)
    
    return df_strict_links.sort_values(['repo_name','problem_id_num']).reset_index(drop = True)

In [20]:
df_strict_links = BuildStrictLinks(df_issue_full_links, df_pr_full_links)

  .apply(lambda g: g.to_dict(orient='records'))


In [21]:
def PrepareMatchingInputs(df_strict_links):
    df_matcheable = df_strict_links.query('type != "linked"').copy()

    def ComputeSameRepoSubset(row):
        val = row['issues'][0] if row['type'] == 'unlinked issue' else row['prs'][0]
        return [x for x in row['same_repo'] if x > val] if row['type'] == 'unlinked issue' else [x for x in row['same_repo'] if x < val]

    df_matcheable['same_repo_subset'] = df_matcheable.apply(ComputeSameRepoSubset, axis=1)
    df_matcheable = df_matcheable[df_matcheable['same_repo_subset'].apply(bool)]

    # index: (repo_name, type, val) → same_repo
    index = {
        (row['repo_name'], row['type'], v): row['same_repo']
        for _, row in df_matcheable.iterrows()
        for v in (row['issues'] if row['type'] == 'unlinked issue' else row['prs'])
    }

    return df_matcheable.reset_index(drop=True), index

def RunMutualLinking(df_matcheable, index, mutual_same_repo):
    df_final_match = []
    seen_indices = set()

    # Precompute: (repo, type) → {val → index}
    valToIndex = defaultdict(dict)
    for idx, row in df_matcheable.iterrows():
        val_list = row['prs'] if row['type'] == 'unlinked pr' else row['issues']
        for val in val_list:
            valToIndex[(row['repo_name'], row['type'])][val] = idx

    for idx, row in df_matcheable.iterrows():
        if idx in seen_indices:
            continue

        repo = row['repo_name']
        typ = row['type']
        is_issue = typ == 'unlinked issue'
        val = row['issues'][0] if is_issue else row['prs'][0]
        opposite_type = 'unlinked pr' if is_issue else 'unlinked issue'

        # Filter same_repo_subset for values with mutual link
        if mutual_same_repo:
            subset = [x for x in row['same_repo_subset'] if val in index.get((repo, opposite_type, x), []) and 
                      x in valToIndex[(repo, opposite_type)]]
        else:
            subset = [x for x in row['same_repo_subset'] if x in valToIndex[(repo, opposite_type)]]

        if subset:
            linked_val = min(subset, key=lambda x: abs(x - val))
            linked_idx = valToIndex[(repo, opposite_type)][linked_val]
            if linked_idx in seen_indices:
                continue

            linked_row = df_matcheable.loc[linked_idx]
            seen_indices.update({idx, linked_idx})

            combined_issues = list(set(row['issues'] + linked_row['issues']))
            combined_prs = list(set(row['prs'] + linked_row['prs']))
            combined_same_repo = list(set(row['same_repo'] + linked_row['same_repo']) - set(combined_issues + combined_prs))
            combined_other_repo = list(set(row['other_repo'] + linked_row['other_repo']))

            df_final_match.append({
                'repo_name': repo,
                'issues': combined_issues,
                'prs': combined_prs,
                'same_repo': combined_same_repo,
                'other_repo': combined_other_repo,
                'type': 'linked'
            })
        else:
            seen_indices.add(idx)
            row_dict = row.drop('same_repo_subset').to_dict()
            df_final_match.append(row_dict)
    
    df_final_match = pd.DataFrame(df_final_match)
    df_final_match['problem_id'] = df_final_match.apply(lambda x: f"{x['repo_name']}/{int(min(x['issues']+x['prs']))}", axis = 1)
    df_final_match['problem_id_num'] = df_final_match.apply(lambda x: int(min(x['issues']+x['prs'])), axis = 1)


    
    # Build a set of problem_ids that were already considered for matching
    matcheable_problem_ids = set(df_matcheable['problem_id'])
    
    # Keep only rows that were NOT in either
    df_remaining = df_strict_links[~df_strict_links['problem_id'].isin(matcheable_problem_ids)]
    df_final_match_links = pd.concat([df_final_match, df_remaining], ignore_index=True)

    return df_final_match_links

In [22]:
%%time
df_matcheable, index = PrepareMatchingInputs(df_strict_links)
df_both_match_links = RunMutualLinking(df_matcheable, index, mutual_same_repo = True)

CPU times: user 15.8 s, sys: 196 ms, total: 16 s
Wall time: 16.4 s


In [23]:
%%time
df_matcheable, index = PrepareMatchingInputs(df_strict_links)
df_single_match_links = RunMutualLinking(df_matcheable, index, mutual_same_repo = False)

CPU times: user 15.9 s, sys: 262 ms, total: 16.1 s
Wall time: 16.6 s


In [24]:
print(pd.concat([df_strict_links[['type']].value_counts(), df_strict_links[['type']].value_counts(normalize = True)], axis = 1).round(2))
print(pd.concat([df_both_match_links[['type']].value_counts(), df_both_match_links[['type']].value_counts(normalize = True)], axis = 1).round(2))
print(pd.concat([df_single_match_links[['type']].value_counts(), df_single_match_links[['type']].value_counts(normalize = True)], axis = 1).round(2))

                 count  proportion
type                              
unlinked pr     208116        0.53
unlinked issue  155045        0.40
linked           27397        0.07
                 count  proportion
type                              
unlinked pr     194840        0.52
unlinked issue  142851        0.38
linked           38423        0.10
                 count  proportion
type                              
unlinked pr     188507        0.51
unlinked issue  142654        0.39
linked           38712        0.10


In [25]:
df_strict_links.to_parquet('issue/matched_problems_strict.parquet')
df_both_match_links.to_parquet('issue/matched_problems_both.parquet')
df_single_match_links.to_parquet('issue/matched_problems_single.parquet')

In [26]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Total set of problems consists of
- Unlinked issues (can't trace or doesn't exist)
- Unlinked prs (can't trace or doesn't exist)
- Linked issues-pull requests

# Starting dataset
- Ordered by columns `repo_name`, `problem_number` (problem number is a custom new column cr
- Contains column `type in (unlinked issues, unlinked prs, linked)`
- Has columns `issues`, `prs`
- Has columns `same_repo`, `same_repo`, `other_repo`, `other_repo` 
- We know that the linked issues-pull requests are 100% correct

Now, we want to improve on the set of `Linked issues-pull requests`. This will necessarily decrease the number of unlinked issues and PRs. I don't expect the set of `Linked issues-pull requests` to ever be the whole set of problems because there are PRs that aren't linked to issues, and there are issues that don't require a PR. 
# Improvement 1 (strict, requires same_repo of issue and pr to both reference each other
- If, for an issue, a value is listed in `same_repo` and that values meets the following criteria
    1) exceeds `issue_number`
    2) for that same `repo_name`, in df_pr_full_links it is listed as a `pr_number`
    3) In `df_pr_full_links`, for that `pr_number`, `issue_number` is also mentioned in `same_repo` and that `pr_number` does not have a `linked_issue`
    4) If there are multiple numbers that meet thet criteria, pick the one that is closest
# Improvement 2 (less strict, does not require same_repo of issue and pr to both reference each other)
- If, for an issue, a value is listed in `same_repo` and that values meets the following criteria
    1) exceeds `issue_number`
    2) for that same `repo_name`, in df_pr_full_links it is listed as a `pr_number`
    3) In `df_pr_full_links`, that `pr_number` does not have a `linked_issue`
- OR if for a PR, a value is listed in `same_repo` and that values meets the following criteria
    1) is less than `pr_number`
    2) for that same `repo_name`, in df_pr_full_links it is listed as a `pr_number`
    3) In `df_pr_full_links`, that `pr_number` does not have a `linked_issue`
- FOR BOTH
    4) If there are multiple numbers that meet thet criteria, pick the one that is closest
