In [1]:
import os
os.chdir('../')

In [2]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval
from pandarallel import pandarallel
from collections import defaultdict
from source.lib.helpers import *


In [3]:
pandarallel.initialize(progress_bar = True, nb_workers = 8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [4]:
df = pd.read_parquet('issue/event_study_panel.parquet')
repo_list = df[df['treated']]['repo_name'].unique().tolist()
# ultimately repo_filter should be made all repos
repo_filter = [("repo_name", "in", repo_list)]

In [5]:
def GetNumberOrNull(link):
    try:
        return int(link.split("/")[-1].split("#")[0].split("%")[0].split("?")[0])
    except:
        return np.nan
def GetRepoNumber(link):
    repo_disc_number = "/".join(link.split("/")[-4:-2] + [link.split("/")[-1]]).split("#")[0].split("%")[0].split("?")[0]
    try:
        int(repo_disc_number.split("/")[-1])
        return repo_disc_number
    except:
        return np.nan
def RemoveNA(list):
    return [val for val in list if not pd.isnull(val)]

In [6]:
df_issue_linked_raw = pd.read_parquet('drive/output/scrape/link_issue_pull_request/linked_issue_to_pull_request_new.parquet', filters = repo_filter)

def ExtractSameRepoLinks(github_links, pr_links, repo_name, linking_number):
    disc_numbers = set(pr_links)
    return RemoveNA(list({GetNumberOrNull(link) for link in github_links
                          if repo_name in link and GetNumberOrNull(link) not in disc_numbers and GetNumberOrNull(link) != linking_number}))
def ExtractOtherRepoLinks(github_links, repo_name):
    return RemoveNA(list({GetRepoNumber(link) for link in github_links if repo_name.split("/")[-1] not in link}))
    
def CleanLinkedIssue(df_issue_linked):
    df_issue_linked['github_links'] = df_issue_linked['linked_pull_request'].apply(lambda x: x['github_links'])
    df_issue_linked['pr_links'] = df_issue_linked['linked_pull_request'].apply(lambda x: x['pr_links'])
    df_issue_linked.drop('linked_pull_request', axis=1, inplace=True)

    # possible to have multiple linked PRs
    df_issue_linked['linked_pr'] = df_issue_linked['pr_links'].apply(lambda links: RemoveNA([GetNumberOrNull(link) for link in links]))
    df_issue_linked['same_repo'] = df_issue_linked.apply(lambda row: ExtractSameRepoLinks(row['github_links'], row['linked_pr'], row['repo_name'], row['issue_number']), axis=1)
    df_issue_linked['other_repo'] = df_issue_linked.apply(lambda row: ExtractOtherRepoLinks(row['github_links'], row['repo_name']), axis=1)

    return df_issue_linked.drop(columns = ['github_links','pr_links'])
df_issue_linked = CleanLinkedIssue(df_issue_linked_raw)
print(f"% linked pr, {(100*df_issue_linked['linked_pr'].apply(lambda x: len(x)>0).mean()):2f}")
print(f"% same repo not linked pr, {(100*df_issue_linked['same_repo'].apply(lambda x: len(x)>0).mean()):2f}")
print(f"% other repo discussion, {(100*df_issue_linked['other_repo'].apply(lambda x: len(x)>0).mean()):2f}")

% linked pr, 13.343554
% same repo not linked pr, 11.876934
% other repo discussion, 2.837078


In [7]:
df_pr_linked_raw = pd.read_parquet('drive/output/scrape/link_issue_pull_request/linked_pull_request_to_issue_new.parquet')

def CleanLinkedPR(df_pr_linked):
    df_pr_linked['linked_issue'] = df_pr_linked_raw['issue_link'].apply(lambda links: RemoveNA([GetNumberOrNull(link) for link in links]))
    df_pr_linked['same_repo'] = df_pr_linked.apply(lambda row: ExtractSameRepoLinks(row['other_links'], row['linked_issue'], row['repo_name'], row['pr_number']), axis=1)
    df_pr_linked['other_repo'] = df_pr_linked.apply(lambda row: ExtractOtherRepoLinks(row['other_links'], row['repo_name']), axis=1)

    return df_pr_linked.drop(columns=['issue_link','other_links','pull_request_title','pull_request_text'])

df_pr_linked = CleanLinkedPR(df_pr_linked_raw)
print(f"% linked issue, {(100*df_pr_linked['linked_issue'].apply(lambda x: len(x)>0).mean()):2f}")
print(f"% same repo not linked issue, {(100*df_pr_linked['same_repo'].apply(lambda x: len(x)>0).mean()):2f}")
print(f"% other repo discussion, {(100*df_pr_linked['other_repo'].apply(lambda x: len(x)>0).mean()):2f}")

% linked issue, 11.107339
% same repo not linked issue, 46.632518
% other repo discussion, 7.808827


In [8]:
df_issue_raw = pd.read_parquet(
    'drive/output/derived/data_export/df_issue.parquet',
    filters=repo_filter,
    columns=['repo_name', 'issue_number', 'issue_title', 'issue_body', 'issue_comment_body']
)

df_pr = pd.read_parquet(
    'drive/output/derived/data_export/df_pr.parquet',
    filters=repo_filter,
    columns=['repo_name', 'pr_number', 'pr_title', 'pr_body', 'pr_review_body', 'pr_review_comment_body']
)


In [9]:
pr_index = df_pr[['repo_name','pr_number']].drop_duplicates().set_index(['repo_name','pr_number']).index
df_issue = df_issue_raw.loc[~df_issue_raw.set_index(['repo_name','issue_number']).index.isin(pr_index)]
df_pr_comments = df_issue_raw.loc[df_issue_raw.set_index(['repo_name','issue_number']).index.isin(pr_index)]

In [10]:
fenced_code_block_pattern = re.compile(r'```[\s\S]*?```')
github_url_pattern = re.compile(r'https:\/\/github\.com\/([^\/\s]+)\/([^\/\s]+)\/(pull|issues)\/(\d+)')
path_ref_pattern = re.compile(r'([a-zA-Z0-9_.-]*[a-zA-Z][a-zA-Z0-9_.-]*/[a-zA-Z0-9_.-]*[a-zA-Z][a-zA-Z0-9_.-]*)#(\d+)\w*')
plain_ref_pattern = re.compile(r' #\d+\b')

def HasNumericPathComponent(url):
    return bool(re.search(r'/(pull|issues)/\d+', url))

def CleanText(text):
    return '' if not text or pd.isnull(text) else fenced_code_block_pattern.sub('', text)

def GetCleanedCombinedTextFromColumns(row, columns):
    combined = ' '.join(row[col] if col in row and pd.notnull(row[col]) else '' for col in columns)
    return CleanText(combined)


def ExtractRelevantGithubUrls(row, text_columns):
    cleaned = GetCleanedCombinedTextFromColumns(row, text_columns)
    if 'github.com' not in cleaned and ('pull' not in cleaned or 'issues' not in cleaned):
        return []
    return [f"https://github.com/{match.group(1)}/{match.group(2)}/{match.group(3)}/{match.group(4)}"
            for match in github_url_pattern.finditer(cleaned)]

def ExtractIssuePrRefs(row, text_columns):
    cleaned = GetCleanedCombinedTextFromColumns(row, text_columns)
    if '#' not in cleaned:
        return []
    path_matches = {f'{m[0]}#{m[1]}' for m in path_ref_pattern.findall(cleaned)}
    plain_matches = {m.group(0).strip() for m in plain_ref_pattern.finditer(cleaned)}
    return list(path_matches | plain_matches)


In [11]:
def SampleDataFrames(dataframes, fraction=0.1, random_state=1234):
    return [df.sample(frac=fraction, random_state=random_state) for df in dataframes]

df_issue, df_pr, df_pr_comments, df_pr_linked_raw = SampleDataFrames(
    [df_issue, df_pr, df_pr_comments, df_pr_linked_raw],
    fraction=1,
    random_state=1234
)

In [12]:
df_issue['github_urls'] = df_issue.parallel_apply(lambda r: ExtractRelevantGithubUrls(r, ['issue_title', 'issue_body', 'issue_comment_body']), axis=1)
df_issue['issue_pr_refs'] = df_issue.parallel_apply(lambda r: ExtractIssuePrRefs(r, ['issue_title', 'issue_body', 'issue_comment_body']), axis=1)

df_pr['github_urls'] = df_pr.parallel_apply(lambda r: ExtractRelevantGithubUrls(r, ['pr_title', 'pr_body', 'pr_review_body', 'pr_review_comment_body']), axis=1)
df_pr['issue_pr_refs'] = df_pr.parallel_apply(lambda r: ExtractIssuePrRefs(r, ['pr_title', 'pr_body', 'pr_review_body', 'pr_review_comment_body']), axis=1)

df_pr_comments['github_urls'] = df_pr_comments.parallel_apply(lambda r: ExtractRelevantGithubUrls(r, ['issue_title', 'issue_body', 'issue_comment_body']), axis=1)
df_pr_comments['issue_pr_refs'] = df_pr_comments.parallel_apply(lambda r: ExtractIssuePrRefs(r, ['issue_title', 'issue_body', 'issue_comment_body']), axis=1)

df_pr_linked_raw['github_urls'] = df_pr_linked_raw.parallel_apply(lambda r: ExtractRelevantGithubUrls(r, ['pull_request_title', 'pull_request_text']), axis=1)
df_pr_linked_raw['issue_pr_refs'] = df_pr_linked_raw.parallel_apply(lambda r: ExtractIssuePrRefs(r, ['pull_request_title', 'pull_request_text']), axis=1)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=115541), Label(value='0 / 115541')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=115541), Label(value='0 / 115541')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=130494), Label(value='0 / 130494')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=130494), Label(value='0 / 130494')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=71255), Label(value='0 / 71255')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=71255), Label(value='0 / 71255')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=29089), Label(value='0 / 29089')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=29089), Label(value='0 / 29089')))…

In [15]:
import json
with open('drive/output/derived/graph_structure/graph_metrics.json') as f:
    graph_json = json.load(f)

In [18]:
graph_json['colour-science/colour']

{'201701': {'99779.0': {'degree': 6,
   'normalized_degree': 1.0,
   'betweenness': 0.8,
   'pairwise_overlap': {},
   'weighted_pairwise_overlap': {},
   'overall_overlap': 0.0,
   'weighted_overall_overlap': 0.0,
   'individual_coverage': 100.0,
   'individual_coverage_cluster': 100.0,
   'avg_edge_weight': 16.5,
   'se_edge_weight': 7.710382610480495,
   'percentiles': {'10': 4.0, '25': 5.0, '50': 7.0, '75': 21.75, '90': 38.5},
   'imp_to_imp_comm': {'avg_edge_weight': 0,
    'se_edge_weight': 0,
    'percentiles': {'10': 0, '25': 0, '50': 0, '75': 0, '90': 0}},
   'imp_to_imp_edge_weights': {}},
  '2354108.0': {'degree': 3,
   'normalized_degree': 0.5,
   'betweenness': 0.0,
   'avg_edge_weight': 6.5,
   'se_edge_weight': 1.4999999999999998,
   'percentiles': {'10': 5.3, '25': 5.75, '50': 6.5, '75': 7.25, '90': 7.7},
   'imp_to_imp_comm': {'avg_edge_weight': 51.0,
    'se_edge_weight': 0,
    'percentiles': {'10': 51.0,
     '25': 51.0,
     '50': 51.0,
     '75': 51.0,
     '90': 

In [13]:
df_pr_linked_raw
break

SyntaxError: 'break' outside loop (4218946275.py, line 2)

In [None]:
pr_sources = [df_pr, df_pr_comments.rename(columns={'issue_number':'pr_number'}), df_pr_linked_raw]
df_pr_all = pd.concat([df[['repo_name', 'pr_number', 'github_urls', 'issue_pr_refs']] for df in pr_sources],
                      ignore_index=True)

df_issue_all = df_issue[['repo_name', 'issue_number', 'github_urls', 'issue_pr_refs']]

In [None]:
df_issue_text_ref = df_issue_all.groupby(['repo_name','issue_number'])[['github_urls','issue_pr_refs']].agg(RemoveDuplicatesFlattened).reset_index()
df_pr_text_ref = df_pr_all.groupby(['repo_name','pr_number'])[['github_urls','issue_pr_refs']].agg(RemoveDuplicatesFlattened).reset_index()

In [None]:
df_issue_full_links = pd.merge(df_issue_linked, df_issue_text_ref, how = 'outer', on = ['repo_name','issue_number'])
df_pr_full_links = pd.merge(df_pr_linked, df_pr_text_ref, how = 'outer', on = ['repo_name','pr_number'])

In [None]:
def SameRepoCombine(repo, ref):
    return f"{repo}/{ref.lstrip('#')}"
def OtherRepoCombine(ref):
    return ref.replace('#', '/')
    
def NormalizeIssuePrRefs(row):
    repo = row['repo_name']
    refs = row['issue_pr_refs']
    refs_list = row['discussion_reference'] + [SameRepoCombine(repo, ref) if ref.startswith('#') else OtherRepoCombine(ref) for ref in refs]
    return list(set(refs_list))

def NormalizeDiscussionReferences(df):
    for col in ['linked_issue','same_repo','other_repo', 'github_urls', 'issue_pr_refs']:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: x if isinstance(x, list) else [])

    df['discussion_reference'] = df['github_urls'].apply(lambda urls: ["/".join(url.split("/")[-4:-2] + [url.split("/")[-1]]) for url in urls])
    df['discussion_reference'] = df.apply(NormalizeIssuePrRefs, axis=1)

    return df.drop(columns=['github_urls', 'issue_pr_refs'])

df_issue_full_links = NormalizeDiscussionReferences(df_issue_full_links)
df_pr_full_links = NormalizeDiscussionReferences(df_pr_full_links)

In [None]:
def MigrateSameRepo(row, linked_col, id_col):
    same_repo_refs = [ref for ref in row['discussion_reference'] if row['repo_name'] in ref]
    existing_ids = set(row[linked_col]) | set(row['same_repo']) | {row[id_col]}
    additions = []
    for ref in same_repo_refs:
        ref_number = int(ref.split("/")[-1])
        if ref_number not in existing_ids:
            additions.append(ref_number)
    return sorted(row['same_repo'] + additions)

def MigrateOtherRepo(row):
    additions = [ref for ref in row['discussion_reference'] if row['repo_name'] not in ref and ref not in row['other_repo']]
    return sorted(row['other_repo'] + additions)


df_issue_full_links['same_repo'] = df_issue_full_links.apply(lambda x: MigrateSameRepo(x, 'linked_pr', 'issue_number'), axis=1)
df_issue_full_links['other_repo'] = df_issue_full_links.apply(MigrateOtherRepo, axis=1)
df_issue_full_links.drop('discussion_reference', axis = 1, inplace = True)
df_pr_full_links['same_repo'] = df_pr_full_links.apply(lambda x: MigrateSameRepo(x, 'linked_issue', 'pr_number'), axis=1)
df_pr_full_links['other_repo'] = df_pr_full_links.apply(MigrateOtherRepo, axis=1)
df_pr_full_links.drop('discussion_reference', axis = 1, inplace = True)
df_pr_full_links = df_pr_full_links[~df_pr_full_links['pr_number'].isna()]

In [None]:
total_issues = df_issue_full_links.shape[0]
linked_issues = df_issue_full_links[df_issue_full_links['linked_pr'].apply(len)>0].shape[0]
referencing_issues = df_issue_full_links[df_issue_full_links['same_repo'].apply(len)>0].shape[0]

print(f"{total_issues} total issues")
print(f"{linked_issues} linked to a PR ({linked_issues/total_issues:.2f})")
print(f"{referencing_issues} reference another discussion in the same repo ({referencing_issues/total_issues:.2f})")

total_prs = df_pr_full_links.shape[0]
linked_prs = df_pr_full_links[df_pr_full_links['linked_issue'].apply(len)>0].shape[0]
referencing_prs = df_pr_full_links[df_pr_full_links['same_repo'].apply(len)>0].shape[0]

print(f"\n{total_prs} total PRs")
print(f"{linked_prs} linked to an issue ({linked_prs/total_prs:.2f})")
print(f"{referencing_prs} reference another discussion in the same repo ({referencing_prs/total_prs:.2f})")

In [None]:
def BuildStrictLinks(df_issue_full_links, df_pr_full_links):
    def FlattenColumnFast(series_of_lists):
        return sorted({x for lst in series_of_lists.dropna() for x in lst if pd.notna(x)})

    def AggregateRefs(row_refs, base_refs):
        if not row_refs:
            return base_refs
        flat = FlattenColumnFast(pd.Series(row_refs))
        return sorted(set(base_refs).union(flat))

    def MakeNumeric(list):
        return [int(ele) for ele in list]
        
    pr_lookup = df_pr_full_links.set_index(['repo_name', 'pr_number'])
    issue_lookup = df_issue_full_links.set_index(['repo_name', 'issue_number'])
    reverse_issue_map = (
        df_pr_full_links
        .explode('linked_issue')
        .dropna(subset=['linked_issue'])
        .assign(linked_issue=lambda df: df['linked_issue'].astype(int))
        .groupby(['repo_name', 'linked_issue'])
        .apply(lambda g: g.to_dict(orient='records'))
        .to_dict()
    )
    
    records = []

    # --- Process Issues ---
    for _, row in df_issue_full_links.sort_values(['repo_name', 'issue_number']).iterrows():
        repo = row['repo_name']
        issue = MakeNumeric([row['issue_number']])[0]
        linked_prs = MakeNumeric(row['linked_pr'])

        record = {
            'repo_name': repo,
            'issues': [issue],
            'prs': linked_prs if linked_prs else [],
            'same_repo': row['same_repo'],
            'other_repo': row['other_repo']
        }

        # Expand from directly linked PRs
        if linked_prs:
            pr_rows = [pr_lookup.loc[(repo, pr)] for pr in linked_prs if (repo, pr) in pr_lookup.index]
            if pr_rows:
                record['same_repo'] = AggregateRefs([r['same_repo'] for r in pr_rows], record['same_repo'])
                record['other_repo'] = AggregateRefs([r['other_repo'] for r in pr_rows], record['other_repo'])

        # Also check reverse linkage: if any PR links to this issue
        reverse_prs_data = reverse_issue_map.get((repo, issue), [])
        if reverse_prs_data:
            reverse_prs = MakeNumeric([r['pr_number'] for r in reverse_prs_data])
            record['prs'] = AggregateRefs([reverse_prs], record['prs'])
            record['same_repo'] = AggregateRefs([r['same_repo'] for r in reverse_prs_data], record['same_repo'])
            record['other_repo'] = AggregateRefs([r['other_repo'] for r in reverse_prs_data], record['other_repo'])
            
        records.append(record)

    # Track known linked issue sets to avoid duplicate linking
    seen_linked_issues = {(tuple(r['issues']), r['repo_name']) for r in records if r['prs']}

    # --- Process PRs ---
    for _, row in df_pr_full_links.sort_values(['repo_name', 'pr_number']).iterrows():
        repo = row['repo_name']
        pr = MakeNumeric([row['pr_number']])
        linked_issues = MakeNumeric(row['linked_issue'])

        if linked_issues:
            key = (tuple(linked_issues), repo)
            if key in seen_linked_issues:
                continue

        record = {
            'repo_name': repo,
            'prs': pr,
            'issues': linked_issues if linked_issues else [],
            'same_repo': row['same_repo'],
            'other_repo': row['other_repo']
        }

        if linked_issues:
            issue_rows = [issue_lookup.loc[(repo, iss)] for iss in linked_issues if (repo, iss) in issue_lookup.index]
            if issue_rows:
                record['same_repo'] = AggregateRefs([r['same_repo'] for r in issue_rows], record['same_repo'])
                record['other_repo'] = AggregateRefs([r['other_repo'] for r in issue_rows], record['other_repo'])
            seen_linked_issues.add(key)

        records.append(record)

    df_strict_links = pd.DataFrame.from_records(records)
    df_strict_links['problem_id'] = df_strict_links.apply(lambda x: f"{x['repo_name']}/{int(min(x['issues']+x['prs']))}", axis = 1)
    df_strict_links['problem_id_num'] = df_strict_links.apply(lambda x: int(min(x['issues']+x['prs'])), axis = 1)
    
    df_strict_links = df_strict_links.groupby(['repo_name','problem_id','problem_id_num'])[['issues', 'prs', 'same_repo','other_repo']].agg(RemoveDuplicatesFlattened).reset_index()
    df_strict_links['type'] = df_strict_links.apply(lambda x: 'linked' if len(x['issues'])>0 and len(x['prs'])>0 else 'unlinked pr' if len(x['prs'])>0 else 'unlinked issue', axis = 1)
    df_strict_links['same_repo'] = df_strict_links.apply(lambda x: [ref for ref in x['same_repo'] if ref not in x['issues'] and ref not in x['prs']], axis = 1)
    
    return df_strict_links.sort_values(['repo_name','problem_id_num']).reset_index(drop = True)

In [None]:
df_strict_links = BuildStrictLinks(df_issue_full_links, df_pr_full_links)

In [None]:
def PrepareMatchingInputs(df_strict_links):
    df_matcheable = df_strict_links.query('type != "linked"').copy()

    def ComputeSameRepoSubset(row):
        val = row['issues'][0] if row['type'] == 'unlinked issue' else row['prs'][0]
        return [x for x in row['same_repo'] if x > val] if row['type'] == 'unlinked issue' else [x for x in row['same_repo'] if x < val]

    df_matcheable['same_repo_subset'] = df_matcheable.apply(ComputeSameRepoSubset, axis=1)
    df_matcheable = df_matcheable[df_matcheable['same_repo_subset'].apply(bool)]

    # index: (repo_name, type, val) → same_repo
    index = {
        (row['repo_name'], row['type'], v): row['same_repo']
        for _, row in df_matcheable.iterrows()
        for v in (row['issues'] if row['type'] == 'unlinked issue' else row['prs'])
    }

    return df_matcheable.reset_index(drop=True), index

def RunMutualLinking(df_matcheable, index, mutual_same_repo):
    df_final_match = []
    seen_indices = set()

    # Precompute: (repo, type) → {val → index}
    valToIndex = defaultdict(dict)
    for idx, row in df_matcheable.iterrows():
        val_list = row['prs'] if row['type'] == 'unlinked pr' else row['issues']
        for val in val_list:
            valToIndex[(row['repo_name'], row['type'])][val] = idx

    for idx, row in df_matcheable.iterrows():
        if idx in seen_indices:
            continue

        repo = row['repo_name']
        typ = row['type']
        is_issue = typ == 'unlinked issue'
        val = row['issues'][0] if is_issue else row['prs'][0]
        opposite_type = 'unlinked pr' if is_issue else 'unlinked issue'

        # Filter same_repo_subset for values with mutual link
        if mutual_same_repo:
            subset = [x for x in row['same_repo_subset'] if val in index.get((repo, opposite_type, x), []) and 
                      x in valToIndex[(repo, opposite_type)]]
        else:
            subset = [x for x in row['same_repo_subset'] if x in valToIndex[(repo, opposite_type)]]

        if subset:
            linked_val = min(subset, key=lambda x: abs(x - val))
            linked_idx = valToIndex[(repo, opposite_type)][linked_val]
            if linked_idx in seen_indices:
                continue

            linked_row = df_matcheable.loc[linked_idx]
            seen_indices.update({idx, linked_idx})

            combined_issues = list(set(row['issues'] + linked_row['issues']))
            combined_prs = list(set(row['prs'] + linked_row['prs']))
            combined_same_repo = list(set(row['same_repo'] + linked_row['same_repo']) - set(combined_issues + combined_prs))
            combined_other_repo = list(set(row['other_repo'] + linked_row['other_repo']))

            df_final_match.append({
                'repo_name': repo,
                'issues': combined_issues,
                'prs': combined_prs,
                'same_repo': combined_same_repo,
                'other_repo': combined_other_repo,
                'type': 'linked'
            })
        else:
            seen_indices.add(idx)
            row_dict = row.drop('same_repo_subset').to_dict()
            df_final_match.append(row_dict)
    
    df_final_match = pd.DataFrame(df_final_match)
    df_final_match['problem_id'] = df_final_match.apply(lambda x: f"{x['repo_name']}/{int(min(x['issues']+x['prs']))}", axis = 1)
    df_final_match['problem_id_num'] = df_final_match.apply(lambda x: int(min(x['issues']+x['prs'])), axis = 1)


    
    # Build a set of problem_ids that were already considered for matching
    matcheable_problem_ids = set(df_matcheable['problem_id'])
    
    # Keep only rows that were NOT in either
    df_remaining = df_strict_links[~df_strict_links['problem_id'].isin(matcheable_problem_ids)]
    df_final_match_links = pd.concat([df_final_match, df_remaining], ignore_index=True)

    return df_final_match_links

In [None]:
%%time
df_matcheable, index = PrepareMatchingInputs(df_strict_links)
df_both_match_links = RunMutualLinking(df_matcheable, index, mutual_same_repo = True)

In [None]:
%%time
df_matcheable, index = PrepareMatchingInputs(df_strict_links)
df_single_match_links = RunMutualLinking(df_matcheable, index, mutual_same_repo = False)

In [None]:
print(pd.concat([df_strict_links[['type']].value_counts(), df_strict_links[['type']].value_counts(normalize = True)], axis = 1).round(2))
print(pd.concat([df_both_match_links[['type']].value_counts(), df_both_match_links[['type']].value_counts(normalize = True)], axis = 1).round(2))
print(pd.concat([df_single_match_links[['type']].value_counts(), df_single_match_links[['type']].value_counts(normalize = True)], axis = 1).round(2))

In [None]:
df_strict_links.to_parquet('issue/matched_problems_strict.parquet')
df_both_match_links.to_parquet('issue/matched_problems_both.parquet')
df_single_match_links.to_parquet('issue/matched_problems_single.parquet')

# Total set of problems consists of
- Unlinked issues (can't trace or doesn't exist)
- Unlinked prs (can't trace or doesn't exist)
- Linked issues-pull requests

# Starting dataset
- Ordered by columns `repo_name`, `problem_number` (problem number is a custom new column cr
- Contains column `type in (unlinked issues, unlinked prs, linked)`
- Has columns `issues`, `prs`
- Has columns `same_repo`, `same_repo`, `other_repo`, `other_repo` 
- We know that the linked issues-pull requests are 100% correct

Now, we want to improve on the set of `Linked issues-pull requests`. This will necessarily decrease the number of unlinked issues and PRs. I don't expect the set of `Linked issues-pull requests` to ever be the whole set of problems because there are PRs that aren't linked to issues, and there are issues that don't require a PR. 
# Improvement 1 (strict, requires same_repo of issue and pr to both reference each other
- If, for an issue, a value is listed in `same_repo` and that values meets the following criteria
    1) exceeds `issue_number`
    2) for that same `repo_name`, in df_pr_full_links it is listed as a `pr_number`
    3) In `df_pr_full_links`, for that `pr_number`, `issue_number` is also mentioned in `same_repo` and that `pr_number` does not have a `linked_issue`
    4) If there are multiple numbers that meet thet criteria, pick the one that is closest
# Improvement 2 (less strict, does not require same_repo of issue and pr to both reference each other)
- If, for an issue, a value is listed in `same_repo` and that values meets the following criteria
    1) exceeds `issue_number`
    2) for that same `repo_name`, in df_pr_full_links it is listed as a `pr_number`
    3) In `df_pr_full_links`, that `pr_number` does not have a `linked_issue`
- OR if for a PR, a value is listed in `same_repo` and that values meets the following criteria
    1) is less than `pr_number`
    2) for that same `repo_name`, in df_pr_full_links it is listed as a `pr_number`
    3) In `df_pr_full_links`, that `pr_number` does not have a `linked_issue`
- FOR BOTH
    4) If there are multiple numbers that meet thet criteria, pick the one that is closest
