In [1]:
import os
os.chdir('../')

In [2]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval

In [3]:
df = pd.read_parquet('issue/event_study_panel.parquet')
repo_list = df[df['treated']]['repo_name'].unique().tolist()
# ultimately repo_filter should be made all repos
repo_filter = [("repo_name", "in", repo_list)]

In [157]:
def GetNumberOrNull(link):
    try:
        return int(link.split("/")[-1].split("#")[0].split("%")[0].split("?")[0])
    except:
        return np.nan
def GetRepoNumber(link):
    repo_disc_number = "/".join(link.split("/")[-4:-2] + [link.split("/")[-1]]).split("#")[0].split("%")[0].split("?")[0]
    try:
        int(repo_disc_number.split("/")[-1])
        return repo_disc_number
    except:
        return np.nan
def RemoveNA(list):
    return [val for val in list if not pd.isnull(val)]

In [109]:
df_issue_linked_raw = pd.read_parquet('drive/output/scrape/link_issue_pull_request/linked_issue_to_pull_request_new.parquet', filters = repo_filter)

def ExtractSameRepoLinks(github_links, pr_links, repo_name, linking_number):
    disc_numbers = set(pr_links)
    return RemoveNA(list({GetNumberOrNull(link) for link in github_links
                          if repo_name in link and GetNumberOrNull(link) not in disc_numbers and GetNumberOrNull(link) != linking_number}))
def ExtractOtherRepoLinks(github_links, repo_name):
    return RemoveNA(list({GetRepoNumber(link) for link in github_links if repo_name.split("/")[-1] not in link}))
    
def CleanLinkedIssue(df_issue_linked):
    df_issue_linked['github_links'] = df_issue_linked['linked_pull_request'].apply(lambda x: x['github_links'])
    df_issue_linked['pr_links'] = df_issue_linked['linked_pull_request'].apply(lambda x: x['pr_links'])
    df_issue_linked.drop('linked_pull_request', axis=1, inplace=True)

    # possible to have multiple linked PRs
    df_issue_linked['linked_pr'] = df_issue_linked['pr_links'].apply(lambda links: RemoveNA([GetNumberOrNull(link) for link in links]))
    df_issue_linked['same_repo'] = df_issue_linked.apply(lambda row: ExtractSameRepoLinks(row['github_links'], row['linked_pr'], row['repo_name'], row['issue_number']), axis=1)
    df_issue_linked['other_repo'] = df_issue_linked.apply(lambda row: ExtractOtherRepoLinks(row['github_links'], row['repo_name']), axis=1)

    return df_issue_linked.drop(columns = ['github_links','pr_links'])
df_issue_linked = CleanLinkedIssue(df_issue_linked_raw)
print(f"% linked pr, {(100*df_issue_linked['linked_pr'].apply(lambda x: len(x)>0).mean()):2f}")
print(f"% same repo not linked pr, {(100*df_issue_linked['same_repo'].apply(lambda x: len(x)>0).mean()):2f}")
print(f"% other repo discussion, {(100*df_issue_linked['other_repo'].apply(lambda x: len(x)>0).mean()):2f}")

% linked pr, 13.343554
% same repo not linked pr, 11.876934
% other repo discussion, 2.837078


In [116]:
df_pr_linked_raw = pd.read_parquet('drive/output/scrape/link_issue_pull_request/linked_pull_request_to_issue_new.parquet')
df_pr_linked_raw


def CleanLinkedPR(df_pr_linked):
    df_pr_linked['linked_issue'] = df_pr_linked_raw['issue_link'].apply(lambda links: RemoveNA([GetNumberOrNull(link) for link in links]))
    df_pr_linked['same_repo'] = df_pr_linked.apply(lambda row: ExtractSameRepoLinks(row['other_links'], row['linked_issue'], row['repo_name'], row['pr_number']), axis=1)
    df_pr_linked['other_repo'] = df_pr_linked.apply(lambda row: ExtractOtherRepoLinks(row['other_links'], row['repo_name']), axis=1)

    return df_pr_linked.drop(columns=['issue_link','other_links','pull_request_title','pull_request_text'])

df_pr_linked = CleanLinkedPR(df_pr_linked_raw)
print(f"% linked issue, {(100*df_pr_linked['linked_issue'].apply(lambda x: len(x)>0).mean()):2f}")
print(f"% same repo not linked issue, {(100*df_pr_linked['same_repo'].apply(lambda x: len(x)>0).mean()):2f}")
print(f"% other repo discussion, {(100*df_pr_linked['other_repo'].apply(lambda x: len(x)>0).mean()):2f}")

% linked issue, 11.107339
% same repo not linked issue, 46.632518
% other repo discussion, 7.808827


In [None]:
# use issue and pr comments to try and get linked issues/PRs
# can use rules below or further thoughts to integrate linked issues/PRs

In [164]:
# use pr_link to determine match 
# if smaller, improves on
# if other repo, easy
# if bigger, then add to a list of mentioned

# for PR
# if bigger, improved by
# if other repo, easy
# if smaller, then add to a list of mentioned

# Also add ones mentioned in text and apply rules above
# try different rules progressively below such as 
# only using the last (or last 2 comments) for inferring other closing
# seeing if it'smentioned in both
# picking the closest one that's an unmatched issue/PR
# remaining are moved to improved by or improves on


In [141]:
df_linked = pd.merge(df_issue_linked, df_pr_linked, how = 'outer', left_on = ['repo_name','issue_number','linked_pr_number'], 
                     right_on = ['repo_name', 'linked_issue_number', 'pr_number'])
df_linked['issue_number'] = df_linked['issue_number'].fillna(df_linked['linked_issue_number'])
df_linked['pr_number'] = df_linked['pr_number'].fillna(df_linked['linked_pr_number'])
df_linked[['repo_name','issue_number','pr_number','improves_on','improved_by','related_from_other_repo_issue','related_from_other_repo_pr','pull_request_title','pull_request_text']]

Unnamed: 0,repo_name,issue_number,pr_number,improves_on,improved_by,related_from_other_repo_issue,related_from_other_repo_pr,pull_request_title,pull_request_text
0,openshift/openshift-restclient-python,433.0,436.0,,,,,Force update to ver 3.2.2,\nThis PR is to try fixing the requests-oauthl...
1,openshift/openshift-restclient-python,439.0,,,,,,,
2,openshift/openshift-restclient-python,437.0,438.0,,,,,fix split call to handle apis of the format a/b/c,\n\n No description provided. \n\n
3,openshift/openshift-restclient-python,209.0,240.0,,,,,,
4,openshift/openshift-restclient-python,205.0,,,,,,,
...,...,...,...,...,...,...,...,...,...
380410,arrow-py/arrow,,955.0,,,,,Add Latin locale,\nPull Request Checklist\nThank you for taking...
380411,arrow-py/arrow,,959.0,,,,,Add Malay locale,\nPull Request Checklist\nThank you for taking...
380412,arrow-py/arrow,,966.0,,,,,Add Odia (or) locale,\nPull Request Checklist\nThank you for taking...
380413,arrow-py/arrow,,951.0,,,,,Add Latvian locale,\nPull Request Checklist\nThank you for taking...


In [None]:
### ISSUE STUFF

In [124]:
df_issue_raw = pd.read_parquet('drive/output/derived/data_export/df_issue.parquet', filters = repo_filter)
df_pr  = pd.read_parquet('drive/output/derived/data_export/df_pr.parquet', filters = repo_filter)

In [197]:
pr_index = df_pr[['repo_name','pr_number']].drop_duplicates().set_index(['repo_name','pr_number']).index
df_issue = df_issue_raw.loc[~df_issue_raw.set_index(['repo_name','issue_number']).index.isin(pr_index)]

In [None]:
%%time
fenced_code_block_pattern = r'```[\s\S]*?```'  # Matches multiline fenced code blocks (```...```)
github_url_pattern = re.compile(r'https:\/\/github\.com\/\S+')
path_ref_pattern = re.compile(r'([a-zA-Z0-9_.-]*[a-zA-Z][a-zA-Z0-9_.-]*/[a-zA-Z0-9_.-]*[a-zA-Z][a-zA-Z0-9_.-]*)#(\d+)\w*')
plain_ref_pattern = re.compile(r' #\d+')  # e.g. " #123"

def ExtractGithubUrls(text):
    return [] if pd.isnull(text) else github_url_pattern.findall(re.sub(fenced_code_block_pattern, '', text))

def ExtractIssuePrRefs(text):
    if pd.isnull(text): return []
    cleaned = re.sub(fenced_code_block_pattern, '', text)
    path_matches = [f'{m[0]}#{m[1]}' for m in path_ref_pattern.findall(cleaned)]
    plain_matches = [m.strip() for m in plain_ref_pattern.findall(cleaned)]

    return list(set(path_matches + plain_matches))
    
def HasNumericPathComponent(url):
    return not pd.isnull(GetNumberOrNull(url))

def ExtractRelevantGithubUrls(df):
    result = []
    for r in df.itertuples():
        combined = ' '.join(str(x) for x in (r.issue_title, r.issue_body, r.issue_comment_body) if pd.notnull(x))
        cleaned = re.sub(fenced_code_block_pattern, '', combined)
        urls = {url for url in github_url_pattern.findall(cleaned) if HasNumericPathComponent(url)}
        result.append(list(urls))
    return result

def ExtractAllIssuePrRefs(df):
    result = []
    for r in df.itertuples():
        combined = ' '.join(str(x) for x in (r.issue_title, r.issue_body, r.issue_comment_body) if pd.notnull(x))
        cleaned = re.sub(fenced_code_block_pattern, '', combined)
        path_matches = {f'{m[0]}#{m[1]}' for m in path_ref_pattern.findall(cleaned)}
        plain_matches = {m.strip() for m in plain_ref_pattern.findall(cleaned)}
        result.append(list(path_matches | plain_matches))
    return result


df_issue['github_urls'] = ExtractRelevantGithubUrls(df_issue)
df_issue['issue_pr_refs'] = ExtractAllIssuePrRefs(df_issue)
df_pr['github_urls'] = ExtractRelevantGithubUrls(df_pr)
df_pr['issue_pr_refs'] = ExtractAllIssuePrRefs(df_pr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
df_issue_text_ref = df_issue.groupby(['repo_name','issue_number'])[['github_urls','issue_pr_refs']].sum().reset_index()
df_pr_text_ref = df_pr.groupby(['repo_name','issue_number'])[['github_urls','issue_pr_refs']].sum().reset_index()

Unnamed: 0,repo_name,issue_number,github_urls,issue_pr_refs
0,AnalogJ/lexicon,49.0,[],[]
1,AnalogJ/lexicon,67.0,[],[]
2,AnalogJ/lexicon,103.0,[],[]
3,AnalogJ/lexicon,115.0,[],[#130]
4,AnalogJ/lexicon,186.0,[],[]
...,...,...,...,...
9523,zalando/connexion,1134.0,[],[]
9524,zalando/connexion,1247.0,[],[]
9525,zalando/connexion,1333.0,[],[]
9526,zalando/connexion,1407.0,[],[]


In [146]:
content_references[content_references['github_urls'].apply(len)!=0]

Unnamed: 0,repo_name,issue_number,github_urls,issue_pr_refs
13,AnalogJ/lexicon,102.0,[https://github.com/benkonrath/transip-api/iss...,[]
26,AnalogJ/lexicon,182.0,[https://github.com/certbot/certbot/issues/5735],"[#190, #183, #200]"
43,AnalogJ/lexicon,244.0,[https://github.com/AnalogJ/lexicon/issues/262],[#182]
47,AnalogJ/lexicon,281.0,[https://github.com/AnalogJ/lexicon/issues/262],[]
57,AnalogJ/lexicon,346.0,[https://github.com/AnalogJ/lexicon/issues/249],[]
...,...,...,...,...
70622,zalando/connexion,1343.0,[https://github.com/p1c2u/openapi-spec-validat...,[]
70627,zalando/connexion,1355.0,[https://github.com/zalando/connexion/issues/1...,[]
70639,zalando/connexion,1396.0,[https://github.com/swagger-api/swagger-ui/iss...,[]
70642,zalando/connexion,1408.0,[https://github.com/zalando/connexion/issues/1...,[]


In [148]:
merged_by_issue = df_linked[['repo_name','issue_number','pr_number','improves_on','improved_by',
                             'related_from_other_repo_issue','related_from_other_repo_pr','pull_request_title','pull_request_text']].merge(
    content_references, on=['repo_name', 'issue_number'], how='left', indicator='merge_issue')

to_merge_by_pr = (merged_by_issue.query("merge_issue == 'left_only'").drop(columns=['merge_issue'] + [c for c in content_references.columns if c not in ['repo_name', 'pr_number']]))

merged_by_pr = to_merge_by_pr.merge(content_references, left_on=['repo_name', 'pr_number'], right_on=['repo_name', 'issue_number'], how='left', indicator='merge_pr')

matched_on_issue = merged_by_issue.query("merge_issue == 'both'").drop(columns='merge_issue')
matched_on_pr = merged_by_pr.drop(columns='merge_pr')

df_merged = pd.concat([matched_on_issue, matched_on_pr], ignore_index=True)

In [156]:
df_merged['related_from_other_repo'] = df_merged[['related_from_other_repo_issue', 'related_from_other_repo_pr']].agg(lambda x: x.dropna().tolist() or np.nan, axis=1)
df_merged.drop(['related_from_other_repo_issue','related_from_other_repo_pr'], axis = 1, inplace = True)

In [153]:
for each entry in column github_urls (each entry is a list)
- if list_item.split("/")[-4:-2] == repo_name
   - then if issue_number col is not NA 
     - then if int(list_item.split("/")[-1]) > issue_number, then turn what's in pr_number into a list  and add int(list_item.split("/")[-1]) and remove np.nan
     - then if int(list_item.split("/")[-1]) < issue_number, then turn what's in improves_on into a list and add int(list_item.split("/")[-1]) and remove np.nan

`
- if list_item.split("/")[-4:-2] != repo_name
   - then if issue_number col is not NA 

SyntaxError: unterminated string literal (detected at line 6) (1290577764.py, line 6)

In [158]:
df_merged

Unnamed: 0,repo_name,issue_number,pr_number,improves_on,improved_by,pull_request_title,pull_request_text,github_urls,issue_pr_refs,related_from_other_repo
0,openshift/openshift-restclient-python,205.0,,,,,,[https://github.com/ansible/ansible/issues/44914],[],
1,openshift/openshift-restclient-python,198.0,,1.0,,,,[],[#1],
2,openshift/openshift-restclient-python,188.0,,,,,,[https://github.com/fridex/downshift/issues/3],[],
3,openshift/openshift-restclient-python,189.0,,187.0,,,,[],[#187],
4,openshift/openshift-restclient-python,389.0,,,,,,[https://github.com/kubernetes-client/python/i...,[],
...,...,...,...,...,...,...,...,...,...,...
380410,arrow-py/arrow,,955.0,,,Add Latin locale,\nPull Request Checklist\nThank you for taking...,,,
380411,arrow-py/arrow,,959.0,,,Add Malay locale,\nPull Request Checklist\nThank you for taking...,,,
380412,arrow-py/arrow,,966.0,,,Add Odia (or) locale,\nPull Request Checklist\nThank you for taking...,,,
380413,arrow-py/arrow,,951.0,,,Add Latvian locale,\nPull Request Checklist\nThank you for taking...,,,
