In [1]:
class Metric(object):
    def __init__(self):
        self.cnt = 0
        self.h_1 = 0
        self.h_3 = 0
        self.h_10 = 0
        self.mr = 0
        self.mrr = 0

    def _normalize(self):
        return self.h_1 / self.cnt, self.h_3 / self.cnt, self.h_10 / self.cnt, self.mr / self.cnt, self.mrr / self.cnt

    def __str__(self):
        h_1, h_3, h_10, mr, mrr = self._normalize()
        return f'\nH@1: {h_1}\nH@3: {h_3}\nH@10: {h_10}\nMR: {mr}\nMRR: {mrr}\n'

    def __iter__(self):
        h_1, h_3, h_10, mr, mrr = self._normalize()
        yield 'metric/H@1', h_1
        yield 'metric/H@3', h_3
        yield 'metric/H@10', h_10
        yield 'metric/MR', mr
        yield 'metric/MRR', mrr

    def update(self, r):
        self.cnt += 1

        if r < 2:
            self.h_1 += 1
        if r < 4:
            self.h_3 += 1
        if r < 11:
            self.h_10 += 1

        self.mr += r
        self.mrr += 1.0 / r

In [11]:
import pandas as pd

tr = pd.read_csv('/Users/kahrabian/projects/KnowledgeGraphEmbedding/data/GitGraph_0.01/train.txt', sep='\t', names=['s', 'r', 'o'])
vd = pd.read_csv('/Users/kahrabian/projects/KnowledgeGraphEmbedding/data/GitGraph_0.01/valid.txt', sep='\t', names=['s', 'r', 'o'])
# ts = pd.read_csv('./data/split/test_s.txt', sep='\t', header=None)

In [12]:
tr.head(n=5)

Unnamed: 0,s,r,o
0,/issue/530835216,I_AO_C_R,/repo/111583593
1,/user/57339170,U_SO_O_P,/pr/353182479
2,/pr/342190922,P_AO_C_R,/repo/97622367
3,/pr/350973833,P_AO_C_R,/repo/134612364
4,/pr/352120599,P_AO_C_R,/repo/121470383


In [6]:
vd.head(n=5)

Unnamed: 0,s,o,r
0,/user/16148461,U_SE_C_I,/issue/539815053
1,/user/5367102,U_SE_C_I,/issue/537820918
2,/user/1215687,U_SE_C_I,/issue/536853169
3,/user/1804755,U_SE_C_I,/issue/544179986
4,/user/1088218,U_SE_C_I,/issue/480474808


In [54]:
# Random user with prior interactions with the repo
e = 0.0
z_colab, na = 0, 0
inv_cnt, cnt = 0, 0
for i, x in enumerate(vd.values):
    if (i + 1) % 500 == 0:
        print(f'step {i + 1}:', e / (i + 1))
    repos = tr[tr['s'] == x[2]]['o'].unique()
    if len(repos) == 0:
        inv_cnt += 1
        continue
    n_repo = tr[tr['o'] == repos[0]]
    users = tr[tr['o'].isin(n_repo['s'].unique()) & tr['s'].str.startswith('/user/')]['s'].unique().tolist()
    users += n_repo[n_repo['s'].str.startswith('/user/')]['s'].unique().tolist()
    users = set(users)
    if len(users) == 0:
        z_colab += 1
        continue
    if x[0] not in users:
        na += 1
        continue
    e += 1 / len(users)
print(f'invalid count: {inv_cnt}')
print(f'zero colabs: {z_colab}')
print(f'not available: {na}')
print(f'expected mean accuracy:', e / len(vd.values))

step 500: 0.16512284292770701
step 1000: 0.16287390244015737
step 1500: 0.16399924659711135
step 2000: 0.1675902026728866
step 2500: 0.16955860293241873
step 3000: 0.1707500365874738
step 3500: 0.17291796607278181
step 4000: 0.17239775909940583
step 4500: 0.1714797176302106
step 5000: 0.17031431348523318
step 5500: 0.16971576084995763
step 6000: 0.16999367720012892
step 6500: 0.17109797319810824
step 7000: 0.17031209422738017
step 7500: 0.17027657742814967
step 8000: 0.16934902881813976
step 8500: 0.16907404851531105
step 9000: 0.16919872484982337
step 9500: 0.1689779919209152
step 10000: 0.1692373019501077
step 10500: 0.16957417546238432
step 11000: 0.1695840328604322
step 11500: 0.16838988991476897
step 12000: 0.16844790231545576
step 12500: 0.16832241078667623
invalid count: 741
zero colabs: 18
not available: 390
expected mean accuracy: 0.16829708320288428


In [55]:
# Random user with prior interactions with the repo on an issue
e = 0.0
z_colab, na = 0, 0
inv_cnt, cnt = 0, 0
for i, x in enumerate(vd.values):
    if (i + 1) % 500 == 0:
        print(f'step {i + 1}:', e / (i + 1))
    repos = tr[tr['s'] == x[2]]['o'].unique()
    if len(repos) == 0:
        inv_cnt += 1
        continue
    n_repo = tr[tr['o'] == repos[0]]
    users = tr[tr['o'].isin(n_repo['s'].unique()) &
               tr['o'].str.startswith('/issue/') &
               tr['s'].str.startswith('/user/')]['s'].unique().tolist()
    users += n_repo[n_repo['s'].str.startswith('/user/')]['s'].unique().tolist()
    users = set(users)
    if len(users) == 0:
        z_colab += 1
        continue
    if x[0] not in users:
        na += 1
        continue
    e += 1 / len(users)
print(f'invalid count: {inv_cnt}')
print(f'zero colabs: {z_colab}')
print(f'not available: {na}')
print(f'expected mean accuracy:', e / len(vd.values))

step 500: 0.1865768490517205
step 1000: 0.1860779657181395
step 1500: 0.1883122189513767
step 2000: 0.19193953326320173
step 2500: 0.19465405288374327
step 3000: 0.19705403189337525
step 3500: 0.19912944503039984
step 4000: 0.19825975788621156
step 4500: 0.19766685470760795
step 5000: 0.19628987569957243
step 5500: 0.1959485421590383
step 6000: 0.19562718378384397
step 6500: 0.1969190419897989
step 7000: 0.19597380282918928
step 7500: 0.19611378340355679
step 8000: 0.1953225329622385
step 8500: 0.1953177265255295
step 9000: 0.19534458080891284
step 9500: 0.19530767550329908
step 10000: 0.19555007805892655
step 10500: 0.19619111220329574
step 11000: 0.19635763200186562
step 11500: 0.19514823344808535
step 12000: 0.19547770659414054
step 12500: 0.1954125422158342
invalid count: 741
zero colabs: 25
not available: 445
expected mean accuracy: 0.19537137909165203


In [13]:
ir = tr[tr['o'].str.startswith('/repo/') & 
        tr['s'].str.startswith('/issue/')].groupby('s')['o'].apply(lambda x: list(x)[0]).reset_index(name='repo')
ir = ir.rename(columns={'s': 'issue'})

In [14]:
ir.head(n=5)

Unnamed: 0,issue,repo
0,/issue/104939982,/repo/14005591
1,/issue/105811463,/repo/858127
2,/issue/106218162,/repo/33014811
3,/issue/108627670,/repo/2797951
4,/issue/108865671,/repo/3337027


In [15]:
ir_idx = ir.set_index('issue').to_dict()['repo']

In [16]:
ri = tr[tr['o'].str.startswith('/repo/') & 
        tr['s'].str.startswith('/issue/')].groupby('o')['s'].apply(list).reset_index(name='issues')
ri = ri.rename(columns={'o': 'repo'})

In [17]:
ri.head(n=5)

Unnamed: 0,repo,issues
0,/repo/100035906,"[/issue/541292435, /issue/535848839, /issue/53..."
1,/repo/10003820,"[/issue/536922025, /issue/286717827, /issue/28..."
2,/repo/100038377,"[/issue/534999725, /issue/536421216, /issue/53..."
3,/repo/100052573,"[/issue/532553046, /issue/536822958, /issue/53..."
4,/repo/100059061,"[/issue/463842454, /issue/533439255, /issue/53..."


In [18]:
iu = tr[tr['s'].str.startswith('/user/') & 
        tr['r'].str.startswith('U_SE_C_I') &
        tr['o'].str.startswith('/issue/')].groupby('o')['s'].apply(list).reset_index(name='users')
iu = iu.rename(columns={'o': 'issue'})

In [19]:
iu.head(n=5)

Unnamed: 0,issue,users
0,/issue/108627670,[/user/383198]
1,/issue/114742518,[/user/202799]
2,/issue/115847098,[/user/11417]
3,/issue/120861497,[/user/606517]
4,/issue/122054030,[/user/214010]


In [20]:
iu_idx = iu.set_index('issue').to_dict()['users']

In [21]:
from collections import Counter
from itertools import chain

ri['users'] = ri.issues.apply(lambda x: list(map(lambda y: y[0], Counter(chain.from_iterable([iu_idx.get(z, []) for z in x])).most_common())))

In [22]:
ri.head(n=5)

Unnamed: 0,repo,issues,users
0,/repo/100035906,"[/issue/541292435, /issue/535848839, /issue/53...",[/user/36424503]
1,/repo/10003820,"[/issue/536922025, /issue/286717827, /issue/28...","[/user/619500, /user/500841]"
2,/repo/100038377,"[/issue/534999725, /issue/536421216, /issue/53...",[/user/20436557]
3,/repo/100052573,"[/issue/532553046, /issue/536822958, /issue/53...","[/user/5481178, /user/58055, /user/4175918, /u..."
4,/repo/100059061,"[/issue/463842454, /issue/533439255, /issue/53...",[/user/6392944]


In [23]:
iu = ir.merge(ri, on='repo', how='right')[['issue', 'users']]

In [24]:
iu.head(n=5)

Unnamed: 0,issue,users
0,/issue/104939982,"[/user/6207635, /user/673121]"
1,/issue/532937421,"[/user/6207635, /user/673121]"
2,/issue/537599668,"[/user/6207635, /user/673121]"
3,/issue/540607914,"[/user/6207635, /user/673121]"
4,/issue/105811463,"[/user/953992, /user/13159005, /user/10647082,..."


In [25]:
iu_idx = iu.set_index('issue').to_dict()['users']

In [26]:
tu = len(tr[tr['s'].str.startswith('/user/')]['s'].unique())

In [27]:
# Random user with prior interactions with the repo on an issue
mtr = Metric()
z_u, na, inv_cnt = 0, 0, 0
for i, x in enumerate(vd.values):
    repo = ir_idx.get(x[2], None)
    if repo is None:
        inv_cnt += 1
        mtr.update(tu // 2)
        continue
    users = iu_idx.get(x[2], [])
    if len(users) == 0:
        z_u += 1
        mtr.update(tu // 2)
        continue
    if x[0] not in users:
        na += 1
        mtr.update(tu // 2)
        continue
    r = users.index(x[0]) + 1
    mtr.update(r)
print(f'invalid count: {inv_cnt}')
print(f'zero colabs: {z_u}')
print(f'not available: {na}')
print(f'Metrics:', mtr)

invalid count: 275
zero colabs: 353
not available: 820
Metrics: 
H@1: 0.4749887336638125
H@3: 0.6018476791347454
H@10: 0.6559260928346102
MR: 5359.491888237945
MRR: 0.5440519219832841



In [6]:
import pandas as pd

tr = pd.read_csv('/Users/kahrabian/projects/KnowledgeGraphEmbedding/data/GitGraph_T_0.01/train.txt', sep='\t', names=['s', 'r', 'o', 't'])

In [7]:
tr.head(n=3)

Unnamed: 0,s,r,o,t
0,/user/2072976,U_SO_O_P,/pr/352015223,1576101177
1,/user/12582312,U_SO_O_P,/pr/350536299,1575889392
2,/issue/536626106,I_AO_O_R,/repo/199332790,1576118015


In [85]:
import os

args_data_path = '/Users/kahrabian/projects/KnowledgeGraphEmbedding/data/GitGraph_T_0.01'

train_data = pd.read_csv(os.path.join(args_data_path, 'train.txt'), sep='\t', names=['s', 'r', 'o', 't'])

repo_entity = train_data[train_data['o'].str.startswith('/repo/')][['o', 's']]
repo_entity = repo_entity.rename(columns={'o': 'repo', 's': 'entity'})

entity_users = train_data[train_data['s'].str.startswith('/user/')].groupby('o')['s'].apply(list)
entity_users = entity_users.reset_index(name='users').rename(columns={'o': 'entity'})

repo_users = repo_entity.merge(entity_users, on='entity', how='left')[['repo', 'users']]
repo_users['users'] = repo_users.users.apply(lambda x: x if type(x) == list else [])
repo_users = repo_users.groupby('repo')['users'].apply(lambda x: list(chain.from_iterable(x)))
repo_users = repo_users.reset_index(name='users')

valid_data = pd.read_csv(os.path.join(args_data_path, 'valid.txt'), sep='\t', names=['s', 'r', 'o', 't'])
test_data = pd.read_csv(os.path.join(args_data_path, 'test.txt'), sep='\t', names=['s', 'r', 'o', 't'])
all_data = pd.concat([train_data, valid_data, test_data])

issue_repo = all_data[all_data['o'].str.startswith('/repo/') & all_data['s'].str.startswith('/issue/')]
issue_repo = issue_repo.groupby('s')['o'].apply(lambda x: list(x)[0]).reset_index(name='repo')
issue_repo = issue_repo.rename(columns={'s': 'issue'})

issue_users = issue_repo.merge(repo_users, on='repo', how='left')[['issue', 'users']]
issue_users_idx = issue_users.set_index('issue').to_dict()['users']

In [93]:
for r in issue_repo.issue:
    if r == '/issue/534536236':
        print(r)
# issue_repo.issue.apply(lambda x: print(x) if x == '/issue/534536236' else None)

In [87]:
issue_users_idx['/issue/534536236']

KeyError: '/issue/534536236'