## BDEX Data Extraction

### Prereq:
- Python3.7+
- Run on `{BQNT /I:NOBOX <GO>}` and Trusted Kernel.
- Install the library first using:
    - !pip install aiohttp
    - !pip install minio

In [1]:
print("Please specify how many weeks of data you would like to extract:")
############# To manually change input ###################
number_of_weeks = 5
############# To manually change input ###################
print("You've specified "+str(number_of_weeks)+" weeks.")
print("Then please run all cells and find output in the folder bdex_output.")

Please specify how many weeks of data you would like to extract:
You've specified 5 weeks.
Then please run all cells and find output in the folder bdex_output.


### Main:

In [2]:
!pip install aiohttp

Looking in indexes: https://artifactory.inf.bloomberg.com/artifactory/api/pypi/bloomberg-pypi/simple


In [3]:
!pip install minio

Looking in indexes: https://artifactory.inf.bloomberg.com/artifactory/api/pypi/bloomberg-pypi/simple


In [4]:
import copy
from src.bdex_fe import (
    apply_and_filter,
    bdex_search,
    get_tags,
    GetData,
    PreProcessor,
)
from src.bdex_fe.products import bnef

In [5]:
query_tags = get_tags(product="bnef", number_of_weeks=number_of_weeks)
query_tags

['dskt2.AswoNewsStory.week.202229',
 'dskt2.AswoNewsStory.week.202228',
 'dskt2.AswoNewsStory.week.202227',
 'dskt2.AswoNewsStory.week.202226',
 'dskt2.AswoNewsStory.week.202225']

In [6]:
# from sample_data import all_recs
all_recs = await bdex_search(
    query_tags,
    proxy=None  # not needed on (?:nj|ny)lxdev\d or CORP
)

In [7]:
TAG_FILTERS = {  # example
    "and_tags": [  # contains all
#         "dskt2.AswoNewsStory.language.ENGLISH",
#         "dskt2.AswoNewsStory.jflo.indexed.true"
    ],
    "or_tags": [  # contains any
        
    ],
    "no_tags": [  # contains none
        
    ],
    "any_substr": [  # contains in any
#         "AswoNewsStory"
    ],
}
GET_DATA = {
    "doc_needed": False,
    "source_data_needed": False,
    # if any of the above are True, fill these:
    "bcos_account": None,
    "bcos_secret": None,
    "minio_access": None,
    "minio_secret": None
}

functions = []
preproc = PreProcessor(**TAG_FILTERS)
get_data_client = GetData(**GET_DATA)

product_specific_extraction_functions = [bnef.extract_suid, bnef.extract_resolution]


functions = (preproc.functions +
             product_specific_extraction_functions +
             get_data_client.functions +
             [])  # potential tokenisers

In [8]:
recs = copy.deepcopy(all_recs)
for f in functions:
    apply_and_filter(recs, f)

processing extract_tags                 :  15379 records |███████████| 100%[-00:00,  223366.42 records/s] 00:00
processing extract_suid                 :  15379 records |███████████| 100%[-00:00,  405799.10 records/s] 00:00
processing extract_resolution           :  15379 records |███████████| 100%[-00:00,  466617.00 records/s] 00:00


In [9]:
recs[0]

{'tags': ['dskt2.AswoNewsStory.BNEF-1008909.resolution_time.2022-07-20T09:03:50.698Z',
  'dskt2.AswoNewsStory.BNEF-1008909.region.EMEA',
  'dskt2.AswoNewsStory.BNEF-1008909.asset_id.NA',
  'dskt2.AswoNewsStory.jflo.indexed.true',
  'dskt2.AswoNewsStory.BNEF-1008909.resolution.Rejected',
  'dskt2.AswoNewsStory.BNEF-1008909.financing_id.NA',
  'dskt2.AswoNewsStory.jflo.week.202229',
  'dskt2.AswoNewsStory.BNEF-1008909.reason.Record already up-to-date',
  'dskt2.AswoNewsStory.jflo.issue.BCE-29146',
  'dskt2.AswoNewsStory.jflo.issue.BNEF-1008909',
  'dskt2.AswoNewsStory.jflo.created.true',
  'dskt2.AswoNewsStory.wireCode.ANW',
  'dskt2.AswoNewsStory.spam_score.5',
  'dskt2.AswoNewsStory.headline.Renewables Infrastructure invests more in Hornsea wind farm; holds 10%',
  'dskt2.AswoNewsStory.wireId.2694',
  'dskt2.AswoNewsStory.suid.RFB8GQC5V4ZK',
  'dskt2.AswoNewsStory.week.202229',
  'dskt2.AswoNewsStory.region.EMEA',
  'dskt2.AswoNewsStory.niCodes.GDNEFWIND',
  'dskt2.AswoNewsStory.arriva

### Exporting as a Table via Reverse Engineering

In [10]:
import pandas as pd
from datetime import date
import re

today = date.today()

def tags_list_to_dict(cell):
    tags_dict = dict()
    issue_info_list = [] # to store info starting with 'dskt2.AswoNewsStory.BNEF-xxxxxx.'
    for i in cell:
        #jflo related tags
        if i.startswith('dskt2.AswoNewsStory.jflo.indexed.'):
            tags_dict['jflo.indexed'] = i.partition('dskt2.AswoNewsStory.jflo.indexed.')[2]
        if i.startswith('dskt2.AswoNewsStory.jflo.created.'):
            tags_dict['jflo.created'] = i.partition('dskt2.AswoNewsStory.jflo.created.')[2]  
        if i.startswith('dskt2.AswoNewsStory.jflo.week.'):
            tags_dict['jflo.week'] = i.partition('dskt2.AswoNewsStory.jflo.week.')[2] 
            
        # non-archived
        if i.startswith('dskt2.AswoNewsStory.jflo.issue.'):
            # there might be multiple issues per source, recording the existence of such
            jflo_id = i.partition('dskt2.AswoNewsStory.jflo.issue.')[2]  
            if 'jflo.issue' in tags_dict.keys():
                if str(tags_dict['jflo.issue']) != str(jflo_id):
                    tags_dict['multiple_issue'] = "Yes"
                    tags_dict['jflo.issue'] = jflo_id # regard the one with 'jflo.issue' prefix as the dominant one
            else:
                tags_dict['jflo.issue'] = jflo_id
        # archived: here we take the first BNEF id in the string as 'jflo.issue'
        if i.startswith('dskt2.AswoNewsStory.BNEF-'):
            m = re.search('dskt2.AswoNewsStory.BNEF-(.+?)\.', i)
            if m:
                jflo_id = 'BNEF-'+m.group(1)            
                # there might be multiple issues per source, recording the existence of such
                if 'jflo.issue' in tags_dict.keys():
                    if str(tags_dict['jflo.issue']) != str(jflo_id):
                        tags_dict['multiple_issue'] = "Yes"
                else:
                    tags_dict['jflo.issue'] = jflo_id  
                    
        #source related tags
        if i.startswith('dskt2.AswoNewsStory.wireId.'):
            tags_dict['wireId'] = i.partition('dskt2.AswoNewsStory.wireId.')[2]
        if i.startswith('dskt2.AswoNewsStory.wireCode.'):
            tags_dict['wireCode'] = i.partition('dskt2.AswoNewsStory.wireCode.')[2]        
        if i.startswith('dskt2.AswoNewsStory.class.'):
            tags_dict['class'] = i.partition('dskt2.AswoNewsStory.class.')[2]
        if i.startswith('dskt2.AswoNewsStory.suid.'):
            tags_dict['tags_suid'] = i.partition('dskt2.AswoNewsStory.suid.')[2]
        if i.startswith('dskt2.AswoNewsStory.headline.'):
            tags_dict['headline'] = i.partition('dskt2.AswoNewsStory.headline.')[2]        
        if i.startswith('dskt2.AswoNewsStory.topicClusterId.'):
            tags_dict['topicClusterId'] = i.partition('dskt2.AswoNewsStory.topicClusterId.')[2]     
        if i.startswith('dskt2.AswoNewsStory.niCodes.'):
            tags_dict['niCodes'] = i.partition('dskt2.AswoNewsStory.niCodes.')[2]          
        if i.startswith('dskt2.AswoNewsStory.language.'):
            tags_dict['language'] = i.partition('dskt2.AswoNewsStory.language.')[2]           
        if i.startswith('dskt2.AswoNewsStory.arrivalTime.'):
            tags_dict['arrivalTime'] = i.partition('dskt2.AswoNewsStory.arrivalTime.')[2]  
        #other info
        if i.startswith('dskt2.AswoNewsStory.region_score.'):
            tags_dict['region_score'] = i.partition('dskt2.AswoNewsStory.region_score.')[2]    
        if i.startswith('dskt2.AswoNewsStory.region.'):
            tags_dict['region'] = i.partition('dskt2.AswoNewsStory.region.')[2]          
        if i.startswith('dskt2.AswoNewsStory.spam_score.'):
            tags_dict['spam_score'] = i.partition('dskt2.AswoNewsStory.spam_score.')[2] 
        if i.startswith('dskt2.AswoNewsStory.week.'):
            tags_dict['week'] = i.partition('dskt2.AswoNewsStory.week.')[2]   
        #currently not including details related to individual jflo issues that were clones
        #individual issue info
        if i.startswith('dskt2.AswoNewsStory.BNEF-'):
            issue_info_list.append(i.partition('dskt2.AswoNewsStory.')[2])    
    #iterate to record key JFLO issue details into dictionary
    if 'jflo.issue' in tags_dict:
        prefix = tags_dict['jflo.issue'] 
        for i in issue_info_list:
            if i.startswith(prefix + '.region.'):
                tags_dict['jflo_region'] = i.partition(prefix + '.region.')[2]   
            if i.startswith(prefix + '.asset_id.'):
                tags_dict['jflo_asset_id'] = i.partition(prefix + '.asset_id.')[2]             
            if i.startswith(prefix + '.resolution_time.'):
                tags_dict['jflo_resolution_time'] = i.partition(prefix + '.resolution_time.')[2]             
            if i.startswith(prefix + '.reason.'):
                tags_dict['jflo_reason'] = i.partition(prefix + '.reason.')[2]               
            if i.startswith(prefix + '.financing_id.'):
                tags_dict['jflo_financing_id'] = i.partition(prefix + '.financing_id.')[2]    
            if i.startswith(prefix + '.resolution.'):
                tags_dict['jflo_resolution'] = i.partition(prefix + '.resolution.')[2]  
    return tags_dict

In [11]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [12]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

comb = pd.DataFrame()
for r in recs:
    temp = pd.DataFrame(list(r.items())).set_index(0).transpose()
    comb = comb.append(temp)
#unstack tags content into multiple columns    
res = comb['tags'].apply(tags_list_to_dict).apply(pd.Series)
res = pd.concat([comb.drop(['tags'], axis=1), res], axis=1)
res.head()

Unnamed: 0,doc_sha,document,sourcedata,mime_type,asof,suid,resolution,jflo.issue,jflo.indexed,jflo.week,multiple_issue,jflo.created,wireCode,spam_score,headline,wireId,tags_suid,week,region,niCodes,arrivalTime,topicClusterId,language,region_score,class,jflo_resolution_time,jflo_region,jflo_asset_id,jflo_resolution,jflo_financing_id,jflo_reason
1,2452d403c99cd5fb3965866de04786bbf5ab5696b93653...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-20T08:19:39.453000+00:00,RFB8GQC5V4ZK,Rejected,BNEF-1008909,True,202229,Yes,True,ANW,5,Renewables Infrastructure invests more in Horn...,2694,RFB8GQC5V4ZK,202229,EMEA,GDNEFWIND,2022-07-20T08:19:38.591Z,TkMBfAOpzcuDdu0trKThVoA==,ENGLISH,75,1,2022-07-20T09:03:50.698Z,EMEA,,Rejected,,Record already up-to-date
1,aed8fbe30176a5142f0928d4bf6e8c183f5e70ef80c506...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-19T16:13:50.330000+00:00,RF9ZR1MB2SJS,Rejected,BCE-28802,True,202229,Yes,True,NS5,30,Baltic Times: Hydrometeorological measurements...,1810,RF9ZR1MB2SJS,202229,EMEA,GDNEFWIND,2022-07-19T16:13:49.550Z,WhZuFjixF3nFQBroxM59EPQ==,ENGLISH,100,111,,,,,,
1,32e721d678d18e151bad9391167c04d995ef469cb14fa1...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-18T11:39:58.352000+00:00,RF7RHRDWX2PS,Completed,BCE-28133,True,202229,Yes,True,BFW,6,Siemens Gamesa to Supply 277MW Swedish Onshore...,2345,RF7RHRDWX2PS,202229,EMEA,GDNEFWIND,2022-07-18T11:39:57.579Z,RF7RHRDWX2PS01,ENGLISH,100,3,,,,,,
1,f7a4ceecf569fe657a051a4d6983fbf5fadd454edb0213...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-19T02:32:59.019000+00:00,RF8XQWTVI5MR,Rejected,BCE-28397,True,202229,Yes,True,SNA,false,问董秘: 投资者提问：董秘您好，请问日后公司业务侧重点是否转移至储能业务？公司关于储...,3306,RF8XQWTVI5MR,202229,false,GDNEFESS,2022-07-19T02:32:56.108Z,RF8XQWTVI5MR,CHINESE_SIMP,false,3,,,,,,
1,378cd9bcdba612ab58c8c68e92b86dc55d257738de6994...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-19T07:51:39.987000+00:00,RF9CHYMB2SJZ,Rejected,BNEF-1008046,True,202229,Yes,True,NS6,false,搜狐新闻: 约11.76亿！这家整机商中标华能海上风电项目,1811,RF9CHYMB2SJZ,202229,false,GDNEFWIND,2022-07-19T07:51:34.485Z,RF9CHYMB2SJZ,CHINESE_SIMP,false,12252,2022-07-20T01:56:24.163Z,APAC,,Rejected,,Record already up-to-date


In [13]:
# to avoid Excel's limit of 65,530 URLS per worksheet
writer = pd.ExcelWriter('bdex_output/res-'+str(today)+'.xlsx', engine='xlsxwriter',options={'strings_to_urls': False})
res.to_excel(writer)
writer.close()

In [14]:
#filter jflo_Resolution = "Rejected"
newRes=res[res['jflo_resolution']=='Rejected']

In [15]:
#find duplicate of combinations (class+wireId) from rejected 
newRes=newRes[newRes.duplicated(['class','wireId'], keep=False)]
newRes

Unnamed: 0,doc_sha,document,sourcedata,mime_type,asof,suid,resolution,jflo.issue,jflo.indexed,jflo.week,multiple_issue,jflo.created,wireCode,spam_score,headline,wireId,tags_suid,week,region,niCodes,arrivalTime,topicClusterId,language,region_score,class,jflo_resolution_time,jflo_region,jflo_asset_id,jflo_resolution,jflo_financing_id,jflo_reason
1,2452d403c99cd5fb3965866de04786bbf5ab5696b93653...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-20T08:19:39.453000+00:00,RFB8GQC5V4ZK,Rejected,BNEF-1008909,true,202229,Yes,true,ANW,5,Renewables Infrastructure invests more in Horn...,2694,RFB8GQC5V4ZK,202229,EMEA,GDNEFWIND,2022-07-20T08:19:38.591Z,TkMBfAOpzcuDdu0trKThVoA==,ENGLISH,75,1,2022-07-20T09:03:50.698Z,EMEA,,Rejected,,Record already up-to-date
1,378cd9bcdba612ab58c8c68e92b86dc55d257738de6994...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-19T07:51:39.987000+00:00,RF9CHYMB2SJZ,Rejected,BNEF-1008046,true,202229,Yes,true,NS6,false,搜狐新闻: 约11.76亿！这家整机商中标华能海上风电项目,1811,RF9CHYMB2SJZ,202229,false,GDNEFWIND,2022-07-19T07:51:34.485Z,RF9CHYMB2SJZ,CHINESE_SIMP,false,12252,2022-07-20T01:56:24.163Z,APAC,,Rejected,,Record already up-to-date
1,71c3ecdbefcdff99409f6c30acde81790f1ef30c808a33...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-20T13:38:29.762000+00:00,RFBN84BQ99TS,Rejected,BNEF-1009192,true,202229,Yes,true,DIV,3,Bureau of Land Management OKs construction sta...,130,RFBN84BQ99TS,202229,AMER,GDNEFSOLAR,2022-07-20T13:38:28.682Z,RFBN84BQ99TS,ENGLISH,100,1,2022-07-20T14:05:25.506Z,AMER,,Rejected,,Record already up-to-date
1,e053a9ed79036bf4a839c2795c23e6d8a2dfd0e5e5b895...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-19T07:16:29.517000+00:00,RF9AVDMB2SJS,Rejected,BNEF-1007995,true,202229,Yes,true,NS6,false,同花顺财经: 顺发恒业建成国内首台20赫兹低频风机,1811,RF9AVDMB2SJS,202229,false,GDNEFWIND,2022-07-19T07:16:25.659Z,RF9AVDMB2SJS,CHINESE_SIMP,false,50712,2022-07-20T03:38:46.197Z,APAC,,Rejected,,Record already up-to-date
1,c5c767c02a31e508ce4ebfd7d0838ae0c53a9c93602391...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-07-19T11:41:55.121000+00:00,RF9N5UMB2SJL,Rejected,BNEF-1008234,true,202229,Yes,true,WE3,72,Energy Voice: UK government approves 8GW of of...,353,RF9N5UMB2SJL,202229,EMEA,GDNEFWIND,2022-07-19T11:41:54.358Z,RF9N5UMB2SJL,ENGLISH,100,224,2022-07-19T12:00:17.041Z,EMEA,,Rejected,,Record already up-to-date
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,b038246f94cc53ba8d1741be7aec335e95f201cc1af2d9...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-06-21T08:00:26.636000+00:00,RDTI8P0799MO,Rejected,BNEF-987811,true,202225,,true,DJ,67,PRESS RELEASE: hep on track for 2022: around 8...,2546,RDTI8P0799MP,202225,false,GDNEFSOLAR,2022-06-21T08:00:25.417Z,TBkh1VCRSudYIz/6KO7Slgg==,ENGLISH,false,301,2022-06-21T10:07:10.619Z,EMEA,,Rejected,,Duplicate work item
1,13dba31da5da0192161f12186368dd5ab1a9c202cd171d...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-06-21T07:42:00.291000+00:00,RDTHDXTVI5MP,Rejected,BNEF-987793,true,202226,,true,SNA,false,媒体滚动: 北京丰台站改扩建历程,3306,RDTHDXTVI5MP,202225,false,GDNEFSOLAR,2022-06-21T07:41:57.192Z,RDSWIRTP3SHU,CHINESE_SIMP,false,3,2022-06-28T08:32:00.518Z,APAC,,Rejected,,Duplicate work item
1,3cd952f355f6fccfd3be06f866350e90ef0c635fc4d450...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-06-23T12:19:57.710000+00:00,RDXJL8MB2SK7,Rejected,BNEF-989629,true,202225,,true,NS3,8,Energy Live News: CS Energy awarded three util...,1808,RDXJL8MB2SK7,202225,AMER,GDNEFSOLAR,2022-06-23T12:19:56.841Z,WspiWKr0qb98xDIEaWyhMXQ==,ENGLISH,100,11496,2022-06-23T13:38:16.124Z,AMER,,Rejected,,Record already up-to-date
1,5cd4a7e0ae554a383c9e8d41a6093cb030960164e21291...,https://bcos.prod.blpprofessional.com/v1/dt.ex...,[],text/html,2022-06-21T11:48:50.927000+00:00,RDTSTBMB2SKA,Rejected,BNEF-988063,true,202225,Yes,true,NS6,false,大楚网: 阳光也能卖钱？《十堰日报》头条聚焦郧西这一产业！,1811,RDTSTBMB2SKA,202225,false,GDNEFSOLAR,2022-06-21T11:48:47.733Z,RDTSTBMB2SKA,CHINESE_SIMP,false,11639,2022-06-22T09:20:34.634Z,APAC,,Rejected,,Record already up-to-date


In [16]:
#newRes=newRes.drop_duplicates(subset=['class', 'wireId'])
#newRes

In [17]:
#export to excel
#newRes.to_csv(r'C:\Users\jlee3524\Desktop\newRes.csv', index=False)

In [30]:
duplicateCount= newRes.pivot_table(index = ['wireId', 'class'], aggfunc ='size')
duplicateCount

wireId  class
1028    50         2
1103    51         5
1172    50         2
1182    51         2
119     100        3
12      207        2
        54         2
1275    52         3
130     1          3
1349    427        8
        451        3
1356    918        3
1365    124        3
1391    33         2
1415    234        2
1422    526        2
1448    1          2
1481    60         6
        61         4
1517    1          3
        4          8
1519    127        3
1559    1          6
159     51         3
1590    62         2
        63         6
1806    10342      2
        10553      2
        10692      3
        11035      2
        11155      2
        11766     13
        12292      6
        13079      2
        13517      2
        14061      2
        14616      2
        16514    101
        17503     19
        17639      8
        17749     16
        18382      2
        18680     36
        18683      2
        18689     42
        18770      2
        18892      3

In [19]:
#duplicateCount=duplicateCount.to_frame()

In [20]:
#type(duplicateCount)

In [21]:
#duplicateCount

In [22]:
#duplicateCount.count()

In [31]:
duplicateCount=duplicateCount.reset_index()
duplicateCount.columns = ['wireId', 'class', 'count']

In [32]:
import numpy as np
duplicateCount['rejection%']=(duplicateCount['count']/res.shape[0]*100)
duplicateCount

Unnamed: 0,wireId,class,count,rejection%
0,1028,50,2,0.032944
1,1103,51,5,0.082359
2,1172,50,2,0.032944
3,1182,51,2,0.032944
4,119,100,3,0.049415
5,12,207,2,0.032944
6,12,54,2,0.032944
7,1275,52,3,0.049415
8,130,1,3,0.049415
9,1349,427,8,0.131774


In [25]:
#duplicateCount['rejection%'].mean()
#duplicateCount['rejection rate%'].min()
#duplicateCount['rejection%'].max()

In [33]:
#duplicateCount.drop(duplicateCount[duplicateCount['rejection%'] < 0.15].index, inplace = True)
#majorRej=duplicateCount
#majorRej

Unnamed: 0,wireId,class,count,rejection%
31,1806,11766,13,0.214133
37,1806,16514,101,1.663647
38,1806,17503,19,0.312963
40,1806,17749,16,0.263548
42,1806,18680,36,0.592983
44,1806,18689,42,0.691814
50,1806,50943,11,0.181189
51,1806,50971,50,0.823588
54,1806,6122,28,0.461209
61,1807,10393,19,0.312963


In [34]:
duplicateCount.drop(duplicateCount[duplicateCount['count'] < 10].index, inplace = True)
majorReject=duplicateCount
majorRej

Unnamed: 0,wireId,class,count,rejection%
31,1806,11766,13,0.214133
37,1806,16514,101,1.663647
38,1806,17503,19,0.312963
40,1806,17749,16,0.263548
42,1806,18680,36,0.592983
44,1806,18689,42,0.691814
50,1806,50943,11,0.181189
51,1806,50971,50,0.823588
54,1806,6122,28,0.461209
61,1807,10393,19,0.312963


In [35]:
#percent of rejections that could be avoided
duplicateCount['rejection%'].sum()

42.777137209685385

In [None]:
#count if column[jflo_resolution=='Completed'] and is in majorRej
#checks how many completed items we would get rid of if we take away the source / see the success(completed) rate for the pairs with major rejects