In [98]:
import pandas as pd
from helpers import *
import os

In [99]:
# Accumulator for the triage decisions; every stage below appends its rows to it.
DECISION_COLUMNS = ['DOI', 'decision', 'rationale', 'stage']
decision_df = pd.DataFrame(columns=DECISION_COLUMNS)

In [100]:
# Expected number of publications. It is asserted against the de-duplicated
# stage-1 table and used as the denominator of every percentage printed below.
NUM_OF_PUB = 284

In [101]:
# Folder that receives the decision CSV files written at the end of each stage-3 approach.
OUTPUT = 'decision_tree_output'
# exist_ok=True creates the folder only when it is missing, without a separate
# (race-prone) existence check beforehand.
os.makedirs(OUTPUT, exist_ok=True)

In [102]:
# Input file paths (relative to the notebook's working directory).
# NOTE(review): the constant name says "citation context (paragraph)" but the
# path points at metadata.csv -- confirm this is the intended file.
CITATION_CONTEXT_FILE_PARAGRAPH = 'input_data/metadata.csv'
# Read by keyword_dict_construction() in the stage-3 keyword approach.
KEYWORD_DICT_FILE = 'input_data/keyword_dictionary.csv'
# Sentence-level citation contexts, loaded at the start of stage 3.
CITATION_CONTEXT_FILE_SENTENCE = 'input_data/citation_context_sentence.csv'

## Stage 1

In [103]:
# Read only the first 401 data rows of the paragraph-level file and pass them
# through the dataset_socket helper (not defined in this notebook).
data_raw = pd.read_csv(CITATION_CONTEXT_FILE_PARAGRAPH, nrows=401)
data = dataset_socket(data_raw)

# Stage-1 view: drop the per-citation columns, then collapse identical rows so
# that the publication count can be checked in the next cell.
data_s1 = (
    dataset_socket(data_raw)
    .drop(columns=['citation_marker', 'paragraph', 'section'])
    .drop_duplicates()
)

In [104]:
# Sanity check: after de-duplication there must be exactly NUM_OF_PUB rows.
assert len(data_s1) == NUM_OF_PUB

In [105]:
# Q1: a publication is triaged out when either database column marks it as a review ('y').
review_in_wos = data_s1['Q1 - Review article? (WoS)'] == 'y'
review_in_scopus = data_s1['Q1 - Review article? (Scopus)'] == 'y'
Q1_out = data_s1.loc[review_in_wos | review_in_scopus]

In [106]:
# Report the Q1 count and its share of all publications (percentage truncated to an integer).
q1_count = Q1_out.shape[0]
q1_percent = int(q1_count/NUM_OF_PUB*100)
print(f'Q1 triage out {q1_count} ({q1_percent}%) articles')

Q1 triage out 31 (10%) articles


In [107]:
# Every Q1 hit is recorded as a negligible-risk decision taken in stage 1.
q1_decisions = Q1_out[['DOI']].assign(
    decision='negligible risk',
    rationale='This paper is a review',
    stage='1',
)
# NOTE: reset_index() is called without drop=True, so the previous row labels
# are kept in an 'index' column; the keyword-approach export drops that column
# again before writing the CSV.
decision_df = pd.concat([decision_df, q1_decisions], axis=0).reset_index()

In [108]:
# Take the Q1 hits out of the working table so Q2 only looks at what is left.
data_s1 = data_s1.drop(index=Q1_out.index)
data_s1.shape

(253, 5)

In [109]:
# Q2: triage out publications flagged 'y' for "Addendum or Neupane" in either database column.
addendum_in_wos = data_s1['Q2 - Addendum or Neupane? (WoS)'] == 'y'
addendum_in_scopus = data_s1['Q2 - Addendum or Neupane? (Scopus)'] == 'y'
Q2_out = data_s1.loc[addendum_in_wos | addendum_in_scopus]

In [110]:
# Report the Q2 count and its share of all publications (percentage truncated to an integer).
q2_count = Q2_out.shape[0]
q2_percent = int(q2_count/NUM_OF_PUB*100)
print(f'Q2 triage out {q2_count} ({q2_percent}%) articles')

Q2 triage out 13 (4%) articles


In [111]:
# Every Q2 hit is recorded as a negligible-risk decision taken in stage 1.
q2_decisions = Q2_out[['DOI']].assign(
    decision='negligible risk',
    rationale='This paper cited Addendum or Neupane et al. 2019',
    stage='1',
)
decision_df = pd.concat([decision_df, q2_decisions], axis=0).reset_index(drop=True)

In [112]:
# Take the Q2 hits out of the working table; what remains moves on to stage 2.
data_s1 = data_s1.drop(index=Q2_out.index)

In [113]:
# Rows/columns left in the stage-1 table after removing the Q1 and Q2 hits.
data_s1.shape

(240, 5)

In [114]:
# At this point decision_df holds only stage-1 rows (Q1 + Q2), so its length is the stage-1 total.
num_triage_out = decision_df.shape[0]
stage1_percent = int(num_triage_out/NUM_OF_PUB*100)
print(f'Stage 1 triage out {num_triage_out} ({stage1_percent}%) articles')

Stage 1 triage out 44 (15%) articles


## Stage 2

In [115]:
# Bring back the citation-level rows of `data`, restricted to the DOIs that
# survived stage 1 (right join on the remaining DOIs).
remaining_dois = data_s1['DOI']
data_s2 = pd.merge(data, remaining_dois, how='right', on='DOI')
data_s2.shape

(335, 8)

In [116]:
# The merged table must still cover exactly the 240 publications left after stage 1.
stage2_dois = set(data_s2['DOI'].to_list())
assert len(stage2_dois) == 240

In [117]:
# Group the stage-2 rows per publication with the data_ingestion_s2 helper
# (not defined in this notebook). The result is iterated with .items() below
# and its keys are stored as DOIs -- presumably a dict keyed by DOI; confirm.
pub_dict = data_ingestion_s2(data_s2)

In [118]:
# Q3: ask the is_only_in_introduction helper about every publication,
# keeping DOIs and answers in two parallel lists (same order as pub_dict).
pub_doi_l = list(pub_dict)
pub_decision_l = [is_only_in_introduction(contexts) for contexts in pub_dict.values()]

In [119]:
# Tabulate the Q3 answers, then keep only the publications whose flag is True.
q3_flags = pd.DataFrame({'DOI': pub_doi_l,
                         'Is only in introduction': pub_decision_l})
Q3_out = q3_flags[q3_flags['Is only in introduction'] == True]

In [120]:
# Report the Q3 count and its share of all publications (percentage truncated to an integer).
q3_count = Q3_out.shape[0]
q3_percent = int(q3_count/NUM_OF_PUB*100)
print(f'Q3 triage out {q3_count} ({q3_percent}%) articles')

Q3 triage out 37 (13%) articles


In [121]:
# Every Q3 hit is recorded as a negligible-risk decision taken in stage 2.
temp = pd.DataFrame({'DOI': Q3_out['DOI'],
                     'decision': 'negligible risk',
                     'rationale': 'Citations only appear in the introduction section',
                     'stage': '2'})
decision_df = pd.concat((decision_df, temp), axis=0).reset_index(drop=True)
# Stage 2 contributes exactly the rows appended above. Counting them directly
# (instead of subtracting the previous value of num_triage_out from the new
# table length) keeps the figure correct even if num_triage_out was changed
# or this part of the notebook is executed out of order.
num_triage_out = len(temp)

In [122]:
# Report the stage-2 count and its share of all publications (percentage truncated to an integer).
stage2_percent = int(num_triage_out/NUM_OF_PUB*100)
print(f'Stage 2 triage out {num_triage_out} ({stage2_percent}%) articles')

Stage 2 triage out 37 (13%) articles


In [123]:
# Publications without a decision yet = expected total minus decisions recorded so far.
remaining_count = NUM_OF_PUB - len(decision_df)
print(f'Still {remaining_count} articles to triage')

Still 203 articles to triage


## Stage 3 Keyword Approach

In [124]:
# Load the sentence-level citation contexts used by the stage-3 keyword approach.
cc_df = pd.read_csv(CITATION_CONTEXT_FILE_SENTENCE, encoding='utf-8')
# Alternative input kept for reference (not used):
# cc_df = pd.read_csv('data/citation_context_Yuanxi_updated_trimmed.csv', encoding='utf-8')

In [125]:
# Apply the `clean` helper to every DOI (helper is not defined in this
# notebook; presumably it normalises the DOI string -- confirm in helpers).
cc_df['DOI'] = cc_df['DOI'].apply(clean)

In [126]:
# The sentence-level file must cover exactly the 203 publications that are
# still undecided after stages 1 and 2 (no merge is involved here).
stage3_dois = set(cc_df['DOI'].to_list())
assert len(stage3_dois) == 203

In [127]:
# Stage 3 data ingestion (keyword approach): collect, per DOI and in file
# order, the citation-context sentences as {'sent': ...} records.
pub_dict = OrderedDict()

for _, row in cc_df.iterrows():
    DOI = row['DOI']
    sentence_record = {'sent': row['Citation context (sent)']}
    # setdefault creates the per-DOI entry the first time a DOI is seen.
    pub_dict.setdefault(DOI, {'citation_context': []})['citation_context'].append(sentence_record)

In [128]:
# Build the keyword dictionary from KEYWORD_DICT_FILE with the helper (not
# defined in this notebook); it is passed to keyword_detection() below.
keyword_dict = keyword_dict_construction(KEYWORD_DICT_FILE)

In [129]:
# Stage 3 (keyword approach): derive one decision and rationale per publication
# from the result of keyword_detection(), kept in three parallel lists.
pub_doi_l = []
pub_decision_l = []
pub_rationale_l = []

for doi, contexts in pub_dict.items():
    detection_dict = keyword_detection(contexts, keyword_dict)

    # Precedence: any risk-elevating keyword wins; otherwise the publication is
    # negligible only when the helper sets 'all_reducing'; everything else is medium.
    if len(detection_dict['elevating']) > 0:
        decision = 'high risk'
        rationale = f'Contain risk-elevating keyword {detection_dict["elevating"]}'
    elif detection_dict['all_reducing']:
        decision = 'negligible risk'
        rationale = 'All citation contexts contain risk-reducing keywords'
    else:
        decision = 'medium risk'
        rationale = 'neither risk-elevating nor risk-reducing attributes detected '

    pub_doi_l.append(doi)
    pub_decision_l.append(decision)
    pub_rationale_l.append(rationale)
        

In [130]:
# Stack the stage-3 keyword decisions underneath the stage-1/2 decisions.
keyword_decisions = pd.DataFrame({
    'DOI': pub_doi_l,
    'decision': pub_decision_l,
    'rationale': pub_rationale_l,
    'stage': '3',
})
decision_df_keywords = pd.concat([decision_df, keyword_decisions], axis=0).reset_index(drop=True)
decision_df_keywords.shape

(284, 5)

In [131]:
# Only the 'negligible risk' outcomes count as triaged out in stage 3.
num_triage_out = sum(1 for decision in pub_decision_l if decision == 'negligible risk')
stage3_percent = int(num_triage_out/NUM_OF_PUB*100)
print(f'Stage 3 with the keyword approach triage out {num_triage_out} ({stage3_percent}%) articles')

Stage 3 with the keyword approach triage out 50 (17%) articles


In [132]:
# Every publication must have received exactly one decision row.
assert len(decision_df_keywords) == NUM_OF_PUB

### Output results as decision_df_keywords.csv

In [133]:
# Write the keyword-approach decisions into the OUTPUT folder defined near the
# top of the notebook (instead of repeating the folder name as a literal).
# The helper column 'index' is a by-product of a reset_index() call without
# drop=True earlier on; errors='ignore' keeps this export working even when
# that column is not present.
decision_df_keywords.drop(columns=['index'], errors='ignore').to_csv(
    os.path.join(OUTPUT, 'decision_df_keywords.csv'), index=False, encoding='utf-8-sig')

In [134]:
# The number of distinct DOIs must equal NUM_OF_PUB, i.e. no publication was decided twice.
decided_dois = set(decision_df_keywords['DOI'].to_list())
assert len(decided_dois) == NUM_OF_PUB

In [135]:
# Preview the first rows of the combined decisions table.
decision_df_keywords.head()

Unnamed: 0,index,DOI,decision,rationale,stage
0,10.0,10.1002/chem.201601150,negligible risk,This paper is a review,1
1,15.0,10.1002/chem.202004790,negligible risk,This paper is a review,1
2,36.0,10.1002/mrc.4989,negligible risk,This paper is a review,1
3,39.0,10.1002/mrc.5234,negligible risk,This paper is a review,1
4,41.0,10.1002/mrc.5260,negligible risk,This paper is a review,1


## Stage 3 Machine Learning Approach

In [136]:
# Per-sentence predictions used by the machine-learning variant of stage 3.
ML_PREDICTION_FILE = 'input_data/LSTM_prediction_203.csv'
tilo_df = pd.read_csv(ML_PREDICTION_FILE)

In [137]:
# Preview the prediction file: one row per citation context with its predicted label.
tilo_df.head()

Unnamed: 0,CC_ID,DOI,Citation context (sent),Citation context level annotation,Predicted label
0,CC1,10.1002/anie.201706532,The validity of structure 5 was confirmed by D...,Y,True
1,CC2,10.1002/anie.201708266,"To the best of our knowledge, such a composite...",N,False
2,CC3,10.1002/anie.201708266,Taking a Boltzmann average of NMR parameters c...,N,False
3,CC4,10.1002/anie.201810566,"To resolve this ambiguity, we conducted NMR pr...",Y,True
4,CC5,10.1002/anie.201902777,There are now numerous in silico methods to he...,N,False


In [138]:
# Stage 3 data ingestion (ML approach): collect, per DOI and in file order,
# each citation-context sentence together with its predicted label.
pub_dict = OrderedDict()

# The loop variable is named `row` on purpose: the following cell inspects it.
for _, row in tilo_df.iterrows():
    DOI = row['DOI']
    # The label is stored exactly as read from the CSV; no string-to-bool
    # conversion is applied here.
    label = row['Predicted label']
    context_record = {'sent': row['Citation context (sent)'], 'classification': label}
    # setdefault creates the per-DOI entry the first time a DOI is seen.
    pub_dict.setdefault(DOI, {'citation_context': []})['citation_context'].append(context_record)

In [139]:
# Debug check: `row` is whatever the ingestion loop above left behind (its last
# row), so this only shows the predicted label of that final row.
row['Predicted label']

False

In [140]:
# Stage 3 (ML approach): a publication is high risk as soon as at least one of
# its citation contexts carries a positive predicted label.
pub_decision_l = []
pub_doi_l = []
pub_rationale_l = []

for doi, entry in pub_dict.items():
    pub_doi_l.append(doi)

    # Summing the labels counts the flagged contexts (True adds 1, False adds 0).
    flagged_count = sum([context['classification'] for context in entry['citation_context']])

    if flagged_count > 0:
        decision = 'high risk'
        rationale = 'One or more citation contexts classified as at risk by the machine learning model'
    else:
        decision = 'negligible risk'
        rationale = 'No citation context classified as at risk by the machine learning model'

    pub_decision_l.append(decision)
    pub_rationale_l.append(rationale)

In [141]:
# Assemble the ML-approach decisions; only stage-3 publications are included.
ml_decision_columns = {
    'DOI': pub_doi_l,
    'decision': pub_decision_l,
    'rationale': pub_rationale_l,
    'stage': '3',
}
Tilo_decision_df = pd.DataFrame(ml_decision_columns)

### Output results as decision_df_ml.csv

In [142]:
# Write the ML-approach decisions into the OUTPUT folder defined near the top
# of the notebook (instead of repeating the folder name as a literal).
Tilo_decision_df.to_csv(os.path.join(OUTPUT, 'decision_df_ml.csv'), index=False, encoding='utf-8-sig')