This notebook compares the rankings of systems (ROS) between editorial relevance labels and citation counts. The code below mainly uses the data from the "Scientific Abstracts" task at TREC Precision Medicine 2017. 

**Download TREC Precision Medicine run files.** 

In [None]:
# Download the archive with the TREC PM qrels and runs and unpack it in the
# working directory. wget takes exactly one URL here: any further positional
# argument would be treated as another URL to fetch.
!wget -O trec-pm.tar.xz https://th-koeln.sciebo.de/s/JTTV4fxFmuCGMeY/download
!tar -xf trec-pm.tar.xz

**The directory includes the qrels and all runs submitted to the "Scientific Abstracts" and "Clinical Trials" tracks at TREC PM 2017-19**

see also: https://trec.nist.gov/data/precmed.html

In [None]:
# List the contents of the trec-pm directory.
!ls trec-pm

**Download Dirk's citation and altmetric data.**

In [None]:
# Download the bibliometric archive to bibliometric.tar.xz and unpack it in the
# working directory.
!wget -O bibliometric.tar.xz https://th-koeln.sciebo.de/s/BRolGxMzrCipoTT/download
!tar -xf bibliometric.tar.xz

Install the required Python dependencies.

In [None]:
# Use the %conda magic rather than a `!conda` shell escape so that the packages
# are installed into the environment of the running kernel.
%conda install --yes --file requirements.txt

**Make a qrels file from the citation data. The following code uses simple criteria to make multi-graded labels from the citation count and writes them into a file 'qrels.cite'.**
- 2: if the number of citations is at least twice the mean of all citations
- 0: if the number of citations is lower than the mean of all citations
- 1: the ones in between

In [None]:
import pandas as pd 

df = pd.read_csv('STI_Ergebnisse_final.txt', sep='\t')

# Keep only records that have a citation count (TC) and whose topic label
# contains '2017'; only the three columns needed for the qrels are retained.
has_citations = df['TC'].notna()
_df = df[has_citations]
_df = _df[_df['TOPIC'].str.contains('2017', regex=False)]
_df = _df[['TOPIC', 'PUBMED_ID', 'TC']]

# The grading threshold is the mean citation count over ALL records with a
# citation count, not only over the 2017 subset selected above.
thresh = df.loc[has_citations, 'TC'].mean()

with open('qrels.cite', 'w') as f_out:

    # itertuples yields plain (TOPIC, PUBMED_ID, TC) tuples and avoids building
    # one Series per row as iterrows does.
    for topic_label, pubmed_id, citation_cnt in _df.itertuples(index=False, name=None):

        # Only the part after the first '-' of the topic label is written
        # (labels presumably look like '<year>-<topic number>').
        topic = topic_label.split('-')[1]

        # Three-level grading: 0 below the mean, 2 from twice the mean upwards,
        # 1 for everything in between.
        if citation_cnt < thresh:
            rel = 0
        elif citation_cnt >= 2 * thresh:
            rel = 2
        else:
            rel = 1

        # One judgment per line: "<topic> 0 <doc id> <relevance>". The newline
        # is appended after joining so the line carries no trailing space.
        line_out = ' '.join([topic, '0', str(pubmed_id), str(rel)]) + '\n'

        f_out.write(line_out)

**Extract the run files and write them into a new directory.**

In [None]:
import os
import gzip

def extract_runs(dir_in, dir_out):
    """Decompress every gzipped file found below ``dir_in`` into ``dir_out``.

    ``dir_in`` is walked recursively; files that do not end in ``.gz`` are
    ignored. All decompressed files are written flat into ``dir_out``, which
    is created if necessary.

    The output name is the archive name without the ``.gz`` suffix and without
    its first dot-separated component, e.g. ``input.myrun.gz`` -> ``myrun``.
    A name with further dots keeps them (``input.my.run.gz`` -> ``my.run``),
    and a name without a prefix keeps its stem (``myrun.gz`` -> ``myrun``).

    Args:
        dir_in: Directory that is searched recursively for ``*.gz`` files.
        dir_out: Directory the decompressed files are written to.
    """
    import shutil  # local import: the surrounding cell only imports os and gzip

    suffix = '.gz'
    os.makedirs(dir_out, exist_ok=True)

    for root, _dirs, files in os.walk(dir_in):
        for file in files:
            if not file.endswith(suffix):
                continue

            # Strip the suffix, then drop the leading "<prefix>." if present.
            stem = file[:-len(suffix)]
            _prefix, _sep, remainder = stem.partition('.')
            run_name = remainder or stem
            if not run_name:
                # A file literally named ".gz" has no usable run name.
                continue

            # Stream the content instead of loading the whole file into memory.
            with gzip.open(os.path.join(root, file), 'rb') as f_in:
                with open(os.path.join(dir_out, run_name), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
                          
# Target directory for the decompressed runs and source directory holding the
# gzipped runs of the 2017 abstracts task.
DIR_OUT = 'runs/trec-pm-2017-abstracts'
DIR_IN = 'trec-pm/trec-pm-2017-abstracts'

extract_runs(dir_in=DIR_IN, dir_out=DIR_OUT)

**Install the super-fast evaluation toolkit ranx, which implements some trec_eval measures with the help of Python and numba.**

see also: https://github.com/AmenRa/ranx or https://amenra.github.io/ranx/

In [None]:
# %pip (instead of a `!pip` shell escape) installs into the environment of the
# running kernel, so the import in the following cell finds the package.
%pip install ranx

**Make a reference ranking of systems (ROS) from the qrels of the "Scientific Abstracts" task at TREC PM 2017.**

The first time, it takes a while to run ranx as it needs to compile the source code. Later executions will run much faster.

In [None]:
from ranx import Qrels, Run, evaluate, compare

PATH_QRELS = "trec-pm/trec-pm-2017-abstracts/qrels-final-abstracts.txt"
DIR_RUN = DIR_OUT

qrels = Qrels.from_file(PATH_QRELS, kind="trec")

# Collect (run name, path) for every file below DIR_RUN, in os.walk order.
ref_run_paths = [
    (name, os.path.join(folder, name))
    for folder, _subdirs, names in os.walk(DIR_RUN)
    for name in names
]

# nDCG@5 of each run against the qrels loaded above, keyed by run file name.
ref_scores = {
    name: evaluate(qrels, Run.from_file(path, kind="trec"), "ndcg@5")
    for name, path in ref_run_paths
}

# Ranking of systems: best score first (ties keep their walk order).
ros_ref = dict(sorted(ref_scores.items(), key=lambda entry: entry[1], reverse=True))
ros_ref

**Make the corresponding ROS based on citation data.**

In [None]:
PATH_QRELS_CITE = "qrels.cite"

# NOTE: this rebinds `qrels`; from here on the name refers to the
# citation-based judgments read from PATH_QRELS_CITE.
qrels = Qrels.from_file(PATH_QRELS_CITE, kind="trec")


def _ndcg5_against_citations(path):
    """Return nDCG@5 of the TREC run stored at `path` w.r.t. the current `qrels`."""
    return evaluate(qrels, Run.from_file(path, kind="trec"), "ndcg@5")


# Score every file below DIR_RUN, keyed by run file name (os.walk order).
cite_scores = {}
for folder, _subdirs, names in os.walk(DIR_RUN):
    cite_scores.update(
        (name, _ndcg5_against_citations(os.path.join(folder, name))) for name in names
    )

# Ranking of systems: best score first (ties keep their walk order).
ros_cite = dict(sorted(cite_scores.items(), key=lambda entry: entry[1], reverse=True))
ros_cite

**Determine Kendall's tau between the ROS.**

In [None]:
from scipy import stats

# Kendall's tau needs paired observations: for every system, its score under
# the editorial qrels and its score under the citation-based qrels. Passing the
# two sorted lists of run names instead would pair unrelated systems by rank
# position and correlate the alphabetical order of their names.
systems = list(ros_ref)
ref_scores_paired = [ros_ref[name] for name in systems]
# A KeyError here means the two rankings were not built from the same runs.
cite_scores_paired = [ros_cite[name] for name in systems]

tau, p_value = stats.kendalltau(ref_scores_paired, cite_scores_paired)
tau

**Classifier preparation**
- Prepare data, split columns
- Drop columns from data which contain non-numeric data
- Todo: transform non-numeric data to numeric data

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

_df2 = pd.read_csv('STI_Ergebnisse_final.txt', sep='\t', low_memory=False)

# Derived feature: citations per year. A zero year span produces inf (or NaN
# for 0/0); those rows are removed by the inf/NaN clean-up further down.
_df2['TCpY'] = _df2['TC'] / (_df2['JAHR'] - _df2['PY'])

# Split the topic label into its numeric ID (characters from position 5 on)
# and its year (first four characters).
topic_label = _df2['TOPIC']
_df2['TOPIC_ID'] = topic_label.str[5:].astype(int)
_df2['TOPIC_YEAR'] = topic_label.str[:4].astype(int)

# Ordinal-encode the non-numeric columns that are kept as features.
categorical_columns = ['PT', 'DT', 'SO']
encoder = preprocessing.OrdinalEncoder()
_df2[categorical_columns] = encoder.fit_transform(_df2[categorical_columns])

# todo: handle columns 'DOMAIN', 'FIELD', 'SUBFIELD'

# Remaining non-numeric columns are dropped.
drop_columns = ['UT', 'DOI', 'ISSN', 'ARXIVID', 'DOMAIN', 'FIELD', 'SUBFIELD', 'TOPIC']
_df2.drop(columns=drop_columns, inplace=True)

# String-typed numbers -> floats.
# NOTE(review): commas are removed, not turned into a decimal point; confirm
# that they are thousands separators in the source file.
for column in ('RL', 'IF'):
    _df2[column] = _df2[column].str.replace(',', '').astype(float)

# Remove every row that contains an inf or NaN value.
_df2.replace([np.inf, -np.inf], np.nan, inplace=True)
_df2.dropna(inplace=True)

# Min-max scale the listed columns.
# todo: handle all values
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test
# split performed in the following cell.
scale_columns = [
    'JAHR', 'PY', 'TC', 'IF', 'RL', 'ATTENTION_SCORE', 'NEWS', 'BLOG', 'POLICY',
    'PATENT', 'TWITTER', 'PEER_REVIEW', 'WEIBO', 'FACEBOOK', 'WIKIPEDIA',
    'GOOGLE', 'LINKEDIN', 'REDDIT', 'PINTEREST', 'F1000', 'Q_A', 'VIDEO',
    'SYLLABI', 'MENDELEY', 'DIMENSIONS', 'TCpY', 'TOPIC_YEAR',
]
scaler = preprocessing.MinMaxScaler()
_df2[scale_columns] = scaler.fit_transform(_df2[scale_columns])

Index(['PUBMED_ID', 'RELEVANCE', 'JAHR', 'PY', 'PT', 'DT', 'SO', 'TC', 'IF',
       'RL', 'ATTENTION_SCORE', 'NEWS', 'BLOG', 'POLICY', 'PATENT', 'TWITTER',
       'PEER_REVIEW', 'WEIBO', 'FACEBOOK', 'WIKIPEDIA', 'GOOGLE', 'LINKEDIN',
       'REDDIT', 'PINTEREST', 'F1000', 'Q_A', 'VIDEO', 'SYLLABI', 'MENDELEY',
       'DIMENSIONS', 'TCpY', 'TOPIC_ID', 'TOPIC_YEAR'],
      dtype='object')


In [2]:
# Labels: the first 20000 rows (by position) are used for training, the
# remaining rows for testing.
rel = _df2['RELEVANCE']
y_train, y_test = rel.iloc[:20000], rel.iloc[20000:]

# The label column must not be part of the feature matrix.
_df2.drop(columns='RELEVANCE', inplace=True)

# Features: same positional split as for the labels.
feature_matrix = _df2.values
X_train, X_test = feature_matrix[:20000], feature_matrix[20000:]

Train/validation split of data, try different learning rates

In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

test_size = 0.30
state = 123

# Carve a validation set out of the training data.
# NOTE: X_train / y_train are rebound to the smaller training part, so running
# this cell a second time splits the already reduced set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=test_size,
    random_state=state,
)

lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

# Fit one gradient-boosting model per learning rate and report its accuracy
# on the training and validation parts.
for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(learning_rate=learning_rate, random_state=0)
    gb_clf.fit(X_train, y_train)

    train_accuracy = gb_clf.score(X_train, y_train)
    val_accuracy = gb_clf.score(X_val, y_val)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(val_accuracy))

Learning rate:  0.05
Accuracy score (training): 0.771
Accuracy score (validation): 0.772
Learning rate:  0.075
Accuracy score (training): 0.776
Accuracy score (validation): 0.774
Learning rate:  0.1
Accuracy score (training): 0.780
Accuracy score (validation): 0.775
Learning rate:  0.25
Accuracy score (training): 0.803
Accuracy score (validation): 0.778
Learning rate:  0.5
Accuracy score (training): 0.834
Accuracy score (validation): 0.776
Learning rate:  0.75
Accuracy score (training): 0.853
Accuracy score (validation): 0.770
Learning rate:  1
Accuracy score (training): 0.863
Accuracy score (validation): 0.762


Test classifier's performance

In [4]:
from sklearn.metrics import classification_report, confusion_matrix

# Refit with learning rate 0.25, the setting with the highest validation
# accuracy (0.778) in the sweep output above, and predict the held-out test rows.
gb_clf2 = GradientBoostingClassifier(learning_rate=0.25, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_test)

# Print each heading followed by the corresponding report on the test set.
for heading, make_report in (
    ("Confusion Matrix:", confusion_matrix),
    ("Classification Report", classification_report),
):
    print(heading)
    print(make_report(y_test, predictions))


Confusion Matrix:
[[7997  272   87]
 [ 979  274   17]
 [1221  116   66]]
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.96      0.86      8356
           1       0.41      0.22      0.28      1270
           2       0.39      0.05      0.08      1403

    accuracy                           0.76     11029
   macro avg       0.53      0.41      0.41     11029
weighted avg       0.69      0.76      0.70     11029

