# Introduction
State notebook purpose here

### Imports
Import libraries and write settings here.

In [28]:
# Data manipulation
import pandas as pd
import numpy as np
import pickle
import time
from IPython.display import clear_output
import datetime
from Bio import Medline
from collections import Counter
from pathlib import Path

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30


In [29]:
# check for class_df.p
files = [file.name for file in Path('./').iterdir()]
if 'class_df.p' in files:
    print('Screening has already begun, loading the working df')
    df = pickle.load(open('./class_df.p', 'rb'))
    
else:
    print('No Classification df found, loading the retreived df')
    df = pickle.load(open('../case_report_retrieval/output/retrieved_df/retrieved_df2.p', 'rb'))
    df['include'] = None
    df['test'] = None

No Classification df found, loading the retreived df


In [30]:
# I have written this app to answer two questions:
#     Should this article have been retrived (1/0)?
#     Is it part of the test class or control class (1 = test, ctrl = 2)?
# this is a two tier approach, general inclusion and exclusion then refined test/control classification

In [31]:
# set your screening questions and preferred answers
q_dict = {'q1':{'question':"Is this artilce relevant to the search query?",
                'answers': ['1','0','b']},
          'q2':{'question':"Should this article be in the test group or control group",
                'answers':['1','0','b']}
         }

In [32]:
# art_count = 0
start_time = time.time()
count = 0
for index, row in df.iterrows():
    count+=1
    if row['test'] != None:
        continue
    else:
        print(f'Current index = {index}') 
        print(f'Article {count} of {len(df)}\n')
        print(f'PMID - {row["pmid"]}')
        print(f'Title\n{row["title"]}\n')
        print(f'Abstract\n{row["abstract"]}\n')
        if row['doi'] != '':
            print(f'https://doi.org/{row["doi"]}')
        else:
            print(f'https://pubmed.ncbi.nlm.nih.gov/{index}/')
        print(f"Journal - {row['journal']}")
        print(f'Pub date = {row["pub_date"]}\n')
        
        
        
        print(f'\nlast record was {df.index[count-2]}\n\n')
        print('-------------------------------------------\n')
        
            
        # is the article a paediatric case report or series about covid 19
        include = ''
        while include.lower() not in q_dict['q1']['answers']:
            include = input(f"{q_dict['q1']['question']} Yes (1), No (0), Break (b)")
        if include.lower() == 'b':
            break
        else:
            df.loc[index, 'include'] = int(include)

        if include == '1':
            # is the article severe or mild/mod
            test = ''
            while test.lower() not in q_dict['q2']['answers']:
                test = input(f"{q_dict['q2']['question']} Yes (1), No (0) or break (b)")
            if test.lower() == 'b':
                break
            else:
                df.loc[index, 'test'] = int(test)
                # keep track of how many i've done
                art_count += 1
        else:
            df.loc[index, 'test'] = 0
            art_count += 1
              
    clear_output()
clear_output()
              
# how long have I been screening?
end_time = time.time()
session_time = end_time - start_time
session_hours = np.round(session_time/3600,3)

try:
    print(f'You checked {art_count} articles in this {int(session_hours*60,)} minute session, good work!')
    print(f'Articles per hour in this session= {int(art_count/session_hours)}')

    remaining = len(df)-count
    print(f'Total articles screened = {count}')
    print(f'Total remaining = {remaining}') 
    print(f'Projected time to completion = {np.round(remaining/(int(art_count/session_hours)),1)} hours')
except:
    print('Session ended')

# write the working df to excel and pickle
pickle.dump(df, open('./class_df.p', 'wb'))
            

You checked 26 articles in this 0 minute session, good work!
Articles per hour in this session= 5200
Total articles screened = 11
Total remaining = 1757
Projected time to completion = 0.3 hours


## Screening Summary

In [34]:
df_summary = {}
condition = [val in [0.0, 1.0]  for val in df['include']]
sum_df = df[condition]
print(f'{len(sum_df)} Articles Screened of {len(df)} = {np.round(len(sum_df)/len(df),3)}')
p_d = dict(Counter(sum_df['include']))
test_d = dict(Counter(sum_df['test']))


inc_rate = np.round(p_d[1.0]/len(sum_df),3)
print(f'\nInclusion Rate = {inc_rate}')

print(f'''
Severity proportions (Total (Included)):
Test = {np.round(test_d[1]/len(sum_df),3)} ({np.round(test_d[1]/p_d[1.0],3)})
Control = {np.round(test_d[0]/len(sum_df),3)} ({np.round(test_d[1]/p_d[1.0],3)})
''')

10 Articles Screened of 1768 = 0.006

Inclusion Rate = 0.6

Severity proportions (Total (Included)):
Test = 0.3 (0.5)
Control = 0.7 (0.5)

