In [1]:
import json
import re
from datetime import datetime, timedelta, timezone
from multiprocessing import Pool as ThreadPool
from random import sample

import boto3
import requests

In [2]:
# Look-back window in days: only S3 objects modified within this many days are considered.
age = 4
# Hours immediately before "now" that are excluded from the submitted-manuscripts listing
# (passed as `ofs` to load_from_s3; presumably gives the recommender time to catch up — TODO confirm).
offset = 1
# S3 bucket that holds both the submitted manuscripts and the generated recommendations.
bucket_name = "com-elsevier-recs-live-reviewers"

In [3]:
def get_acronym(manuscript_id):
    """Return the journal acronym that prefixes a manuscript id.

    Two id shapes are handled:
      * ``ACRONYM-S-20-00706`` -> everything before the first ``-``.
      * ``ACRONYM_20xx...``    -> everything before the ``_20<digit>`` marker
        (presumably a submission year; verify against the id scheme).

    The marker used to be the literal ``"_201"``, which only matched the
    2010-2019 decade; ids carrying ``_202x`` were returned whole and therefore
    never matched any acronym. ``_20<digit>`` still matches every ``_201``
    id and also covers later decades.

    Args:
        manuscript_id: Manuscript identifier string.

    Returns:
        The acronym portion of the id (the whole id if no separator is found).
    """
    base = re.split(r"_20\d", manuscript_id, maxsplit=1)[0]
    return base.split("-")[0]

def trim(manuscript_id):
    """Strip revision/version suffixes (``_R<n>`` / ``_V<n>``) from a manuscript id.

    Every occurrence of ``_R`` or ``_V`` followed by one or more digits is
    removed in a single regex pass.

    Two defects of the previous find-then-``str.replace`` approach are fixed:
      * the character class ``[R|V]`` also accepted a literal ``|``;
      * replacing each match as a plain substring corrupted ids where one
        suffix is a prefix of another (``X_R1_R12`` became ``X2``).

    Args:
        manuscript_id: Manuscript identifier string.

    Returns:
        The id with all revision/version suffixes removed.
    """
    return re.sub(r"_[RV][0-9]+", "", manuscript_id)

In [4]:
def load_from_s3(bucket_object, prefix, days, fixed_current_time, ofs=0):
    """List the recently modified object names found under an S3 prefix.

    Args:
        bucket_object: Object exposing ``objects.filter(Prefix=...)`` whose
            items have ``key`` and ``last_modified`` attributes.
        prefix: Key prefix to list.
        days: Look-back window in days, forwarded to ``is_recent``.
        fixed_current_time: Reference "now", forwarded to ``is_recent``.
        ofs: Hours before the reference time to exclude, forwarded to ``is_recent``.

    Returns:
        A list with the second ``/``-separated component of each matching key.
    """
    names = []
    for entry in bucket_object.objects.filter(Prefix=prefix):
        if not is_recent(entry.last_modified, days, fixed_current_time, ofs):
            continue
        names.append(entry.key.split("/")[1])
    return names

def is_recent(lm, n, fixed_current_time, ofs=0):
    """Tell whether a modification time falls inside the look-back window.

    Args:
        lm: Modification datetime; its tzinfo is dropped (not converted)
            before comparing.
        n: Window length in days, counted back from ``fixed_current_time``.
        fixed_current_time: Reference "now" the window is anchored to.
        ofs: When non-zero, the window also ends ``ofs`` hours before
            ``fixed_current_time``; when zero there is no upper bound.

    Returns:
        True if ``lm`` lies inside the window (bounds inclusive), else False.
    """
    modified_at = lm.replace(tzinfo=None)
    window_start = fixed_current_time - timedelta(days=n)
    if ofs == 0:
        # No exclusion zone requested: only the lower bound applies.
        return window_start <= modified_at
    window_end = fixed_current_time - timedelta(hours=ofs)
    return window_start <= modified_at <= window_end

In [5]:
def inspect(manuscript):
    """Download a submitted manuscript from S3 and parse it as JSON.

    Reads ``submitted-manuscripts/<manuscript>.json`` from ``bucket_name``
    using the module-level ``s3client``.

    Args:
        manuscript: Manuscript id (object name without the ``.json`` suffix).

    Returns:
        The decoded JSON document.
    """
    key = "submitted-manuscripts/" + manuscript + ".json"
    response = s3client.get_object(Bucket=bucket_name, Key=key)
    payload = response['Body'].read()
    return json.loads(payload)
    

def inspect_acronym(manuscript):
    """Return the ``journal.acronym`` field of the stored manuscript document."""
    document = inspect(manuscript)
    journal = document['journal']
    return journal['acronym']

def empty_keywords(manuscript_obj):
    """Return True when ``keywords`` is absent or is the empty string."""
    if 'keywords' not in manuscript_obj:
        return True
    return manuscript_obj['keywords'] == ""
    
def empty_title(manuscript_obj):
    """Return True when ``title`` is absent or is the empty string."""
    if 'title' not in manuscript_obj:
        return True
    return manuscript_obj['title'] == ""

In [6]:
def allowed_status(manuscript_obj):
    """Return True when the manuscript's ``status`` is one of the allowed statuses.

    Raises:
        KeyError: if ``manuscript_obj`` has no ``status`` key.
    """
    return manuscript_obj['status'] in {
        "DECISION_PENDING",
        "IN_EDITORIALMASTER",
        "IN_EDITORIAL_PRP",
        "IN_REVIEW",
        "RDY_FOR_DCSN",
        "REVIEWER_INVITED",
        "EDITOR_INVITED",
        "NO_EDITOR_INVITED",
    }

In [7]:
def lookupSubmission(manuscript):
    """Classify why a manuscript has no recommendation.

    Returns:
        A ``(manuscript, code)`` tuple where ``code`` is:
          * ``'DS'`` - status is not in the allowed list;
          * ``'EK'`` - allowed status, keywords missing/empty;
          * ``'ET'`` - allowed status, keywords present, title missing/empty;
          * ``'AS'`` - allowed status with both keywords and title present.
    """
    manuscript_obj = inspect(manuscript)
    if not allowed_status(manuscript_obj):
        return (manuscript, 'DS')
    # Keywords are checked before the title, so 'EK' wins when both are empty.
    if empty_keywords(manuscript_obj):
        code = 'EK'
    elif empty_title(manuscript_obj):
        code = 'ET'
    else:
        code = 'AS'
    return (manuscript, code)

In [8]:
# Fetch the list of journal acronyms for which recommendations are enabled.
headers = { 'Accept': 'application/json' }
# An explicit timeout keeps the cell from hanging forever if the service is unreachable
# (requests applies no timeout by default).
response = requests.get('https://recs-reviewers-recommender.api.recs.d.elsevier.com/recommendations/journals',
                        headers=headers,
                        timeout=30)
# Fail loudly on 4xx/5xx instead of treating an error payload as the acronym list.
response.raise_for_status()
enabled_acronyms = response.json()

In [9]:
# One acronym per line; only the newline characters are removed.
with open('data/em_acronyms.txt') as acronym_file:
    em_acronyms = [line.strip('\n') for line in acronym_file]

In [10]:
# Make sure your environment has access to the recs-live AWS account.
# Low-level client: used by inspect() to download individual objects.
s3client = boto3.client("s3")
# Resource API: used to list the bucket contents.
s3 = boto3.resource("s3")
reviewers = s3.Bucket(bucket_name)

In [11]:
%%time
# is_recent() drops the tzinfo of each S3 `last_modified` (which boto3 reports in UTC) and
# compares it against this reference time, so the reference must be naive UTC as well.
# A local-time `datetime.now()` shifts the whole window by the machine's UTC offset.
fixed_current_time = datetime.now(timezone.utc).replace(tzinfo=None)
# Submitted manuscripts: last `age` days, excluding the most recent `offset` hours.
raw_submitted_mans = load_from_s3(reviewers, 'submitted-manuscripts', age, fixed_current_time, offset)
# Recommendations: last `age` days, no exclusion at the recent end.
raw_recommendations = load_from_s3(reviewers, 'recommendations', age, fixed_current_time)

CPU times: user 5min 34s, sys: 6.31 s, total: 5min 41s
Wall time: 17min 35s


In [12]:
%%time
# Manuscript id = object name up to the first ".json".
submitted_ids = [name.partition(".json")[0] for name in raw_submitted_mans]
# Only objects containing "_logistic" are considered; the id is the text before that marker.
recommendation_ids = [
    name.partition("_logistic")[0]
    for name in raw_recommendations
    if "_logistic" in name
]

CPU times: user 22 ms, sys: 1.8 ms, total: 23.8 ms
Wall time: 22.8 ms


In [13]:
%%time
# Build the lookup sets once; they were previously rebuilt for every element of the
# comprehensions below, which made this cell quadratic in the number of ids.
active_acronyms = set(enabled_acronyms) - {'ELSTRAINING1', 'ELSTRAINING2'}
known_recommendation_ids = set(recommendation_ids)

# Submissions whose journal acronym is enabled (the two training acronyms are excluded).
enabled_submitted_ids = [i for i in submitted_ids if get_acronym(i) in active_acronyms]
# Of those, the ones whose id (revision suffix stripped) has a recommendation.
enabled_generated_recs = [i for i in enabled_submitted_ids if trim(i) in known_recommendation_ids]
# Everything enabled that has no recommendation.
enabled_missing_recs = list(set(enabled_submitted_ids) - set(enabled_generated_recs))

CPU times: user 22.8 s, sys: 74 ms, total: 22.9 s
Wall time: 22.9 s


In [14]:
%%time
# NOTE: `ThreadPool` is an alias for multiprocessing.Pool, i.e. 20 worker *processes*.
# Using the pool as a context manager guarantees the workers are torn down once the
# map has returned, instead of lingering for the lifetime of the kernel.
with ThreadPool(20) as pool:
    lookups = pool.map(lookupSubmission, enabled_missing_recs)

# Split the (manuscript, code) pairs by the code returned from lookupSubmission.
disabled_status_failures = [manuscript for manuscript, issue in lookups if issue == 'DS']
no_keyword_failures = [manuscript for manuscript, issue in lookups if issue == 'EK']
no_title_failures = [manuscript for manuscript, issue in lookups if issue == 'ET']
unknown_failures = [manuscript for manuscript, issue in lookups if issue == 'AS']

CPU times: user 39.3 ms, sys: 60.3 ms, total: 99.6 ms
Wall time: 10.6 s


In [15]:
%%time
# Keep only the unexplained failures whose stored journal acronym is itself enabled.
truly_unknown_failures = list(
    filter(lambda manuscript: inspect_acronym(manuscript) in enabled_acronyms, unknown_failures)
)

CPU times: user 19.1 ms, sys: 9.44 ms, total: 28.5 ms
Wall time: 483 ms


In [16]:
def em_filter(collection):
    """Keep only the manuscript ids whose acronym is listed in ``em_acronyms``.

    Args:
        collection: Iterable of manuscript ids.

    Returns:
        A list of the ids from ``collection`` whose ``get_acronym`` value is
        an EM acronym, in their original order.
    """
    # Build the lookup set once rather than once per element.
    em_set = set(em_acronyms)
    return [c for c in collection if get_acronym(c) in em_set]

def print_report(em=False):
    """Print a summary of submissions, recommendations and failure reasons.

    Reads the module-level result lists computed by the earlier cells
    (``enabled_submitted_ids``, ``enabled_generated_recs``, the ``*_failures``
    lists) together with ``age`` and ``offset``.

    The overall and EM reports share one template. The earlier version
    duplicated the whole list and, because of backslash continuations inside
    the string literals, leaked the source indentation into every printed line
    as trailing spaces; both are fixed here.

    Args:
        em: When True, every collection is first narrowed with ``em_filter``
            and the lines are labelled "EM".
    """
    label = "EM " if em else ""

    def count(collection):
        # Size of the collection, restricted to EM manuscripts when requested.
        return len(em_filter(collection)) if em else len(collection)

    # Unknown failures that are not "truly unknown" are removed from the submitted total.
    submitted = count(enabled_submitted_ids) - (count(unknown_failures) - count(truly_unknown_failures))

    report = [
        "Over the last {} days excluding last {} hours:".format(age, offset),
        "{} {}manuscripts with enabled acronyms were submitted".format(submitted, label),
        "\n{} {}recommendations were made".format(count(enabled_generated_recs), label),
        "{} {}recommendations were rejected because of ineligible status".format(
            count(disabled_status_failures), label),
        "{} {}recommendations were rejected because of no keywords".format(
            count(no_keyword_failures), label),
        "{} {}recommendations were rejected because of no title".format(
            count(no_title_failures), label),
        "{} {}recommendations were missing due to other issues".format(
            count(truly_unknown_failures), label),
    ]
    for line in report:
        print(line)

### Overall State

In [17]:
# Report over every enabled manuscript (em defaults to False).
print_report()

Over the last 4 days excluding last 1 hours:
17126 manuscripts with enabled acronyms were submitted        

15918 recommendations were made        
1 recommendations were rejected because of ineligible status        
1206 recommendations were rejected because of no keywords        
0 recommendations were rejected because of no title        
1 recommendations were missing due to other issues        


### EM State

In [18]:
# Same report, restricted to manuscripts whose acronym is listed in em_acronyms.
print_report(em=True)

Over the last 4 days excluding last 1 hours:
12348 EM manuscripts with enabled acronyms were submitted        

11448 EM recommendations were made        
0 EM recommendations were rejected because of ineligible status        
899 EM recommendations were rejected because of no keywords        
0 EM recommendations were rejected because of no title        
1 EM recommendations were missing due to other issues        


### Manuscripts for failed recommendations with enabled acronyms

In [19]:
# File names of the EM manuscripts whose failure is still unexplained.
for manuscript_id in em_filter(truly_unknown_failures):
    print("{}.json".format(manuscript_id))

ELSTRAINER40-S-20-00039.json


In [20]:
# Same ids for all journals, printed as quoted, comma-terminated entries.
for manuscript_id in truly_unknown_failures:
    print('"{}",'.format(manuscript_id))

"ELSTRAINER40-S-20-00039",


### Sample of failing submissions

In [21]:
# Requested number of example files per failure category.
no_title = 0
no_keywords = 10

no_title_files = [i + '.json' for i in no_title_failures]
no_keyword_files = [i + '.json' for i in no_keyword_failures]

# random.sample raises ValueError when asked for more items than exist, so clamp each
# request to the number of failures actually available.
(sample(no_title_files, min(no_title, len(no_title_files)))
 + sample(no_keyword_files, min(no_keywords, len(no_keyword_files))))

['AJEM-S-20-00706.json',
 'JVIR-S-20-00333.json',
 'AJODO-S-20-00706.json',
 'JVN-S-20-00056.json',
 'AJEM-S-20-01717.json',
 'ARAD-S-20-01027.json',
 'IJAA-S-20-01475.json',
 'YJOMS-S-19-02053.json',
 'MEDCLI-S-20-00902.json',
 'AJOG-S-20-00895.json']

In [22]:
def assess_submission(manuscript_id):
    """Print the title, journal and keywords of a stored manuscript, flagging empty ones.

    A field that is missing from the document is reported as a problem, in
    line with ``empty_keywords`` / ``empty_title`` which treat a missing key as
    empty; previously a missing key aborted the check with ``KeyError``.

    Args:
        manuscript_id: Manuscript id (object name without the ``.json`` suffix).
    """
    data = inspect(manuscript_id)
    for field in ['title', 'journal', 'keywords']:
        value = data.get(field, "")
        if value == "":
            print("Problem with {}".format(field))
        else:
            print("{}: {}".format(field, value))