# Temporary fixes

## January 4th, 2021: Fixed missing pp_num_authors data

In [1]:
import pandas as pd

# Load the pipe-delimited flat file whose pp_num_authors column has significant missingness,
# then tabulate that column (NaN included) to show the extent of the problem.
# NOTE(review): the earlier comment called this "Jan 2nd data", but the file name is dated
# 2021-01-09 — confirm which snapshot is intended.
flat_file_path = '2021-01-09_flat-file_v1s.psv'
df = pd.read_csv(flat_file_path, sep='|')
df['pp_num_authors'].value_counts(dropna=False)

NaN      3271
5.0       500
4.0       460
6.0       433
3.0       425
         ... 
63.0        1
64.0        1
52.0        1
45.0        1
105.0       1
Name: pp_num_authors, Length: 64, dtype: int64

In [2]:
# Total number of rows in the flat file (for comparison with the NaN count above).
len(df)

7547

In [3]:
import json
import requests
from tqdm import tqdm

MAX_RETRIES = 5
# requests applies no timeout by default; without one a stalled connection hangs the whole loop.
REQUEST_TIMEOUT_SECONDS = 30
DETAILS_URL = 'https://api.biorxiv.org/details/%s/%s/na/json'


def count_authors(authors):
    """Return the number of authors in a ';'-separated author string.

    Empty pieces (for example, the one produced by a trailing ';') are ignored.
    Returns None when the string names no authors at all, so the value is
    recorded as missing rather than as a spurious count of 1.
    """
    names = [name for name in authors.split(';') if name.strip()]
    return len(names) if names else None


def fetch_first_version(server, doi):
    """Fetch the details-API record for version "1" of a preprint.

    Raises:
        requests.RequestException: on network failure, timeout, or HTTP error status.
        ValueError: if the response body is not valid JSON.
        KeyError: if the payload has no 'collection' or an entry has no 'version'.
        IndexError: if the collection holds no entry with version "1".
    """
    reply = requests.get(DETAILS_URL % (server, doi), timeout=REQUEST_TIMEOUT_SECONDS)
    reply.raise_for_status()
    payload = json.loads(reply.text)
    return [x for x in payload['collection'] if x["version"] == "1"][0]


# Start with every row missing; rows whose lookup never succeeds stay None.
df['pp_num_authors_revised'] = None

for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    for attempt in range(MAX_RETRIES):
        try:
            record = fetch_first_version(row['server'], row['doi'])
            if 'authors' in record:
                # .at writes into df itself; chained df[col][i] = ... may write to a
                # temporary copy (SettingWithCopyWarning / no-op under Copy-on-Write).
                df.at[i, 'pp_num_authors_revised'] = count_authors(record['authors'])
            break
        except Exception as err:
            # Best-effort: any lookup failure is retried up to MAX_RETRIES times.
            # Catching Exception (not a bare except) lets Ctrl-C / kernel interrupt
            # stop the loop instead of being counted as a failed attempt.
            print('Error with DOI %s (%r), retrying...' % (row['doi'], err))

100%|██████████| 7547/7547 [19:44<00:00,  6.37it/s]


In [4]:
# Swap the stale column for the freshly fetched one.
# The guard keeps this cell safe to re-run: without it, a second execution would drop the
# already-renamed (fixed) pp_num_authors column and then rename nothing, silently losing
# the data that took ~20 minutes to fetch.
if 'pp_num_authors_revised' in df.columns:
    df = (
        df
        .drop(columns=['pp_num_authors'])
        .rename(columns={"pp_num_authors_revised": "pp_num_authors"})
    )

In [5]:
# Fixed! Re-tabulate the replaced column; dropna=False keeps any still-missing
# entries visible in the counts.
df.loc[:, 'pp_num_authors'].value_counts(dropna=False)

5     877
4     864
6     805
3     745
7     658
2     563
8     531
9     448
10    357
11    252
12    239
13    166
14    146
15    134
16     96
1      95
17     77
18     77
19     55
20     39
22     39
23     30
21     27
24     26
25     23
26     21
27     20
30     14
28     12
29     10
31     10
43      9
37      8
34      8
38      7
36      6
33      6
32      6
46      6
35      5
41      5
44      5
45      5
42      4
39      3
40      3
48      3
47      1
49      1
Name: pp_num_authors, dtype: int64

In [6]:
from datetime import datetime

# Write the corrected frame to a new pipe-delimited flat file whose name is
# prefixed with today's date (YYYY-MM-DD); the index is not written.
todays_date = datetime.today().strftime("%Y-%m-%d")
output_path = todays_date + '_flat-file_v1s.psv'
df.to_csv(output_path, sep='|', index=False)