# EXPLORE SHARE AND OSF PREPRINT API DATES

1. [Read date subset](#read)
2. [Check number of records w/ each date field](#check-number)
    1. [OSF](#check-number-osf)
    2. [SHARE](#check-number-share)
3. [Explore OSF dates](#explore-osf)
    1. [date_created vs date_published](#td-dc-dp)
    2. [preprint_doi_created vs date_published](#td-pdc-dp)
    3. [date_modified vs date_published](#dm-dp)
    4. [original_publication_date vs date_published](#opd-dp)
    5. [Records without preprint_doi_created](#pdc)
4. [Map OSF and SHARE dates](#mapping)

In [131]:
# ALWAYS RUN THIS BLOCK OF CODE FIRST

import os

# Prefixes that mark which source (OSF or SHARE) a date field name belongs to.
OSF_PREFIX = 'osf_'
SHARE_PREFIX = 'share_'

# Relative path to the CSV holding the OSF and SHARE date values.
SHARE_OSF_OUTPUT_FILE = os.path.join(os.pardir, os.pardir, 'data', 'share_osf_dates.csv')

# OSF date field names, built by prepending the OSF prefix to each bare name.
OSF_DATE_FIELDS = [OSF_PREFIX + name for name in (
    'date_last_transitioned',
    'date_modified',
    'original_publication_date',
    'date_published',
    'date_withdrawn',
    'preprint_doi_created',
    'date_created',
)]

# SHARE date field names, built the same way with the SHARE prefix.
SHARE_DATE_FIELDS = [SHARE_PREFIX + name for name in (
    'date',
    'date_created',
    'date_modified',
    'date_updated',
    'date_published',
)]

## <a id='read'>READ DATA</a>

In [132]:
import csv

# Load every CSV row into memory, keyed by the value of its 'id' column.
# If the same id appears more than once, the last row wins.
data = {}
# newline='' is what the csv module expects for file objects: it lets the
# reader handle line endings itself, so newlines embedded in quoted fields
# are interpreted correctly.
with open(SHARE_OSF_OUTPUT_FILE, 'r', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        data[row['id']] = row

## <a id='check-number'>CHECK HOW MANY RECORDS HAVE VALUES PER DATE FIELD</a>

### <a id='check-number-osf'>OSF</a>

In [133]:
import pandas as pd
from collections import defaultdict

# Tally, per OSF date field (prefix stripped), how many records have a non-empty value.
records_with_osf_dates = defaultdict(int)

for row in data.values():
    populated_fields = (name for name in OSF_DATE_FIELDS if row[name] != '')
    for name in populated_fields:
        records_with_osf_dates[name.replace(OSF_PREFIX, '')] += 1

# Most frequently populated fields first.
osf_field_counts = sorted(records_with_osf_dates.items(),
                          key = lambda pair : pair[1],
                          reverse = True)
pd.DataFrame(osf_field_counts, columns = ['Field', 'Records'])

Unnamed: 0,Field,Records
0,date_modified,21186
1,date_published,21186
2,date_created,21186
3,date_last_transitioned,21176
4,preprint_doi_created,21172
5,original_publication_date,6153
6,date_withdrawn,10


### <a id='check-number-share'>SHARE</a>

In [134]:
import pandas as pd
from collections import defaultdict

# Tally, per SHARE date field (prefix stripped), how many records have a non-empty value.
records_with_share_dates = defaultdict(int)

for id_, row in data.items():
    for field in SHARE_DATE_FIELDS:
        try:
            value = row[field]
        except KeyError:
            # A row without this column is reported and skipped. The previous
            # bare except + input() blocked on stdin and also hid unrelated errors.
            print('record {} has no {} column: {}'.format(id_, field, row))
            continue

        if value != '':
            records_with_share_dates[field.replace(SHARE_PREFIX, '')] += 1

# Most frequently populated fields first.
pd.DataFrame(sorted(records_with_share_dates.items(), key = lambda kv : kv[1], reverse = True),
            columns = ['Field', 'Records'])

Unnamed: 0,Field,Records
0,date,21186
1,date_created,21186
2,date_modified,21186
3,date_updated,21186
4,date_published,21186


## <a id='explore-osf'>EXPLORE OSF DATES</a>

In [135]:
# parse date string and ignore the microseconds

from datetime import datetime

SECONDS_IN_A_DAY = 86400
SECONDS_IN_AN_HOUR = 3600

def parse_date(datestr):
    """Parse a timestamp string into a datetime, discarding microseconds.

    Two layouts are accepted: 'YYYY-MM-DDTHH:MM:SS.ffffff' and
    'YYYY-MM-DDTHH:MM:SS'.

    Args:
        datestr: the timestamp string to parse.

    Returns:
        A naive datetime whose microsecond component is 0.

    Raises:
        ValueError: if datestr matches neither layout.
    """
    try:
        dt = datetime.strptime(datestr, '%Y-%m-%dT%H:%M:%S.%f')
    except ValueError:
        # No fractional seconds present: fall back to the whole-second layout.
        # Only ValueError is caught so unrelated errors are not masked.
        return datetime.strptime(datestr, '%Y-%m-%dT%H:%M:%S')
    return dt.replace(microsecond=0)

### <a id='td-dc-dp'>TIME DIFFERENCE BETWEEN DATE_CREATED AND DATE_PUBLISHED</a>

In [136]:
# save the time difference into bins so we can calculate per bin and for the whole dataset

import pandas as pd

# Differences (in seconds) between date_created and date_published, grouped by size.
# 'equal' is a plain counter; every other bin collects the differences themselves.
date_differences_bins = {
    'equal' : 0,
    'less than a minute' : [],
    'less than an hour' : [],
    'less than 24 hours' : [],
    'more than 1 day' : [],
}

# Maps difference in seconds -> record id for every record in the
# 'more than 1 day' bin; records sharing the same difference overwrite each other.
records_with_more_than_1_year_diff = {}

for record in data.values():
    created = parse_date(record['osf_date_created'])
    published = parse_date(record['osf_date_published'])

    if created == published:
        date_differences_bins['equal'] += 1
        continue

    if created > published:
        # this should not happen
        # modify code in case this happens
        print('date_published before date_created')
        continue

    gap = published - created
    gap_seconds = gap.days * SECONDS_IN_A_DAY + gap.seconds

    if gap.days > 0:
        date_differences_bins['more than 1 day'].append(gap_seconds)
        records_with_more_than_1_year_diff[gap_seconds] = record['id']
    elif gap.seconds < 60:
        date_differences_bins['less than a minute'].append(gap_seconds)
    elif gap.seconds < 3600:
        date_differences_bins['less than an hour'].append(gap_seconds)
    else:
        date_differences_bins['less than 24 hours'].append(gap_seconds)

In [137]:
# get the number of records per bin

# List-valued bins are counted by length; the 'equal' bin is already a count.
records_per_bin = {
    label : len(entries) if isinstance(entries, list) else entries
    for label, entries in date_differences_bins.items()
}

pd.DataFrame(sorted(records_per_bin.items()), columns = ['Date Difference', 'Records'])

Unnamed: 0,Date Difference,Records
0,equal,803
1,less than 24 hours,1575
2,less than a minute,804
3,less than an hour,17569
4,more than 1 day,435


In [138]:
# calculate and print time difference statistics
# these statistics do not include records with 0 time diff (i.e. date_created and date_published are equal)

import statistics

# get statistics for records w/ more than 1 day diff (reported in days)
more_than_1_day_diffs = date_differences_bins['more than 1 day']

# min/max/median and the average all fail on an empty list, so guard the whole group
if more_than_1_day_diffs:
    min_diff = min(more_than_1_day_diffs) / SECONDS_IN_A_DAY
    print('Minimum difference (records w/ more than 1 day diff): {:.2f} days'.format(min_diff))

    max_diff = max(more_than_1_day_diffs) / SECONDS_IN_A_DAY
    print('Maximum difference (records w/ more than 1 day diff): {:.2f} days'.format(max_diff))

    median_diff = statistics.median(more_than_1_day_diffs) / SECONDS_IN_A_DAY
    print('Median difference (records w/ more than 1 day diff): {:.2f} days'.format(median_diff))

    avg_diff = sum(more_than_1_day_diffs) / len(more_than_1_day_diffs)
    avg_day_diff = avg_diff / SECONDS_IN_A_DAY
    print('Average difference (records w/ more than 1 day diff): {:.2f} days'.format(avg_day_diff))
else:
    print('No records w/ more than 1 day diff')

# get avg time diff for all records
# only list-valued bins hold differences; the 'equal' bin is a bare counter
time_differences = []
for bin_, records in date_differences_bins.items():
    if isinstance(records, list):
        time_differences += records

if time_differences:
    avg_diff = sum(time_differences) / len(time_differences)
    avg_hour_diff = avg_diff / SECONDS_IN_AN_HOUR
    print('Average difference (all records): {:.2f} hours'.format(avg_hour_diff))
else:
    print('No records with a non-zero time difference')

Mininum difference (records w/ more than 1 day diff): 1.00 days
Maximum difference (records w/ more than 1 day diff): 244.40 days
Median difference (records w/ more than 1 day diff): 2.64 days
Average difference (records w/ more than 1 day diff): 7.77 days
Average difference (all records): 4.68 hours


In [139]:
# print the top 5 records with the highest time difference

# The dict is keyed by difference in seconds, so the five largest keys
# identify the five records to print.
largest_diffs = sorted(records_with_more_than_1_year_diff, reverse = True)[:5]
for diff in largest_diffs:
    id_ = records_with_more_than_1_year_diff[diff]
    print(id_)

845e2
zvm9a
3q9tz
f92wc
2ubyj


### <a id='td-pdc-dp'>TIME DIFFERENCE BETWEEN PREPRINT_DOI_CREATED AND DATE_PUBLISHED</a>

In [140]:
# save the time difference into bins so we can calculate per bin and for the whole dataset
# bins are different than the ones above

import pandas as pd

# Bin labels for records where date_published precedes preprint_doi_created.
_BEFORE_BIN_LABELS = (
    'less than a minute',
    'less than an hour',
    'less than 24 hours',
    '1 - 7 days',
    '8 - 30 days',
    '31 - 365 days',
    'more than 1 year',
)

# Bin labels for records where date_published follows preprint_doi_created.
_AFTER_BIN_LABELS = (
    'less than a minute',
    'less than an hour',
    'less than 24 hours',
    'more than 1 day',
)


def _sub_day_label(gap):
    """Return the bin label for a timedelta shorter than one day."""
    if gap.seconds < 60:
        return 'less than a minute'
    if gap.seconds < SECONDS_IN_AN_HOUR:
        return 'less than an hour'
    return 'less than 24 hours'


def _before_label(gap):
    """Return the bin label for a published-before-DOI timedelta."""
    if gap.days > 365:
        return 'more than 1 year'
    if gap.days > 30:
        return '31 - 365 days'
    if gap.days > 7:
        return '8 - 30 days'
    if gap.days > 0:
        return '1 - 7 days'
    return _sub_day_label(gap)


# number of records where both dates are equal (after dropping microseconds)
dp_and_pdc_equal = 0

# differences in seconds, per bin, when date_published < preprint_doi_created
date_differences_before_bins = {label : [] for label in _BEFORE_BIN_LABELS}

# year published of records in the above bin
year_published = {label : [] for label in _BEFORE_BIN_LABELS}

# year preprint doi created of records in the above bin
year_preprint_doi_created = {label : [] for label in _BEFORE_BIN_LABELS}

# differences in seconds, per bin, when date_published > preprint_doi_created
date_differences_after_bins = {label : [] for label in _AFTER_BIN_LABELS}

# full rows of the records that land in the 'more than 1 year' bin
records_with_more_than_1_year_diff = []
years = []


for id_, row in data.items():
    # records without a preprint_doi_created value are not part of this comparison
    if row['osf_preprint_doi_created'].strip() == '':
        continue

    preprint_doi_created = parse_date(row['osf_preprint_doi_created'])
    date_published = parse_date(row['osf_date_published'])

    if preprint_doi_created == date_published:
        dp_and_pdc_equal += 1
        continue

    if date_published < preprint_doi_created:
        gap = preprint_doi_created - date_published
        gap_seconds = gap.days * SECONDS_IN_A_DAY + gap.seconds
        label = _before_label(gap)

        date_differences_before_bins[label].append(gap_seconds)
        year_published[label].append(date_published.year)
        year_preprint_doi_created[label].append(preprint_doi_created.year)

        if label == 'more than 1 year':
            records_with_more_than_1_year_diff.append(row)
    else:
        # this should not happen but we found cases of this
        gap = date_published - preprint_doi_created
        gap_seconds = gap.days * SECONDS_IN_A_DAY + gap.seconds
        label = 'more than 1 day' if gap.days > 0 else _sub_day_label(gap)

        date_differences_after_bins[label].append(gap_seconds)

For records where date_published is before preprint_doi_created:

In [141]:
# Number of records collected in each "published before DOI created" bin.
records_per_bin = {
    label : len(entries)
    for label, entries in date_differences_before_bins.items()
}

pd.DataFrame(sorted(records_per_bin.items()), columns = ['Date Difference', 'Records'])

Unnamed: 0,Date Difference,Records
0,1 - 7 days,202
1,31 - 365 days,9010
2,8 - 30 days,670
3,less than 24 hours,463
4,less than a minute,6900
5,less than an hour,1121
6,more than 1 year,2676


For records where date_published is after preprint_doi_created:

In [142]:
# Number of records collected in each "published after DOI created" bin.
records_per_bin = {
    label : len(entries)
    for label, entries in date_differences_after_bins.items()
}

pd.DataFrame(sorted(records_per_bin.items()), columns = ['Date Difference', 'Records'])

Unnamed: 0,Date Difference,Records
0,less than 24 hours,16
1,less than a minute,0
2,less than an hour,4
3,more than 1 day,110


In [143]:
# number of records whose preprint_doi_created equals date_published (microseconds ignored)
dp_and_pdc_equal

0

In [144]:
# print some samples of records with preprint_doi_created > 1 year after date_published 

# Slicing caps the sample at five records and, unlike indexing with range(5),
# does not raise IndexError when fewer than five records are available.
for record in records_with_more_than_1_year_diff[:5]:
    print(record['id'])
    print(record['osf_preprint_doi_created'])
    print(record['osf_date_published'])

7dvw4
2018-07-02T10:59:28.207038
2017-05-31T23:14:54.781307
vn4bh
2018-07-02T11:01:30.064294
2017-07-01T00:28:13.612690
qcsbz
2018-07-02T11:01:30.203625
2017-07-01T09:25:05.262345
qpe8z
2018-07-02T11:01:30.151148
2017-07-01T05:19:17.160668
b9mvc
2018-07-02T11:01:30.107911
2017-07-01T03:14:26.445410


In [145]:
# for each bin, print the years when records were published

import numpy as np
# One line per bin: the bin label and the distinct publication years seen in it.
for bin_, years in year_published.items():
    distinct_years = np.unique(years)
    print(f'{bin_} - {distinct_years}')

less than a minute - [2018 2019]
less than an hour - [2018 2019]
less than 24 hours - [2018 2019]
1 - 7 days - [2018]
8 - 30 days - [2018]
31 - 365 days - [2017 2018]
more than 1 year - [2016 2017]


In [146]:
# for each bin, print the years when preprint dois were created for the records in that bin

import numpy as np
# One line per bin: the bin label and the distinct DOI-creation years seen in it.
for bin_, years in year_preprint_doi_created.items():
    distinct_years = np.unique(years)
    print(f'{bin_} - {distinct_years}')

less than a minute - [2018 2019]
less than an hour - [2018 2019]
less than 24 hours - [2018 2019]
1 - 7 days - [2018]
8 - 30 days - [2018]
31 - 365 days - [2018]
more than 1 year - [2018 2019]


--> The results above seem to indicate that preprint DOIs were only minted starting in 2018, even for preprints published in 2016 and 2017

### <a id='dm-dp'>DIFFERENCES IN DATE_MODIFIED AND DATE_PUBLISHED VALUES</a>

In [179]:
# How date_modified relates to date_published, counted over all records.
dm_dp_values = dict.fromkeys(
    ['exactly the same', 'same day', 'dm before dp', 'dm after dp'], 0)

for record in data.values():
    modified = parse_date(record['osf_date_modified'])
    published = parse_date(record['osf_date_published'])

    if modified == published:
        outcome = 'exactly the same'
    elif modified.date() == published.date():
        # same calendar day, different time of day
        outcome = 'same day'
    elif modified < published:
        outcome = 'dm before dp'
    else:
        outcome = 'dm after dp'

    dm_dp_values[outcome] += 1

In [180]:
# record counts per date_modified vs date_published relationship
dm_dp_values

{'exactly the same': 53,
 'same day': 5900,
 'dm before dp': 0,
 'dm after dp': 15233}

### <a id='opd-dp'>DIFFERENCES IN ORIGINAL_PUBLICATION_DATE AND DATE_PUBLISHED VALUES</a>

In [176]:
# How original_publication_date relates to date_published, counted over the
# records that have an original_publication_date value.
opd_dp_values = dict.fromkeys(
    ['exactly the same', 'same day', 'opd before dp', 'opd after dp'], 0)

for record in data.values():
    if record['osf_original_publication_date'] == '':
        continue

    original = parse_date(record['osf_original_publication_date'])
    published = parse_date(record['osf_date_published'])

    if original == published:
        outcome = 'exactly the same'
    elif original.date() == published.date():
        # same calendar day, different time of day
        outcome = 'same day'
    elif original < published:
        outcome = 'opd before dp'
    else:
        outcome = 'opd after dp'

    opd_dp_values[outcome] += 1

In [177]:
# record counts per original_publication_date vs date_published relationship
opd_dp_values

{'exactly the same': 0,
 'same day': 405,
 'opd before dp': 5477,
 'opd after dp': 271}

### <a id='pdc'>RECORDS WITHOUT PREPRINT_DOI_CREATED</a>

In [164]:
# Ids of records lacking preprint_doi_created, split by whether they also
# carry a date_withdrawn value.
records_no_pdc_and_withdrawn = []
records_no_pdc_not_withdrawn = []

for record_id, record in data.items():
    if record['osf_preprint_doi_created'].strip():
        continue

    if record['osf_date_withdrawn'].strip():
        destination = records_no_pdc_and_withdrawn
    else:
        destination = records_no_pdc_not_withdrawn
    destination.append(record_id)

In [165]:
# ids of records that have no preprint_doi_created and have a date_withdrawn value
records_no_pdc_and_withdrawn

['7mnjs',
 'bdqja',
 '58pnc',
 '4cxzu',
 'qwr6n',
 '7vm5w',
 '6eq84',
 'hvn54',
 '7gq24',
 'j2wxk']

In [166]:
# ids of records that have neither preprint_doi_created nor date_withdrawn
records_no_pdc_not_withdrawn

['kaxn5', 'uxbvt', 'sm29r', 'xvyjk']

## <a id='mapping'>MAP OSF AND SHARE DATES</a>

In [161]:
# for each record and SHARE field, get which OSF fields match that SHARE field
# count the number of records per SHARE field-OSF field mapping

from collections import defaultdict

# length of the 'YYYY-MM-DDTHH:MM:SS' part of a date string (drops microseconds)
SECONDS_PRECISION_LENGTH = 19
# length of the 'YYYY-MM-DD' part of a date string (drops hours, etc.)
DAY_PRECISION_LENGTH = 10

# ids of records whose share_date_published has no to-the-second match
# with osf_date_published
records_with_different_date_published = []

for share_date_field in SHARE_DATE_FIELDS:
    # label of matching OSF fields (space separated) -> number of records
    mapping = defaultdict(int)

    for id_, row in data.items():
        share_value = row[share_date_field]
        row_field_mapping = []

        # An empty SHARE value is a missing date. Without this guard it would
        # compare equal to every empty OSF field and be counted as a match.
        if share_value != '':
            # get the OSF fields that match the SHARE field up to the seconds value
            row_field_mapping = [
                osf_date_field.replace(OSF_PREFIX, '')
                for osf_date_field in OSF_DATE_FIELDS
                if share_value[:SECONDS_PRECISION_LENGTH] == row[osf_date_field][:SECONDS_PRECISION_LENGTH]
            ]

            # if there are no matching fields,
            # get the OSF fields that match the SHARE field up to the same day value
            # ('~' marks these approximate matches)
            if not row_field_mapping:
                row_field_mapping = [
                    '~' + osf_date_field.replace(OSF_PREFIX, '')
                    for osf_date_field in OSF_DATE_FIELDS
                    if share_value[:DAY_PRECISION_LENGTH] == row[osf_date_field][:DAY_PRECISION_LENGTH]
                ]

        if not row_field_mapping:
            row_field_mapping.append('!= any val')

        if share_date_field == 'share_date_published' and 'date_published' not in row_field_mapping:
            records_with_different_date_published.append(id_)

        mapping[' '.join(row_field_mapping)] += 1

    # one table per SHARE field, most common mapping first
    df = pd.DataFrame(sorted(mapping.items(), key = lambda kv : kv[1], reverse = True),
                     columns = [share_date_field, 'Records'])
    display(df)

Unnamed: 0,share_date,Records
0,date_last_transitioned date_published,17046
1,date_published,3282
2,date_last_transitioned date_published date_cre...,803
3,date_last_transitioned date_modified date_publ...,52
4,~date_created,1
5,!= any val,1
6,~preprint_doi_created ~date_created,1


Unnamed: 0,share_date_created,Records
0,~date_last_transitioned ~date_published ~date_...,8723
1,~date_last_transitioned ~date_modified ~date_p...,4924
2,!= any val,2748
3,~date_published ~preprint_doi_created ~date_cr...,1214
4,~date_last_transitioned ~date_published ~prepr...,913
5,~date_published ~date_created,863
6,~date_modified ~preprint_doi_created,271
7,~date_last_transitioned ~date_modified ~date_p...,246
8,~date_last_transitioned ~date_published,238
9,date_created,221


Unnamed: 0,share_date_modified,Records
0,!= any val,17288
1,~date_last_transitioned ~date_modified ~date_p...,1373
2,~date_last_transitioned ~date_modified,1279
3,~date_modified,685
4,~date_last_transitioned,191
5,~date_last_transitioned ~date_modified ~date_p...,107
6,~date_last_transitioned ~date_published ~prepr...,76
7,date_modified preprint_doi_created,53
8,~date_last_transitioned ~date_modified ~origin...,34
9,~date_last_transitioned ~date_modified ~date_p...,21


Unnamed: 0,share_date_updated,Records
0,date_modified preprint_doi_created,12554
1,!= any val,3921
2,date_modified,1896
3,date_last_transitioned date_modified,1306
4,~date_last_transitioned ~date_modified ~date_p...,437
5,preprint_doi_created,362
6,date_last_transitioned,242
7,~date_modified,127
8,~date_last_transitioned ~date_published ~prepr...,88
9,~date_last_transitioned,79


Unnamed: 0,share_date_published,Records
0,date_last_transitioned date_published,17046
1,date_published,3282
2,date_last_transitioned date_published date_cre...,803
3,date_last_transitioned date_modified date_publ...,52
4,~date_created,1
5,!= any val,1
6,~preprint_doi_created ~date_created,1


In [162]:
# ids of records whose SHARE date_published has no to-the-second match with OSF date_published
records_with_different_date_published

['4uczr', 'cyrzf', 'feaq5']