# LifeWatch publication - General data completeness 

This jupyter notebook contains general statistics about data completeness of the scientific publications within the LifeWatch project.

Import packages and load the data

In [1]:
# Load necessary libraries and functions
import sys
import os
import chardet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Select the data file
name_datafile = "TEST08122021.csv"

# Read data
location_standdata = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), "LW_publications_standardized", name_datafile)
data = pd.read_csv(location_standdata)
data.rename(columns={'SortDate': 'Year'}, inplace=True)
#print(data.columns)

## Data completeness

### Publications with affiliation information available, either in IMIS or from the WoS-export:

In [9]:
#Select affiliation related columns:
affil_data = data[['BrefID', 'Affiliation', 'wos_affil']]

In [40]:
#number of publication with affiliation information available in IMIS, and with affiliation info available from WoS
affil_data_count = affil_data.count()
affil_data_count

BrefID         499
Affiliation    286
wos_affil      279
dtype: int64

In [41]:
#percentage of publication with affiliation information available in IMIS, and with affiliation info available from WoS
affil_data_perc = affil_data_count.divide(affil_data_count.BrefID, axis=0).multiply(100)
affil_data_perc

BrefID         100.000000
Affiliation     57.314629
wos_affil       55.911824
dtype: float64

#### Note: Special collection 'BioOracle' 
This special collection was only recently added to IMIS - hence very little affiliation information and WoS codes are available

In [52]:
#select publications of the special collection
spcol_pubs = data[data['spcolNames'].str.contains('Bio-ORACLE')==True]

In [41]:
#percentage of publication with affiliation information available in IMIS, and with affiliation info available from WoS
spcol_pubs_count = data[['BrefID', 'Affiliation', 'wos_affil']].count()
affil_data_perc = affil_data_count.divide(affil_data_count.BrefID, axis=0).multiply(100)
affil_data_perc

BrefID         100.000000
Affiliation     57.314629
wos_affil       55.911824
dtype: float64

### Publications with WoS information available:

In [43]:
#Select 'wos'-columns:
wos_data = data[['BrefID','WoScode', 'wos_affil', 'wos_country', 'wos_keywords', 'wos_plus_keywords', 'wos_categories', 'wos_researcharea']]

In [44]:
#Summary statistics
include = ['int64', 'object']
wosdata_summ = wos_data.describe(include = include)
#wosdata_summ

In [45]:
#Get absolute count:
wosdata_completeness_count = wosdata_summ
wosdata_completeness_count.iloc[0]

BrefID               499
WoScode              305
wos_affil            279
wos_country          279
wos_keywords         206
wos_plus_keywords    255
wos_categories       279
wos_researcharea     279
Name: count, dtype: object

In [46]:
#Get the percentage:
wosdata_completeness_perc = wosdata_summ.divide(wosdata_summ.BrefID, axis=0).multiply(100)
wosdata_completeness_perc.iloc[0]

BrefID                   100
WoScode              61.1222
wos_affil            55.9118
wos_country          55.9118
wos_keywords         41.2826
wos_plus_keywords    51.1022
wos_categories       55.9118
wos_researcharea     55.9118
Name: count, dtype: object

### Publications with standardized information available:

In [47]:
#Select columns with standardized information 
#(note: BrefID is included because this info is always present - therefore complete)
stand_data = data[['BrefID','stand_affil', 'stand_country', 'stand_flemish', 'stand_GROUP', 'stand_QH']]

In [48]:
#Summary statistics
include = ['int64', 'object']
standdata_summ = stand_data.describe(include = include)
#standdata_summ

In [49]:
#Get absolute count:
standdata_completeness_count = standdata_summ
standdata_completeness_count.iloc[0]

BrefID           499
stand_affil      359
stand_country    263
stand_flemish     78
stand_GROUP      263
stand_QH         263
Name: count, dtype: object

In [50]:
#Get the percentage:
standdata_completeness_perc = standdata_summ.divide(standdata_summ.BrefID, axis=0).multiply(100)
standdata_completeness_perc.iloc[0]

BrefID               100
stand_affil      71.9439
stand_country    52.7054
stand_flemish    15.6313
stand_GROUP      52.7054
stand_QH         52.7054
Name: count, dtype: object