# LifeWatch publication - General data completeness 

This Jupyter notebook presents general statistics on the data completeness of the scientific publications within the LifeWatch project.

Import packages and load the data

In [35]:
# Load necessary libraries and functions
import sys
import os
import chardet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [36]:
# Select the data file
name_datafile = "test4.csv"

# Locate the file: <parent of the working directory>/LW_publications_standardized/<name_datafile>
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
location_standdata = os.path.join(parent_dir, "LW_publications_standardized", name_datafile)

# Read the data and expose the 'SortDate' column under the name 'Year'.
# rename() returns a new frame (no inplace mutation), so re-running this cell
# always rebuilds `data` from the file contents.
data = pd.read_csv(location_standdata).rename(columns={'SortDate': 'Year'})

## Data completeness

#### General data statistics:

In [67]:
# Preview the first 10 publications to get a feel for the columns and their content
data.head(n=10)

Unnamed: 0.1,Unnamed: 0,BrefID,Year,BibLvlCode,StandardTitle,RefStrFull,AbstractEnglish,Refstringauthors,WoScode,FullAut,...,wos_keywords,wos_plus_keywords,wos_categories,wos_researcharea,stand_affil,similarity_method,stand_country,stand_flemish,stand_GROUP,stand_QH
0,0,213282,2012,MS,Book of abstracts - VLIZ Young Marine Scientis...,"<b>Mees, J.; Seys, J. (Ed.)</b> (2012). Book o...",,"Mees, J.; Seys, J. (Ed.)",,"Seys, Jan, J.",...,,,,,Flanders Marine Institute (VLIZ),,,,,
1,1,213296,2012,AMS,How many known species in the Ocean and in WoRMS?,"<b>Appeltans, W.; Costello, M.J.; Decock, W.; ...",,"Appeltans, W.; Costello, M.J.; Decock, W.; Van...",,"Decock, Wim, W.",...,,,,,Flanders Marine Institute (VLIZ),,,,,
2,2,213337,2012,AMS,Talking to the WoRMS: what can VLIZ web servic...,"<b>Deneudt, K.; Vanhoorne, B.; Appeltans, W.; ...",,"Deneudt, K.; Vanhoorne, B.; Appeltans, W.; Her...",,"Vanhoorne, Bart, B.",...,,,,,Flanders Marine Institute (VLIZ),,,,,
3,3,213372,2012,AMS,The Belgian Register of Marine Species - BeRMS,<b>VLIZ Belgian Marine Species Consortium</b> ...,,VLIZ Belgian Marine Species Consortium,,VLIZ Belgian Marine Species Consortium,...,,,,,Flanders Marine Institute (VLIZ),,,,,
4,4,215501,2012,AS,Global diversity of sponges (Porifera),"<b>Van Soest, R.W.M.; Boury-Esnault, N.; Vacel...",With the completion of a single unified classi...,"Van Soest, R.W.M.; Boury-Esnault, N.; Vacelet,...",WOS:000305336000024,"Vanhoorne, Bart, B.",...,,NORTH-EAST ATLANTIC; CALCAREOUS SPONGES; ASBES...,Multidisciplinary Sciences,Science & Technology - Other Topics,VLIZ Flanders Marine Institute,x,Belgium,x,Research institute,Science
5,5,215650,2012,AS,Quantifying the global wave power resource,"<b>Gunn, K.; Stock-Williams, C.</b> (2012). Qu...",Justifying continued development and large-sca...,"Gunn, K.; Stock-Williams, C.",WOS:000302821800034,"Gunn, Kester, K.",...,Wave; Wave energy; Wave power; Wave resource; ...,ENERGY; OCEAN,Green & Sustainable Science & Technology; Ener...,Science & Technology - Other Topics; Energy & ...,E.ON New Build & Technology Ltd.,,UK,,Company,Industry
6,6,215652,2012,AS,"Seabird conservation status, threats and prior...","<b>Croxall, J.P.; Butchart, S.H.M.; Lascelles,...","We review the conservation status of, and thre...","Croxall, J.P.; Butchart, S.H.M.; Lascelles, B....",WOS:000301298900001,"Croxall, J.P.",...,,BIODIVERSITY; ERADICATION; ALBATROSSES; INDICA...,Biodiversity Conservation; Ornithology,Biodiversity & Conservation; Zoology,BirdLife International,,Global,,Non-profit organization,Civil society
7,7,215780,2012,AS,First record of the pelagic fish species blue ...,"<b>Van Ginderdeuren, K.; Hoffman, S.; Vandendr...",,"Van Ginderdeuren, K.; Hoffman, S.; Vandendries...",WOS:000304055400009,"Van Ginderdeuren, Karl, K.",...,blue whiting; Micromesistius poutassou; Belgia...,ENGLISH-CHANNEL; NORWAY POUT,Zoology,Zoology,Ghent University - Faculty of Sciences - Biolo...,,Belgium,x,Research institute,Science
8,8,215782,2012,AS,Updating the zooplankton species list for the ...,"<b>Van Ginderdeuren, K.; Fiers, F.; De Backer,...","Many marine species are threatened, and given ...","Van Ginderdeuren, K.; Fiers, F.; De Backer, A....",WOS:000304055400001,"De Backer, Annelies, A.",...,zooplankton; marine biodiversity; Belgian part...,SCHELDE ESTUARY; NETHERLANDS; ABUNDANCE; WATER,Zoology,Zoology,"Research Institute for Agriculture, Fisheries ...",,,,,
9,9,215856,2012,M,Pliocene Panamanian Gateway tectonics and clim...,"<b>Van Renterghem, C.</b> (2012). Pliocene Pan...",,"Van Renterghem, C.",,"Van Renterghem, Cédéric, C.",...,,,,,,,,,,


In [68]:
# List every column name available in the loaded publication table
data.columns

Index(['Unnamed: 0', 'BrefID', 'Year', 'BibLvlCode', 'StandardTitle',
       'RefStrFull', 'AbstractEnglish', 'Refstringauthors', 'WoScode',
       'FullAut', 'Affiliation', 'DOI', 'ownDOI', 'Special Collections',
       'GeoTerms', 'TaxTerms', 'ThesTerms', 'OtherTerms', 'AuthorKeywords',
       'wos_affil', 'wos_country', 'wos_keywords', 'wos_plus_keywords',
       'wos_categories', 'wos_researcharea', 'stand_affil',
       'similarity_method', 'stand_country', 'stand_flemish', 'stand_GROUP',
       'stand_QH'],
      dtype='object')

In [69]:
# Summary statistics (count, mean, std, quartiles, ...) as produced by describe() with its default settings
data.describe()

Unnamed: 0.1,Unnamed: 0,BrefID,Year,ownDOI
count,300.0,300.0,300.0,0.0
mean,149.5,235001.23,2013.433333,
std,86.746758,9569.908181,0.956699,
min,0.0,213282.0,2012.0,
25%,74.75,228189.0,2013.0,
50%,149.5,238013.5,2013.0,
75%,224.25,244260.5,2014.0,
max,299.0,247339.0,2016.0,


In [70]:
# Data types of the identifier and the standardized ('stand_*') columns.
# The columns are selected from `data` directly: `stand_data` is only created
# further down in the notebook, so referencing it here raises a NameError when
# the notebook is run top to bottom on a fresh kernel.
data[['BrefID', 'stand_affil', 'stand_country', 'stand_flemish', 'stand_GROUP', 'stand_QH']].dtypes

BrefID            int64
stand_affil      object
stand_country    object
stand_flemish    object
stand_GROUP      object
stand_QH         object
dtype: object

### Publications with WoS information available:

In [77]:
# Keep the publication identifier together with all Web of Science ('wos') columns
wos_data = data[[
    'BrefID',
    'WoScode',
    'wos_affil',
    'wos_country',
    'wos_keywords',
    'wos_plus_keywords',
    'wos_categories',
    'wos_researcharea',
]]

In [78]:
# Describe both the integer and the text (object) columns;
# the 'count' row gives the number of non-missing entries per column
include = ['int64', 'object']
wosdata_summ = wos_data.describe(include=include)
wosdata_summ

Unnamed: 0,BrefID,WoScode,wos_affil,wos_country,wos_keywords,wos_plus_keywords,wos_categories,wos_researcharea
count,300.0,180,163,163,124,146,163,163
unique,,180,152,37,124,146,64,59
top,,WOS:000360936200025,"Ege Univ, Fac Fisheries, Dept Hydrobiol, Izmir...",Belgium,Human impact; Management plans; Protected area...,NATURAL-GAS; CH4; UNCERTAINTY; FOOTPRINT; DECADES,Zoology,Zoology
freq,,1,4,31,1,1,29,30
mean,235001.23,,,,,,,
std,9569.908181,,,,,,,
min,213282.0,,,,,,,
25%,228189.0,,,,,,,
50%,238013.5,,,,,,,
75%,244260.5,,,,,,,


In [80]:
# Absolute number of filled-in entries per column ('count' is the first row of the summary)
wosdata_completeness_count = wosdata_summ
wosdata_completeness_count.loc['count']

BrefID               300
WoScode              180
wos_affil            163
wos_country          163
wos_keywords         124
wos_plus_keywords    146
wos_categories       163
wos_researcharea     163
Name: count, dtype: object

In [81]:
# Completeness in percent: each row of the summary is divided by its BrefID value.
# For the 'count' row this is the share of publications with the column filled in,
# relative to the BrefID count.
wosdata_completeness_perc = wosdata_summ.div(wosdata_summ['BrefID'], axis='index').mul(100)
wosdata_completeness_perc.loc['count']

BrefID                   100
WoScode                   60
wos_affil            54.3333
wos_country          54.3333
wos_keywords         41.3333
wos_plus_keywords    48.6667
wos_categories       54.3333
wos_researcharea     54.3333
Name: count, dtype: object

### Publications with standardized information available:

In [71]:
# Keep the columns holding standardized information.
# BrefID is included as the reference column: it is always present and therefore complete.
stand_data = data[[
    'BrefID',
    'stand_affil',
    'stand_country',
    'stand_flemish',
    'stand_GROUP',
    'stand_QH',
]]

In [73]:
# Describe both the integer and the text (object) columns;
# the 'count' row gives the number of non-missing entries per column
include = ['int64', 'object']
standdata_summ = stand_data.describe(include=include)
standdata_summ

Unnamed: 0,BrefID,stand_affil,stand_country,stand_flemish,stand_GROUP,stand_QH
count,300.0,222,160,56,160,160
unique,,123,32,2,11,4
top,,Flanders Marine Institute (VLIZ),Belgium,x,Research institute,Science
freq,,34,56,52,85,148
mean,235001.23,,,,,
std,9569.908181,,,,,
min,213282.0,,,,,
25%,228189.0,,,,,
50%,238013.5,,,,,
75%,244260.5,,,,,


In [74]:
# Absolute number of filled-in entries per column ('count' is the first row of the summary)
standdata_completeness_count = standdata_summ
standdata_completeness_count.loc['count']

BrefID           300
stand_affil      222
stand_country    160
stand_flemish     56
stand_GROUP      160
stand_QH         160
Name: count, dtype: object

In [82]:
# Completeness in percent: each row of the summary is divided by its BrefID value.
# For the 'count' row this is the share of publications with the column filled in,
# relative to the BrefID count.
standdata_completeness_perc = standdata_summ.div(standdata_summ['BrefID'], axis='index').mul(100)
standdata_completeness_perc.loc['count']

BrefID               100
stand_affil           74
stand_country    53.3333
stand_flemish    18.6667
stand_GROUP      53.3333
stand_QH         53.3333
Name: count, dtype: object