# LifeWatch publication - General data completeness 

This jupyter notebook contains general statistics about data completeness of the scientific publications within the LifeWatch project.

Import packages and load the data

In [1]:
# Load necessary libraries and functions
import sys
import os
import chardet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Select the data file
name_datafile = "TEST08122021.csv"

# Read data
location_standdata = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), "LW_publications_standardized", name_datafile)
data = pd.read_csv(location_standdata)
data.rename(columns={'SortDate': 'Year'}, inplace=True)
#print(data.columns)

## Data completeness

### Publications with affiliation information available, either in IMIS or from the WoS-export:

In [9]:
#Select affiliation related columns:
affil_data = data[['BrefID', 'Affiliation', 'wos_affil']]

In [34]:
#number of publication with affiliation information available in IMIS, and with affiliation info available from WoS
affil_data.count()

BrefID         499
Affiliation    286
wos_affil      279
dtype: int64

In [35]:
#percentage:


BrefID           0
Affiliation    213
wos_affil      220
dtype: int64

### Publications with WoS information available:

In [77]:
#Select 'wos'-columns:
wos_data = data[['BrefID','WoScode', 'wos_affil', 'wos_country', 'wos_keywords', 'wos_plus_keywords', 'wos_categories', 'wos_researcharea']]

In [78]:
#Summary statistics
include = ['int64', 'object']
wosdata_summ = wos_data.describe(include = include)
wosdata_summ

Unnamed: 0,BrefID,WoScode,wos_affil,wos_country,wos_keywords,wos_plus_keywords,wos_categories,wos_researcharea
count,300.0,180,163,163,124,146,163,163
unique,,180,152,37,124,146,64,59
top,,WOS:000360936200025,"Ege Univ, Fac Fisheries, Dept Hydrobiol, Izmir...",Belgium,Human impact; Management plans; Protected area...,NATURAL-GAS; CH4; UNCERTAINTY; FOOTPRINT; DECADES,Zoology,Zoology
freq,,1,4,31,1,1,29,30
mean,235001.23,,,,,,,
std,9569.908181,,,,,,,
min,213282.0,,,,,,,
25%,228189.0,,,,,,,
50%,238013.5,,,,,,,
75%,244260.5,,,,,,,


In [80]:
#Get absolute count:
wosdata_completeness_count = wosdata_summ
wosdata_completeness_count.iloc[0]

BrefID               300
WoScode              180
wos_affil            163
wos_country          163
wos_keywords         124
wos_plus_keywords    146
wos_categories       163
wos_researcharea     163
Name: count, dtype: object

In [81]:
#Get the percentage:
wosdata_completeness_perc = wosdata_summ.divide(wosdata_summ.BrefID, axis=0).multiply(100)
wosdata_completeness_perc.iloc[0]

BrefID                   100
WoScode                   60
wos_affil            54.3333
wos_country          54.3333
wos_keywords         41.3333
wos_plus_keywords    48.6667
wos_categories       54.3333
wos_researcharea     54.3333
Name: count, dtype: object

### Publications with standardized information available:

In [71]:
#Select columns with standardized information 
#(note: BrefID is included because this info is always present - therefore complete)
stand_data = data[['BrefID','stand_affil', 'stand_country', 'stand_flemish', 'stand_GROUP', 'stand_QH']]

In [73]:
#Summary statistics
include = ['int64', 'object']
standdata_summ = stand_data.describe(include = include)
standdata_summ

Unnamed: 0,BrefID,stand_affil,stand_country,stand_flemish,stand_GROUP,stand_QH
count,300.0,222,160,56,160,160
unique,,123,32,2,11,4
top,,Flanders Marine Institute (VLIZ),Belgium,x,Research institute,Science
freq,,34,56,52,85,148
mean,235001.23,,,,,
std,9569.908181,,,,,
min,213282.0,,,,,
25%,228189.0,,,,,
50%,238013.5,,,,,
75%,244260.5,,,,,


In [74]:
#Get absolute count:
standdata_completeness_count = standdata_summ
standdata_completeness_count.iloc[0]

BrefID           300
stand_affil      222
stand_country    160
stand_flemish     56
stand_GROUP      160
stand_QH         160
Name: count, dtype: object

In [82]:
#Get the percentage:
standdata_completeness_perc = standdata_summ.divide(standdata_summ.BrefID, axis=0).multiply(100)
standdata_completeness_perc.iloc[0]

BrefID               100
stand_affil           74
stand_country    53.3333
stand_flemish    18.6667
stand_GROUP      53.3333
stand_QH         53.3333
Name: count, dtype: object

In [None]:
#(note: why not 100%?)