# General data completeness 

This jupyter notebook contains general statistics about data completeness of information:
1. in the scientific LifeWatch publications  
2. of LifeWatch data-systems.

## 1. LifeWatch publications

In [106]:
# Load necessary libraries and functions
import os
import pandas as pd

In [107]:
# Specify location of data:
datafolder = "LW_publications_standardized" 
datafiles = ["LWpubs_stand_0_500.csv", 
             "LWpubs_stand_500_1000.csv", 
             "LWpubs_stand_1000_1500.csv", 
             "LWpubs_stand_1500_2000.csv", 
             "LWpubs_stand_2000_2500.csv",
             "LWpubs_stand_2500_3000.csv",
             "LWpubs_stand_3000_3500.csv",
             "LWpubs_stand_3500_4000.csv",
             "LWpubs_stand_4000_4500.csv",
             "LWpubs_stand_4500_5000.csv",
             "LWpubs_stand_5000_5500.csv",
             "LWpubs_stand_5500_6000.csv",
             "LWpubs_stand_6000_6227.csv"]

# Load data into single dataframe:
appended_data = []
for datafile in datafiles:
    data_loc = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), datafolder, datafile)
    data = pd.read_csv(data_loc)
    data.rename(columns={'SortDate': 'Year'}, inplace=True)
    appended_data.append(data)
    
data = pd.concat(appended_data)
#data.columns

### A. Publications with affiliation information available, either in IMIS or from the WoS-export:

In [108]:
#Select affiliation related columns:
affil_data = data[['BrefID', 'Affiliation', 'wos_affil']]

In [109]:
#number of publication with affiliation information available in IMIS, and with affiliation info available from WoS
affil_data_count = affil_data.count()
affil_data_count

BrefID         6226
Affiliation    1128
wos_affil      3224
dtype: int64

In [110]:
#percentage of publication with affiliation information available in IMIS, and with affiliation info available from WoS
affil_data_perc = affil_data_count.divide(affil_data_count.BrefID, axis=0).multiply(100)
affil_data_perc

BrefID         100.000000
Affiliation     18.117571
wos_affil       51.782846
dtype: float64

Sidenote: Special collection 'BioOracle' -> This special collection was added only recently to IMIS - hence very little  information is available

In [111]:
#select publications of the special collection
spcol_pubs = data[data['spcolNames'].str.contains('Bio-ORACLE')==True]

In [112]:
#percentage of publication with affiliation information available in IMIS, and with affiliation info available from WoS
spcol_pubs_count = spcol_pubs[['BrefID', 'Affiliation', 'wos_affil']].count()
affil_data_perc = spcol_pubs_count.divide(spcol_pubs_count.BrefID, axis=0).multiply(100)
affil_data_perc

BrefID         100.000000
Affiliation      6.832298
wos_affil        6.211180
dtype: float64

### B. Publications with WoS information available:

In [113]:
#Select 'wos'-columns:
wos_data = data[['BrefID','WoScode', 'wos_affil', 'wos_country', 'wos_keywords', 'wos_plus_keywords', 'wos_categories', 'wos_researcharea']]

In [114]:
#Summary statistics
include = ['int64', 'object']
wosdata_summ = wos_data.describe(include = include)
#wosdata_summ

In [115]:
#Get absolute count:
wosdata_completeness_count = wosdata_summ
wosdata_completeness_count.iloc[0]

BrefID               6226
WoScode              3597
wos_affil            3224
wos_country          3234
wos_keywords         2638
wos_plus_keywords    2981
wos_categories       3234
wos_researcharea     3234
Name: count, dtype: object

In [116]:
#Get the percentage:
wosdata_completeness_perc = wosdata_summ.divide(wosdata_summ.BrefID, axis=0).multiply(100)
wosdata_completeness_perc.iloc[0]

BrefID                   100
WoScode              57.7739
wos_affil            51.7828
wos_country          51.9435
wos_keywords         42.3707
wos_plus_keywords    47.8799
wos_categories       51.9435
wos_researcharea     51.9435
Name: count, dtype: object

### C. Publications with standardized information available:

In [117]:
#Select columns with standardized information 
#(note: BrefID is included because this info is always present - therefore complete)
stand_data = data[['BrefID','stand_affil', 'stand_country', 'stand_flemish', 'stand_GROUP', 'stand_QH']]

In [118]:
#Summary statistics
include = ['int64', 'object']
standdata_summ = stand_data.describe(include = include)
#standdata_summ

In [119]:
#Get absolute count:
standdata_completeness_count = standdata_summ
standdata_completeness_count.iloc[0]

BrefID           6226
stand_affil      3410
stand_country    2977
stand_flemish     354
stand_GROUP      2977
stand_QH         2977
Name: count, dtype: object

In [120]:
#Get the percentage:
standdata_completeness_perc = standdata_summ.divide(standdata_summ.BrefID, axis=0).multiply(100)
standdata_completeness_perc.iloc[0]

BrefID               100
stand_affil      54.7703
stand_country    47.8156
stand_flemish    5.68583
stand_GROUP      47.8156
stand_QH         47.8156
Name: count, dtype: object

## 2.LifeWatch Data-Systems

In [123]:
# Load necessary libraries and functions
import os
import pandas as pd

In [124]:
# Specify location of data:
datafile = "LW_datasystems_stand.csv"
datafolder = "LW_data_systems_standardized" 
# Load data
data_loc = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), datafolder, datafile)
data = pd.read_csv(data_loc)
data.rename(columns={'SortDate': 'Year'}, inplace=True)
#print(data.columns)

### A. Data-systems with missing standardized affiliation information

In [125]:
# Select usefull data - for general overview
data_affil = data[['DataSystem','Affiliation', 'stand_affil', 'stand_country', 'stand_flemish', 'stand_GROUP', 'stand_QH']]

In [126]:
#summary of dataframe (the number of rows with info for the specific column):
data_affil_summ = data_affil.describe()
data_affil_summ

Unnamed: 0,DataSystem,Affiliation,stand_affil,stand_country,stand_flemish,stand_GROUP,stand_QH
count,737,735,711,615,140,615,615
unique,3,553,469,50,2,20,4
top,Marine_species,VLIZ,VLIZ Flanders Marine Institute,Belgium,x,Research institute,Science
freq,363,49,49,144,120,205,422


In [127]:
# Select the rows without stand_affil info available -> to get idea why not standardized:
missing_standaffil = data_affil[data_affil['stand_affil'].isnull()]
missing_standaffil

Unnamed: 0,DataSystem,Affiliation,stand_affil,stand_country,stand_flemish,stand_GROUP,stand_QH
116,Marine_species,,,,,,
222,Marine_species,,,,,,
257,Marine_species,cefas,,,,,
278,Marine_species,,,,,,
305,Marine_species,"Gaiaguide,",,,,,
358,Marine_species,"Update taxonomy , Diving Canary Islands",,,,,
359,Marine_species,"Validate taxonomy of their marine species, Go...",,,,,
376,Marine_regions,?,,,,,
379,Marine_regions,?,,,,,
381,Marine_regions,?,,,,,


In [128]:
# data completeness of standardized rows:
standaffil_data = data[['Affiliation', 'stand_affil', 'stand_country', 'stand_flemish', 'stand_GROUP', 'stand_QH']]

In [129]:
# Absolute count:
stand_data_count = standaffil_data.describe().iloc[0]
stand_data_count

Affiliation      735
stand_affil      711
stand_country    615
stand_flemish    140
stand_GROUP      615
stand_QH         615
Name: count, dtype: object

In [130]:
# Percentage:
stand_data_perc = stand_data_count.divide(stand_data_count.Affiliation, axis=0).multiply(100)
stand_data_perc

Affiliation          100
stand_affil      96.7347
stand_country    83.6735
stand_flemish    19.0476
stand_GROUP      83.6735
stand_QH         83.6735
Name: count, dtype: object