In [112]:
import pandas as pd
import numpy as np
import datetime
import json
import pivottablejs
from IPython.display import HTML
import IPython.core.display as di
from IPython.display import display
from IPython.display import Markdown as md

In [113]:
#!pip install --user pyarrow

In [114]:
#df = pd.read_csv('metadata/Results-20180101.csv', low_memory=False, sep=',')
df = pd.read_csv('metadata/noronet_metadata.csv', low_memory=False, sep=',')#, index_col=0)

We should group sequences based on their Genogroups, and there are 4 columns for this info

In [115]:
s = set()
for i in range(1,5):
    s = s.union(set(df['seq %d: Genus / Genogroup'%i].unique()))
s

{'Caliciviridae Norovirus GI',
 'Caliciviridae Norovirus GII',
 'Caliciviridae Norovirus GIII',
 'Caliciviridae Norovirus GIV',
 'Caliciviridae Norovirus GV',
 'Caliciviridae Norovirus GVII',
 'Caliciviridae Sapovirus GI',
 'Caliciviridae Sapovirus GII',
 'Unassigned',
 nan}

In [116]:
available_genogroups = list(s)
available_genogroups.remove(np.nan)
available_genogroups.remove('Unassigned')
available_genogroups

['Caliciviridae Norovirus GV',
 'Caliciviridae Sapovirus GI',
 'Caliciviridae Norovirus GII',
 'Caliciviridae Sapovirus GII',
 'Caliciviridae Norovirus GVII',
 'Caliciviridae Norovirus GIV',
 'Caliciviridae Norovirus GI',
 'Caliciviridae Norovirus GIII']

In [117]:
for i in range(1,5):
    gengroup = 'seq %d: Genus / Genogroup'%i
    #print(df[gengroup].fillna().sum())
    df[df[gengroup].isna()][gengroup] = 'Unassigned'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


#### We will create a new DataFrame, but each seqX will be in a separate row
For that we need to determine, which columns/metadata will be copied along

In [118]:
#df.columns

In [119]:
#First get rid of the columns that contain a summary of other columns
drop_cols = ['ORF1', 'ORF1 variant', 'ORF2', 'ORF2 variant', 'Genus / Genogroup' ]
df = df.drop(drop_cols, axis=1)

In [120]:
copycols = list(df.columns)
#print(copycols)
copycols = ['User', 'Institute', 'Database ID', 'Country', 'Submission Date','Last Update', 'Outbreak Nr', 'Sample Date', 'Dutch lab Y/N', 'Nosocomial infection Y/N (sampledate > 2days after date of hospitalization)', 'Outbreak or sporadic case O/S',
 'Source of the sample', 'Specify source', 'Specify animal', 'Suspected country of infection', 'Date of first disease', 'Geo-coded location', 'Suspected mode of transmission',
 'Specify other mode of transmission', 'Food item', 'Specify food item', 'Setting of the outbreak', 'Specify setting', 'Point source transmission Y/N', 'Date of point source transmission',
 'Nr of persons affected', 'Nr of persons at risk', 'Nr of cases deceased', 'Nr of cases hospitalized due to infection', 'Nr of cases of ages 0 to 4', 'Nr of cases of ages 5 to 14',
 'Nr of cases of ages 15 to 64', 'Nr of cases of age 65 or older', 'Nr of cases with vomiting', 'Nr of cases with diarrhea', 'Nr of cases with vomiting AND diarrhea',
 'Mixed infection Y/N', 'Specify other pathogen(s)', 'Nr of samples tested', 'Nr of PCR positive samples', 'Nr of PCR negative samples', 'Included in II.4 P2 capsid surveillance',
 'fasta_id', 'reference_id', 'fragment_begin', 'Genus-Genogroup', 'ORF1', 'ORF1_variant', 'ORF2', 'ORF2_variant', 'Reference_sequence_for_positions', 'Cluster']

In [121]:
tdfs = []
for si in range(1,5):
    rename_dict = {'seq %d: fasta id'%(si): 'fasta_id',
                   'seq %d: reference id'%(si): 'reference_id',
                   'seq %d: fragment begin'%(si): 'fragment_begin',
                   'seq %d: fragment end'%(si): 'fragment_end',
                   'seq %d: Genus / Genogroup'%(si): 'Genus-Genogroup',
                   'seq %d: ORF1'%(si): 'ORF1',
                   'seq %d: ORF1 variant'%(si): 'ORF1_variant',
                   'seq %d: ORF2'%(si): 'ORF2',
                   'seq %d: ORF2 variant'%(si): 'ORF2_variant',
                   'seq %d: Reference sequence for positions'%(si): 'Reference_sequence_for_positions',
                   'seq %d: Cluster'%(si): 'Cluster'}
    tdfs.append(df.rename(rename_dict, axis=1)[copycols])
#     for ir, r in tdf.iterrows():
#         gengroup = 'Genus-Genogroup'
#         if r[gengroup] in available_genogroups:
#             ndf.append(r, ignore_index=True)

ndf = pd.concat(tdfs)

In [122]:
print(ndf.shape)
ndf = ndf[ndf['Genus-Genogroup'].notna()]
ndf = ndf.reset_index(level=0, drop=True)
print(ndf.shape)
print(ndf[ndf['Genus-Genogroup']=='Unassigned'].shape)

(82912, 52)
(26527, 52)
(201, 52)


## Data cleaning
drop columns with irrelevant data

Get rid of columns with no value

In [123]:
for c in ndf.columns:
    if ndf[c].notna().sum()==0:
        print("Dropping column: %s"%c)
        ndf.drop(c, axis=1, inplace=True)
    

Dropping column: Nosocomial infection Y/N (sampledate > 2days after date of hospitalization)


In [124]:
drop_irrelevant_cols = ['Dutch lab Y/N',  'Nr of cases with vomiting AND diarrhea']
ndf.drop(drop_irrelevant_cols, axis=1, inplace=True)

In [125]:
# Looks better this way
colname = 'Suspected country of infection'
ndf[ndf[colname]=='Same as reporting country'][colname] = ndf[ndf[colname]=='Same as reporting country']['Country']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


#### split ORFs into main and subtype

In [126]:
ndf['ORF1'].unique()

array([nan, 'GII.P7', 'GII.P4', 'GII.Pa', 'GII.P3', 'GII.P1', 'GII.P12',
       'GII.P2', 'GI.Pb', 'Could not assign', 'GI.P2', 'GII.P5', 'GI.P4',
       'GI.P3', 'GII.Pg', 'GII.P6', 'GII.Pf', 'GII.P21', 'GII.Ph',
       'GI.P5', 'GII.P16', 'GII.Pr', 'GII.P8', 'GII.Pj', 'GII.Pe',
       'GI.P7', 'GII.P13', 'GI.Pd', 'GI.Pa', 'GI.P1', 'GI.Pf', 'GII.P22',
       'GI.P9', 'GII.P17', 'GI.P6', 'GII.Pm', 'GI.P8', 'GII.P20',
       'GII.Pc', 'GII.Pq', 'GII.P11', 'GII.P30 (GII.Pc)',
       'GI.P13 (GI.Pd)', 'GII.P31 (GII.Pe)', 'GI.P14 (GI.Pf)',
       'GII.P21 (GII.Pb)', 'GI.P11 (GI.Pb)', 'GII.P15'], dtype=object)

In [127]:
ndf.loc[ndf['ORF1']=='Could not assign', 'ORF1'] = 'Unknown'
ndf.loc[ndf['ORF2']=='Could not assign', 'ORF2'] = 'Unknown'

In [128]:
ndf['ORF1_sub'] = ndf['ORF1'].apply(lambda x: x.split(".")[-1] if type(x) != np.float else 'Unknown')
ndf['ORF1'] = ndf['ORF1'].apply(lambda x: x.split(".")[0] if type(x) != np.float else 'Unknown')
ndf['ORF2_sub'] = ndf['ORF2'].apply(lambda x: x.split(".")[-1] if type(x) != np.float else 'Unknown')
ndf['ORF2'] = ndf['ORF2'].apply(lambda x: x.split(".")[0] if type(x) != np.float else 'Unknown')

In [129]:
# Correct Typos
ndf.loc[ndf['ORF1_sub']=='Pd)', 'ORF1_sub'] = 'Pd'
ndf.loc[ndf['ORF1_sub']=='Pc)', 'ORF1_sub'] = 'Pc'
ndf.loc[ndf['ORF1_sub']=='Pe)', 'ORF1_sub'] = 'Pe'

### Sort out columns depending on their type of data
* with boolean (e.g. yes/no)
* with only strings
* with few integers
* with a range of numbers
* with datetime data

Then we could put columns with fewer unique values in the columns of the pivottable

In [130]:
booleans = ['Outbreak or sporadic case O/S', 'Point source transmission Y/N', 'Mixed infection Y/N', 'Included in II.4 P2 capsid surveillance']
numbers = [ 'Nr of persons affected', 'Nr of persons at risk', 'Nr of cases deceased', 'Nr of cases hospitalized due to infection', 'Nr of cases of ages 0 to 4',
           'Nr of cases of ages 5 to 14', 'Nr of cases of ages 15 to 64', 'Nr of cases of age 65 or older', 'Nr of cases with vomiting', 'Nr of cases with diarrhea',
            'Nr of samples tested', 'Nr of PCR positive samples', 'Nr of PCR negative samples']
geo = [ 'Institute', 'Country', 'Suspected country of infection']
dates = ['Submission Date', 'Last Update', 'Sample Date','Date of first disease','Date of point source transmission',]
ids = ['User', 'Database ID', 'Outbreak Nr']
pathological = ['Source of the sample', 'Specify source', 'Suspected mode of transmission', 'Specify other mode of transmission', 'Food item', 'Specify food item',
 'Setting of the outbreak', 'Specify setting', 'Specify other pathogen(s)']
seq_data =['fasta_id', 'reference_id', 'fragment_begin', 'Genus-Genogroup', 'ORF1', 'ORF1_sub', 'ORF1_variant', 'ORF2', 'ORF2_sub', 'ORF2_variant', 'Reference_sequence_for_positions', 'Cluster']
etc = ['Geo-coded location']

### Convert some columns into categorycal type

In [131]:
for c in ['Country', 'ORF1_sub', 'ORF2_sub', 'Genus-Genogroup', 'Setting of the outbreak']:
    ndf[c] = ndf[c].astype('category')

### convert date to datetime

In [132]:
dates

['Submission Date',
 'Last Update',
 'Sample Date',
 'Date of first disease',
 'Date of point source transmission']

In [133]:
 a = pd.to_datetime(ndf['Sample Date'], errors='coerce')

In [134]:
a[a==a.isna()]


Series([], Name: Sample Date, dtype: datetime64[ns])

In [135]:
for i in range(len(dates)):
    print(i)
    ndf.loc[:, dates[i]] = pd.to_datetime(ndf[dates[i]], errors='coerce')

0
1
2
3
4


In [136]:
ndf[dates].head()

Unnamed: 0,Submission Date,Last Update,Sample Date,Date of first disease,Date of point source transmission
0,1992-01-01,NaT,1992-01-01,1992-01-01,NaT
1,1994-01-01,NaT,1994-01-01,1994-01-01,NaT
2,1994-01-01,NaT,1994-01-01,1994-01-01,NaT
3,1997-01-01,NaT,1997-01-01,1997-01-01,NaT
4,1997-01-01,NaT,1997-01-01,1997-01-01,NaT


In [137]:
ndf = ndf[ndf['Sample Date'].notna()]

In [142]:
ndf = ndf.reset_index(drop=True)

### Saving data

In [143]:
# dataframe
#output = "noronet_clean.fr"
output = "noronet_all_clean.fr"
#output_categories = "categories.json"
output_categories = "categories_all.json"
ndf.to_feather(output)

# categories
dict_categories = {'geo':geo,
                  'booleans':booleans,
                  'numbers': numbers,
                  'dates': dates,
                   'ids': ids,
                   'pathological': pathological,
                   'seq_data': seq_data,
                   'etc': etc
                  }

with open(output_categories, 'w') as f:
    f.write(json.dumps(dict_categories))


### Lets see each categories in detail, and check if we didn't miss any detail

In [None]:
#booleans
for c in booleans:
    print(ndf[c].unique())
ndf[booleans].info()

In [None]:
#numbers
dict_numbers_unique_values = { c: len(ndf[c].unique()) for c in numbers}
print("Number of unique values for each column")
for k in dict_numbers_unique_values.keys():
    print(k, "\t",dict_numbers_unique_values[k])

print("\nInformations about the numbers category")
ndf[numbers].info()

In [None]:
#geo
dict_unique_values = { c: len(ndf[c].unique()) for c in geo}
print("Number of unique values for each column")
for k in dict_unique_values.keys():
    print(k, "\t",dict_unique_values[k])

print("\nInformations about the geo data")
ndf[geo].info()

In [None]:
#dates
dict_unique_values = { c: len(ndf[c].unique()) for c in dates}
print("Number of unique values for each column")
for k in dict_unique_values.keys():
    print(k, "\t",dict_unique_values[k])

print("\nInformations about the numbers category")
ndf[dates].info()

In [None]:
#pathological
dict_unique_values = { c: len(ndf[c].unique()) for c in pathological}
print("Number of unique values for each column")
for k in dict_unique_values.keys():
    print(k, "\t",dict_unique_values[k])

print("\nInformations about the pathological category")
ndf[pathological].info()

In [None]:
#seq
print("\nInformations about the the sequences")
ndf[seq_data].info()