In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
plt.style.use('ggplot')

In [15]:
def generate_split_sets(df, target_col='higher_modality', test_size=0.12, val_size=0.2, seed=42, stratify=False):
    """Partition ``df`` into train, validation and test frames.

    The split is done in two steps: ``test_size`` of all rows is held out as
    the test set, then ``val_size`` of the *remaining* rows is held out as the
    validation set (so ``val_size`` is a fraction of the non-test rows, not of
    the whole frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    target_col : str
        Column that must exist in ``df``; used as the stratification label
        when ``stratify`` is true.
    test_size : float or int
        Forwarded as ``test_size`` to the first ``train_test_split`` call.
    val_size : float or int
        Forwarded as ``test_size`` to the second ``train_test_split`` call.
    seed : int
        ``random_state`` used for both splits.
    stratify : bool
        When true, both splits are stratified on ``target_col``. The default
        (false) keeps the plain shuffled split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, each with a fresh 0..n-1 index and a
        ``split_set`` column set to ``'TRAIN'``, ``'VAL'`` or ``'TEST'``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    if target_col not in df.columns:
        raise KeyError(target_col)

    # Only the frame itself is split: a label array that is split alongside it
    # and then thrown away has no influence on an unstratified partition.
    df_rest, df_test = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=df[target_col] if stratify else None,
    )
    df_train, df_val = train_test_split(
        df_rest,
        test_size=val_size,
        random_state=seed,
        stratify=df_rest[target_col] if stratify else None,
    )

    def _tag(part, label):
        # reset_index/assign both return new frames, so the caller's data is never touched.
        return part.reset_index(drop=True).assign(split_set=label)

    return _tag(df_train, 'TRAIN'), _tag(df_val, 'VAL'), _tag(df_test, 'TEST')

In [16]:
def load_udel_batch(list_csv, path_prefix, modality):
    """Load one UDel image listing and return it with metadata and split labels.

    Reads ``list_csv`` (which must have a ``filepath`` column), derives the
    ``img`` and ``img_path`` columns, stamps the constant metadata columns,
    and tags every row with a ``split_set`` via ``generate_split_sets``.

    Parameters
    ----------
    list_csv : str
        Path of the comma-separated listing file.
    path_prefix : str
        String prepended verbatim to ``filepath`` to form ``img_path``.
    modality : str
        Value written to the ``modality`` column for every row.

    Returns
    -------
    pandas.DataFrame
        Train, validation and test rows concatenated in that order, with a
        fresh 0..n-1 index.
    """
    batch = pd.read_csv(list_csv)
    # Third "/"-separated component of `filepath`; presumably the file name -- TODO confirm path depth.
    batch['img'] = batch['filepath'].str.split('/', expand=True)[2]
    # Plain string concatenation: no separator is inserted between prefix and `filepath`.
    batch['img_path'] = path_prefix + batch['filepath']
    batch['higher_modality'] = 'EXPERIMENTAL'
    batch['modality'] = modality
    batch['source'] = 'PUBMED'
    parts = generate_split_sets(batch, target_col='higher_modality')
    return pd.concat(list(parts), axis=0).reset_index(drop=True)


# Columns kept in the exported dataset.
columns = ['img', 'modality', 'source', 'img_path', 'higher_modality', 'split_set']

# CLEF image dataset: keep only the EXPERIMENTAL rows and normalise the 'GGEL' label.
# `split_set` is not assigned here, so it has to come from the CSV itself.
df_clef_GCEL = pd.read_csv('../data/higher_modality_vol1.csv', sep='\t')
df_clef_GCEL = df_clef_GCEL[df_clef_GCEL['higher_modality'] == 'EXPERIMENTAL'].reset_index(drop=True)
df_clef_GCEL['modality'] = df_clef_GCEL['modality'].replace({'GGEL': 'GEL'})
# NOTE(review): this cell used to assign `img_path` to itself (a no-op), which has been
# dropped. If a path prefix was intended for the CLEF rows, add it here.

# Gel images
df_gel = load_udel_batch('/mnt/gels/udel_gel_batch_1/list.csv', 'gels/udel_gel_batch_1', 'GEL')

# Plate images
df_plates = load_udel_batch('/mnt/plates/udel_plates_batch_1/list.csv', 'plates/udel_plates_batch_1', 'PLATES')

# Get final dataset
df = pd.concat([df_clef_GCEL[columns], df_gel[columns], df_plates[columns]], axis=0).reset_index(drop=True)
del df_clef_GCEL, df_gel, df_plates
df.to_csv('../data/experimental_dataset.csv', sep='\t', index=False)

In [17]:
# Fraction of rows in each split across the combined dataset.
df.loc[:, 'split_set'].value_counts(normalize=True)

TRAIN    0.696508
VAL      0.174127
TEST     0.129366
Name: split_set, dtype: float64

In [18]:
# TRAIN/VAL proportions with TEST rows excluded. Filter rows on `split_set` itself
# instead of masking every cell of the frame and relying on value_counts dropping NaN.
df.loc[df['split_set'].isin(['TRAIN', 'VAL']), 'split_set'].value_counts(normalize=True)

TRAIN    0.8
VAL      0.2
Name: split_set, dtype: float64

In [19]:
# Fraction of rows contributed by each image source.
df.loc[:, 'source'].value_counts(normalize=True)

PUBMED    0.961776
clef16    0.035361
clef13    0.002864
Name: source, dtype: float64

In [20]:
# Reload the exported dataset, add a `target` column that mirrors `modality`,
# and overwrite the same file with the extended frame.
df = (
    pd.read_csv('../data/experimental_dataset.csv', sep='\t')
    .assign(target=lambda frame: frame['modality'].copy())
)
df.to_csv('../data/experimental_dataset.csv', sep='\t', index=False)