In [10]:
import pandas as pd
from pathlib import Path

# Root directory that holds one sub-folder of OpenI images per class.
base_folder = Path('/workspace/data') / 'subfigure-classification'

# OpenI sub-folder name -> [higher_modality, modality].
# The modality code is an empty string where the folder has no finer label.
folders = dict(
    OPENI_GRAPHICS=['GRAPHICS', 'GFIG'],
    OPENI_CT=['ORGANISMS', 'DRCT'],
    OPENI_MICROSCOPY=['MICROSCOPY', ''],
    OPENI_MRI=['ORGANISMS', 'DRMR'],
    OPENI_OTHER=['OTHER', ''],
    OPENI_ULTRASOUND=['ORGANISMS', 'DRUS'],
    OPENI_XRAY=['ORGANISMS', 'DRXR'],
)

In [2]:
# Load the existing tab-separated label table and preview its first rows.
original_doc = '/workspace/data/higher_modality.csv'
df_original = pd.read_csv(
    original_doc,
    sep='\t',
)
df_original.head(5)

Unnamed: 0,img,modality,set,source,img_path,higher_modality,caption,split_set
0,11373_2007_9226_Fig1_HTML-10.jpg,DMFL,TRAIN,clef16,2016/train/DMFL/11373_2007_9226_Fig1_HTML-10.jpg,MICROSCOPY,"Colocalization of hNopp140, pol I and rDNA rep...",TRAIN
1,11373_2007_9226_Fig1_HTML-11.jpg,DMFL,TRAIN,clef16,2016/train/DMFL/11373_2007_9226_Fig1_HTML-11.jpg,MICROSCOPY,"Colocalization of hNopp140, pol I and rDNA rep...",TRAIN
2,11373_2007_9226_Fig1_HTML-12.jpg,DMFL,TRAIN,clef16,2016/train/DMFL/11373_2007_9226_Fig1_HTML-12.jpg,MICROSCOPY,"Colocalization of hNopp140, pol I and rDNA rep...",TRAIN
3,11373_2007_9226_Fig1_HTML-13.jpg,DMFL,TRAIN,clef16,2016/train/DMFL/11373_2007_9226_Fig1_HTML-13.jpg,MICROSCOPY,"Colocalization of hNopp140, pol I and rDNA rep...",TRAIN
4,11373_2007_9226_Fig1_HTML-14.jpg,DMFL,TRAIN,clef16,2016/train/DMFL/11373_2007_9226_Fig1_HTML-14.jpg,MICROSCOPY,"Colocalization of hNopp140, pol I and rDNA rep...",TRAIN


In [31]:
import os
# Build one metadata row per OpenI PNG. Values follow the column order
# img, modality, set, source, img_path, higher_modality, caption, split_set.
rows = []

for folder_name, (higher_modality, modality) in folders.items():
    folder_path = base_folder / folder_name
    # os.listdir returns entries in arbitrary order. Sorting makes the row
    # order deterministic, so the seeded train/validation split computed from
    # these rows is reproducible across machines and re-runs.
    documents = sorted(x for x in os.listdir(folder_path) if x.endswith('.png'))
    for doc in documents:
        rows.append([doc,
                     modality,
                     'TRAIN',                    # every OpenI image is a training candidate
                     'openi',
                     f"{folder_name}/{doc}",     # path relative to base_folder
                     higher_modality,
                     'skipped',                  # placeholder: no caption is collected here
                     'TRAIN'])

In [32]:
# Column names, in the same order as the values stored in each row.
header = ['img', 'modality', 'set', 'source', 'img_path', 'higher_modality', 'caption', 'split_set']
# Guard the insert so that re-running this cell does not stack a second header
# row, which would otherwise be written to the CSV as a bogus data row.
if not rows or rows[0] != header:
    rows.insert(0, header)

In [33]:
import csv
# Write the header and all OpenI rows as a tab-separated file.
output_csv = '/workspace/data/openi_vol1.csv'
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# locale default, while pd.read_csv decodes as UTF-8 by default; file names
# with non-ASCII characters would not round-trip on a non-UTF-8 locale.
# newline='' lets the csv module control line endings itself.
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    writer.writerows(rows)

In [36]:
# Reload the OpenI rows through pandas so they get the same parsing as the
# original table, then append them to it.
# NOTE(review): empty modality strings in the written file are presumably read
# back as NaN by read_csv's default NA handling; confirm this is intended.
df_openi = pd.read_csv(output_csv, sep='\t')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs keep their own labels, so every label below len(df_openi) appears
# twice and label-based lookups return two rows.
df = pd.concat([df_original, df_openi], ignore_index=True)
df.shape

(14368, 8)

In [37]:
# Preview the first rows of the OpenI table.
df_openi.head(5)

Unnamed: 0,img,modality,set,source,img_path,higher_modality,caption,split_set
0,PMC4387739_12883_2015_303_Fig2_HTML-19.png,GFIG,TRAIN,openi,OPENI_GRAPHICS/PMC4387739_12883_2015_303_Fig2_...,GRAPHICS,skipped,TRAIN
1,PMC4387739_12883_2015_303_Fig2_HTML-20.png,GFIG,TRAIN,openi,OPENI_GRAPHICS/PMC4387739_12883_2015_303_Fig2_...,GRAPHICS,skipped,TRAIN
2,PMC4581556_NP2015-585783.003-2.png,GFIG,TRAIN,openi,OPENI_GRAPHICS/PMC4581556_NP2015-585783.003-2.png,GRAPHICS,skipped,TRAIN
3,PMC4727788_gr2-2.png,GFIG,TRAIN,openi,OPENI_GRAPHICS/PMC4727788_gr2-2.png,GRAPHICS,skipped,TRAIN
4,PMC4729030_13000_2016_455_Fig1_HTML-4.png,GFIG,TRAIN,openi,OPENI_GRAPHICS/PMC4729030_13000_2016_455_Fig1_...,GRAPHICS,skipped,TRAIN


In [38]:
# List the distinct higher-modality labels present in the OpenI rows.
df_openi['higher_modality'].unique()

array(['GRAPHICS', 'ORGANISMS', 'MICROSCOPY', 'OTHER'], dtype=object)

In [40]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation partition is repeatable.
random_state = 443

# Only rows flagged TRAIN take part in the train/validation split.
df_train = df[df.set == 'TRAIN']
X = df_train.img.values

# Encode the higher-modality labels as integers so the split can be stratified.
y = df_train.higher_modality.values
le = LabelEncoder()
le.fit(y)
y_ = le.transform(y)

# Hold out 20% of the TRAIN rows, preserving the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y_, test_size=0.2, random_state=random_state, stratify=y_)

# Set of training image names for constant-time membership tests. `x in X_train`
# on a NumPy array scans the whole array for every row, which makes the pass
# over the frame quadratic.
# NOTE(review): membership is decided by file name only; if `img` values are
# not unique across sources, a held-out row that shares a name with a training
# row is labelled TRAIN. Confirm names are unique.
train_imgs = set(X_train)


def get_split_set(x, set_name):
    """Return the split label for one row.

    Args:
        x: image file name of the row (its `img` value).
        set_name: the row's original `set` value.

    Returns:
        'TEST' when `set_name` is 'TEST'; otherwise 'TRAIN' if the image name
        was drawn into the training part of the split, else 'VAL'.
    """
    if set_name == 'TEST':
        return 'TEST'
    return 'TRAIN' if x in train_imgs else 'VAL'


# Assign by position from a plain list, so the result does not depend on index
# alignment and no row-wise DataFrame.apply is needed.
df['split_set'] = [get_split_set(img, set_name) for img, set_name in zip(df['img'], df['set'])]


In [42]:
# Persist the combined table as a tab-separated file without the index column.
output_path = '/workspace/data/higher_modality_vol1.csv'
df.to_csv(
    output_path,
    index=False,
    sep='\t',
)