The purpose of this notebook is to see if we can decrease the training time between experiments. We do this by using only a subset of the external data set then we will use the external dataset and the internal validation dataset to see the performance of the model

In [132]:
%load_ext autoreload
%autoreload 2

import sys
from collections import Counter

from pprint import pprint
import pandas as pd

from src.data import DataPaths
from src.data import single_class_counter

sys.path.append("../..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


We first look at the distribution of the labels bewteen both dataset

In [103]:
all_labels_df = pd.read_csv(DataPaths.TRAIN_ALL_LABELS)
all_labels_df['Target'] = [[int(i) for i in s.split()] for s in all_labels_df['Target']]

In [31]:
hpa_labels_df = pd.read_csv(DataPaths.TRAIN_HPA_V18_LABELS)
hpa_labels_df['Target'] = [[int(i) for i in s.split()] for s in hpa_labels_df['Target']]

In [32]:
pprint(sorted(single_class_counter(hpa_labels_df['Target'].values), key=lambda x: x[1]))

[(15, 0.0003679981775328351),
 (24, 0.0009287573052019171),
 (27, 0.0010163759189002113),
 (9, 0.0013318029282140698),
 (10, 0.0013493266509537285),
 (8, 0.0014369452646520228),
 (17, 0.00206779928327974),
 (20, 0.002330655124374622),
 (26, 0.003311983597795516),
 (16, 0.006659014641070349),
 (13, 0.008069674321612883),
 (18, 0.008683004617500942),
 (11, 0.009646809368182177),
 (12, 0.013537075816386433),
 (14, 0.014246786587342615),
 (3, 0.015490970901858392),
 (1, 0.01592906397034986),
 (22, 0.016884106859661265),
 (19, 0.0191884763999264),
 (6, 0.023805977341826497),
 (4, 0.02866881040208182),
 (5, 0.030009375191665716),
 (7, 0.05767933339758698),
 (2, 0.06352349493126319),
 (23, 0.06466253690934103),
 (21, 0.0878989932621286),
 (0, 0.24597173423522092),
 (25, 0.2553031165940893)]


In [30]:
kaggle_labels_df = pd.read_csv(DataPaths.TRAIN_LABELS)
kaggle_labels_df['Target'] = [[int(i) for i in s.split()] for s in kaggle_labels_df['Target']]

In [72]:
pprint(sorted(single_class_counter(kaggle_labels_df['Target'].values), key=lambda x: x[1]))

[(27, 0.0002166121854200307),
 (15, 0.000413532353983695),
 (10, 0.00055137647197826),
 (9, 0.0008861407585364893),
 (8, 0.0010436768933874208),
 (20, 0.0033870268992950256),
 (17, 0.00413532353983695),
 (24, 0.00634082942774999),
 (26, 0.006458981528888188),
 (16, 0.010436768933874208),
 (13, 0.010574613051868773),
 (12, 0.013548107597180102),
 (22, 0.015792997518805876),
 (18, 0.017762199204442518),
 (6, 0.01984955299121736),
 (14, 0.020991689968886614),
 (11, 0.021523374424008507),
 (1, 0.0246937891378835),
 (19, 0.029183568981135048),
 (3, 0.030739238312787995),
 (4, 0.03658776731912883),
 (5, 0.049486038360048834),
 (7, 0.05557087156866606),
 (23, 0.05838682997912646),
 (2, 0.07130479303690283),
 (21, 0.074376747666496),
 (25, 0.16202591469418298),
 (0, 0.25373163719428143)]


In [88]:
# def multi_class_counter(labels):
# sorted(Counter(tuple(l) for l in kaggle_labels_df['Target'].values).items(), key=lambda x: x[1])

Now we decide what to set the threshold as

In [95]:
def create_combined_training_examples(kaggle_labels_df, hpa_labels_df, threshold=0.02):
    include_below = 0.02
    included_labels = []
    for label, proportion in sorted(single_class_counter(kaggle_labels_df['Target'].values), key=lambda x: x[1]):
        if proportion < include_below:
            included_labels.append(label)
    rare_labels_from_hpa_df = hpa_labels_df[hpa_labels_df['Target'].map(lambda x: len(set(x) & set(included_labels)) > 0)]
    combined_training_df = pd.concat([kaggle_labels_df, rare_labels_from_hpa_df])
    return combined_training_df

In [202]:
threshold = 0.02
combined_training_df = create_combined_training_examples(kaggle_labels_df, hpa_labels_df)

In [205]:
combined_training_df.shape

(40815, 2)

In [206]:
# pprint(sorted(single_class_counter(combined_training_df['Target'].values), key=lambda x: x[1]))

In [207]:
val_ids = list(set(hpa_labels_df['Id'].values) - set(combined_training_df['Id'].values))

In [208]:
combined_val_df = all_labels_df[all_labels_df['Id'].isin(val_ids)]

In [209]:
combined_val_df.shape

(64863, 2)

Save the label csvs

In [210]:
combined_training_df['Target'] = combined_training_df['Target'].map(lambda x: " ".join(str(i) for i in x))
combined_val_df['Target'] = combined_val_df['Target'].map(lambda x: " ".join(str(i) for i in x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [211]:
combined_training_df.head()

Unnamed: 0,Id,Target
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,16 0
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,7 1 2 0
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,5
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,1
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,18


In [212]:
combined_val_df.head()

Unnamed: 0,Id,Target
31090,11651_1601_F3_3,23 0
31091,11651_1601_F3_4,23 0
31092,11651_1611_B3_2,23 0
31093,11651_1611_B3_3,23 0
31094,11651_1732_F3_13_cr580610dc203b9,23 0


In [213]:
combined_training_df.to_csv(DataPaths.TRAIN_HPA_KAGGLE_THRESH_0_02_LABELS, index=False)
combined_val_df.to_csv(DataPaths.VAL_HPA_KAGGLE_THRESH_0_02_LABELS, index=False)

Now generate the soft links

In [181]:
import os
from tqdm import tqdm_notebook

In [189]:
train_ids = combined_training_df['Id'].values
training_images_folder = DataPaths.TRAIN_HPA_KAGGLE_THRESH_0_02_COMBINED_IMAGES

In [194]:
val_ids = combined_val_df['Id'].values
val_images_folder = DataPaths.VAL_HPA_KAGGLE_THRESH_0_02_COMBINED_IMAGES

In [196]:
def generate_soft_links(source_images_folder, dest_images_folder, dest_images_id):
    dest_images_folder.mkdir(exist_ok=True, parents=True)
    id_lookups = Counter(dest_images_id)
    for p in source_images_folder.glob("*"):
        if id_lookups.get(p.stem) is not None:
            os.symlink(src=p, dst=dest_images_folder / p.name)

In [188]:
generate_soft_links(DataPaths.TRAIN_ALL_COMBINED_IMAGES, training_images_folder, train_ids)

FileExistsError: [Errno 17] File exists: '../../data/train_all_combined/711f1d5e-bbb2-11e8-b2ba-ac1f6b6435d0.npy' -> '../../data/train_hpa_kaggle_thresh_0_02/711f1d5e-bbb2-11e8-b2ba-ac1f6b6435d0.npy'

In [197]:
generate_soft_links(DataPaths.TRAIN_ALL_COMBINED_IMAGES, val_images_folder, val_ids)