# Data Set

In [7]:
import os
import pandas as pd

# Data locations (presumably the folders holding the train / test images).
train_path, test_path = 'Data/Train', 'Data/Test'

# Labels for the training images, one row per image.
train_labels = pd.read_csv('Data/train_labels.csv')

# Class balance: number of rows for each value of `label`.
train_labels['label'].value_counts()

0    130908
1     89117
Name: label, dtype: int64

## About the Data Set

### What data is available?
There are 220K labeled images and 57K evaluation images, each of size 96x96x3. The loaded train_labels.csv file gives the training set's labels as (id, label) rows, where `id` identifies the image file (referred to as [fileName] below).

- Label 0: Indicates that the image from [fileName] is negative (no cancer present).
- Label 1: Indicates that the image from [fileName] is positive (cancer present).

The labeled images have approximately a 60/40 split between negative and positive examples.

**Note:** According to the data set providers, the labeled examples and evaluation images come from the same distribution. We will assume this to be true.

### How will we split the train data set?


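We will split the labeled images into a 90/10 train/dev set below. The split is made separately for each class, so the train and dev sets keep roughly the same negative/positive ratio as the full labeled set.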

In [8]:
# Partition the labels by class: label 0 is negative, everything else positive.
neg_mask = train_labels['label'] == 0
negative_examples = train_labels.loc[neg_mask]
positive_examples = train_labels.loc[~neg_mask]

negative_examples_count = len(negative_examples)
positive_examples_count = len(positive_examples)

# Per-class cut-off: the first 90% of rows (rounded down) go to the train set.
train_pos_examples_count = int(0.9 * positive_examples_count)
train_neg_examples_count = int(0.9 * negative_examples_count)

# Leading 90% of each class -> train_set (positives stacked above negatives).
train_set_pos_examples = positive_examples.head(train_pos_examples_count)
train_set_neg_examples = negative_examples.head(train_neg_examples_count)
train_set = pd.concat([train_set_pos_examples, train_set_neg_examples])

# Remaining 10% of each class -> dev_set (same positives-then-negatives order).
dev_set_pos_examples = positive_examples.iloc[train_pos_examples_count:]
dev_set_neg_examples = negative_examples.iloc[train_neg_examples_count:]
dev_set = pd.concat([dev_set_pos_examples, dev_set_neg_examples])

In [9]:
# Preview only the first rows instead of printing the whole frame;
# a bare last expression also gets the notebook's rich table display.
train_set.head(20)

                                              id  label
1       c18f2d887b7ae4f6742ee445113fa1aef383ed77      1
6       a24ce148f6ffa7ef8eefb4efb12ebffe8dd700da      1
7       7f6ccae485af121e0b6ee733022e226ee6b0c65f      1
11      c3d660212bf2a11c994e0eadff13770a9927b731      1
14      94fa32b29cc1c00403176c0795fffa3cfaa0f20e      1
17      0b820b71670c039dd0a51333d1c919f471a9e940      1
19      d34af1e7500f2f3de41b0e6fdeb2ed245d814590      1
23      464327050ef07bb927f8bfb5c4e4dd5ebd4d3c09      1
24      6961bdcc16f6c1d7db88fc6a7823178288c2a29e      1
28      233bf46a575c1731821073e318c029e5df8b12ff      1
29      e6e31b49681a06c68aca9b64c42a4f002189a081      1
32      cb249c0b6e734bfaef451192ba5f25439bbac754      1
34      68f07184e9a7abac01f55470c17a6c4beffba09a      1
45      5300830e2fd06f96bb43e6fa21220222bb672421      1
50      6ea0128557604ba59ff2c4668e9b22677a587214      1
57      2a88b4c7dc358944bf6791218ddc582082092915      1
62      240305865761bc3f280f3058cdcf87c669e0a2f7

In [10]:
# Preview only the first rows instead of printing the whole frame;
# a bare last expression also gets the notebook's rich table display.
dev_set.head(20)

                                              id  label
197963  a0bec18382cb44c62d91c28dfe80486cdd14ef6c      1
197970  582981ad21e8c03d826c61032e947bf2c6f518ad      1
197972  975ecb969b9c9d61356400ecd89ab8bea1b7b89b      1
197973  959be9496930b29b95171ab9d19cea23f1d0e068      1
197985  de6a2eddbf0b7940d13acb20cf50936452ac1c75      1
197988  a24410c2c6a9b675ac75c9939da20eacae064068      1
197995  e746604a2adcb5430e3723aa1cf05bb5e10c8641      1
197997  b8fd524aa6a31649b631b7c99cd4d48688c49358      1
198000  ba60edf241b16bf5d05021014eec91390c40a7f8      1
198003  f66012005b1ce3ea5081988a18d8fe16429c14bf      1
198004  d253a02d2b3910858a9faac01283157355f69fba      1
198014  283261787bd8692e0a0b199f05333e11f08baef2      1
198018  dad44fa09c260c4eb1bfeab265b7ce671a835c65      1
198019  1fa588f45c7b6a10c8fa7188a8d32414edab0ea7      1
198029  296754d5b5f266abb482dca677bc5b322768c5af      1
198034  6c4a962433f9551b07a24d6a8dd5db4f2fde47bf      1
198035  64712d863ae7fa904aeda269fc784c6e7a7cf32b

Due to how I generated the train/dev set, all the negative and positive examples are grouped together.

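Shuffling the train set mixes the two classes, so the model does not see every example of one class before any example of the other; this should give our model better performance. The dev set is shuffled the same way for consistency.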

In [11]:
# Shuffle the train_set and dev_set.
# sample(frac=1) returns every row in a random order. The fixed random_state
# makes that order identical on every Restart & Run All, so anything computed
# from these frames is reproducible.
train_set = train_set.sample(frac=1, random_state=42)
dev_set = dev_set.sample(frac=1, random_state=42)

In [12]:
# Check that train_set is shuffled: the first rows should now mix labels 0 and 1.
# A bounded preview is enough for this check; no need to print the whole frame.
train_set.head(20)

                                              id  label
60552   78fa40a0656402378c845012d6cba58a9e902bf9      0
196870  80fb7fe681a203be45f6c46a86ef3a2c22bc7fe6      1
31732   8ccf30320f1b74b217185ff58686d7cee4571891      0
68787   be637b31d7d2d3ee49e47418a10d4297cf9a09cf      0
166590  b54fa48c3d14c74a4688323f5ba2591792afeb44      1
60039   876c580d1da8f55e1800f06ae2e393116b9114c0      0
43799   c45a9bf3814d9160fa1c6a46f7639b5f1fa823b2      1
110195  9e5202660cd49535d43d7188ae23578bc84662db      0
99758   80fd967ca33fb4a42fdfe17c1346cf1c4933d4de      0
16504   38504723727ea2d389f5424adfad3f158becb693      1
61026   471b992f13e75a38e220450aea4d1406bde6ac82      0
28456   5f08522b054b85928374dfe9e37f69fbbf277334      0
24542   d4b4f8aa4a843faf3448d409e02b7370c2315943      0
87000   8d4cd1870899a1cc22534e222a70d4bd90740c20      0
71188   dc3ac487b2dcf061622fce0f9b20a1f2e331fa5b      1
55613   46fdf2e01416fa43386486efa19f9f03808f54d8      0
130915  8936d9a5ce16cce6cab255d9880db05193f2ba6a

In [13]:
# Check that dev_set is shuffled: the first rows should now mix labels 0 and 1.
# A bounded preview is enough for this check; no need to print the whole frame.
dev_set.head(20)

                                              id  label
203471  e5895d6810411ba9f5f44e05b48460b7b0fef98f      0
208163  13671c9212e5c4158fc552b1b0b8fd629702a7ca      0
198917  01f7890d7571b3d5bfc7a2df47788dfd9e2ecd9e      0
215221  f63848f8ef20b034d1d789b754daf0394c793f56      0
202459  b49c2989859710e33d8caa32d142fc637fe0d805      0
199756  93e68baf43b8dcb6f2e0c7eab530545081e2e35c      0
208180  03c067a59309f431c873ecf7a612d9f63a3f39d8      1
218857  2cc88699e75286f8cef1760888f9778840c32154      0
206611  e42b7db0d3bf97dd5e2c7bfb9cf533d82ddb5d24      1
202164  6ec32c0edda1a4296c998457db04b2cf959ed86f      1
215268  185b46d5c3450744fb943bf27d869a2feff5d77e      1
215579  c83183c0f0c3fbf181896d50a3a2b1f104480fad      0
208691  d0d14c020cd53500d37fb7faff719ea948e159dc      1
202995  7119f4b073999957ec8ae9fd9858d243e46e63a4      0
204296  a1dfa269a94bff0b086403c6d7b69fbcb659eeb1      0
201776  cae7a634f712bae11741eb0d6c8d6160da0622c4      0
200786  8becabf1f4a083dcb07e1dbd477fae16a295411f

In [14]:
def load_subset_train_labels(sample_size, positive_pool=None, negative_pool=None,
                             random_state=None):
    """Return a shuffled subset of the training labels, split 50/50 by class.

    Half of the rows (rounded down) are taken from the top of the positive
    pool and the rest from the top of the negative pool, so an odd
    ``sample_size`` yields one extra negative row. If a pool holds fewer rows
    than requested, all of its rows are used and the result is shorter than
    ``sample_size``.

    Parameters
    ----------
    sample_size : int
        Total number of rows wanted. Must be non-negative.
    positive_pool, negative_pool : pandas.DataFrame, optional
        Frames to draw each class from. They default to the notebook's
        ``train_set_pos_examples`` / ``train_set_neg_examples`` (the 90%
        training partitions), so dev-set rows are never returned.
    random_state : int, optional
        Seed for the final shuffle. ``None`` (the default) gives a different
        order on every call.

    Returns
    -------
    pandas.DataFrame
        The selected rows in shuffled order, keeping their original index.

    Raises
    ------
    ValueError
        If ``sample_size`` is negative.
    """
    sample_size = int(sample_size)
    if sample_size < 0:
        # A negative count would turn into a negative slice bound below and
        # silently return almost the entire pool.
        raise ValueError("sample_size must be non-negative, got %d" % sample_size)

    # Draw from the training partitions only; slicing the full per-class
    # frames would leak dev-set rows once sample_size grows large enough.
    if positive_pool is None:
        positive_pool = train_set_pos_examples
    if negative_pool is None:
        negative_pool = train_set_neg_examples

    desired_pos_examples = sample_size // 2
    # Give the leftover row of an odd sample_size to the negative class so the
    # total matches what the caller asked for.
    desired_neg_examples = sample_size - desired_pos_examples

    curr_train_set_pos_examples = positive_pool.iloc[:desired_pos_examples]
    curr_train_set_neg_examples = negative_pool.iloc[:desired_neg_examples]
    curr_train_set = pd.concat([curr_train_set_pos_examples, curr_train_set_neg_examples])

    # Shuffle so the two classes are interleaved rather than stacked.
    curr_train_set = curr_train_set.sample(frac=1, random_state=random_state)

    return curr_train_set

In [15]:
# load_subset_train_labels(2500) should give a 50/50 class split of 2500 train labels;
# print the label-0 rows first, then the label-1 rows, to inspect it.
temp = load_subset_train_labels(2500)
for label_value in (0, 1):
    print(temp[temp['label'] == label_value])

                                            id  label
2003  b6eaceae21723a97094e466615b698645d37deb7      0
10    a106469bbfda4cdc5a9da7ac0152927bf1b4a92d      0
1691  506c571c0b56b055b5b71a342feeb9addd0cb5b5      0
764   ce2cd44ab5a254012de7fb3de57e585aaa8d48f1      0
1152  76f8b098d94da7553fb5c8eb6da3f1b08d0cb49f      0
1655  aafbdfd08f338fa8eb59e76e4e5a026e81b1915b      0
1379  fb0dfac3b36a37c410e8c623af893105710fda0d      0
15    f416de7491a31951f79b3cee75b002f4d1bf0162      0
1373  ec18e31acc743edd6c0df731910ba18be6763adc      0
1681  e868c4944f021710656bf9cfe21641696139ca3c      0
467   846fca0efde134467495841bf54fef20e43d089d      0
1123  67739d1b6003f0368a04a2c2e0c8a44dc8deeeb5      0
1260  4707ac5f6378accbad6d3274fe42f75513744cd6      0
1728  609e5cd506f1e6aea54c4034d2cfb4414deb461f      0
982   20df3ece364e88719559514477dd5beebeb5e7e4      0
610   d8f7dce5e7480a9eb06c47cd63f54cfc39133a34      0
1677  29f492332bafa3dabf4de4d30ec587e1e7051ab1      0
413   707dd6e602da401d7789ac

In [19]:
# Positional lookup: the value in the 4th row, 1st column of `temp`
# (the `id` column, per the frames printed above). `temp` is shuffled
# without a seed, so this value can differ between runs.
temp.iloc[3, 0]

'506c571c0b56b055b5b71a342feeb9addd0cb5b5'