# Interactive Semi Supervised Learning 
* This notebook demonstrates how to train a classifier from unlabeled data using semi-supervised learning and the Facets visualization tool.
* Assumes the MNIST dataset is unlabelled: we throw out the labels and try to classify using only the user's input.
* Uses facets tool for visualization
* User tags and labels data by clicking on the facets tool
* We modified Facets to interactively store the labeled data in the browser cache
* The labeled data is loaded from the browser's cache into the Jupyter notebook, and a classifier is trained on the user's choices.
* Allows user to interactively repeat and retrain. 
* After a couple of iterations the model improves significantly

# Step 1: Initialize and create initial visualization (do not need to repeat)
* Import facets library
* Get mnist data
* Visualize mnist data on facets

In [2]:
import sys
# Extend the module search path with the sibling "facets" directory.
# NOTE(review): presumably this is where the `dive` module imported in the
# next cell lives — confirm against the repository layout.
sys.path.append("../facets")

In [3]:
from dive import Facets

In [4]:
from tensorflow.examples.tutorials.mnist import input_data
import sklearn
from PIL import Image
import pandas as pd

# Load MNIST from the local 'MNIST_data' directory with one-hot labels.
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
# Alternative (disabled): use the 60000-image training split instead.
#x, y = mnist.train.next_batch(60000)
#x, y = sklearn.utils.shuffle(x, y, random_state=0)
#x = x.reshape((x.shape[0], 28, 28, 1))

# Take 2500 test images (enough to fill the 50 x 50 sprite grid built below).
# shuffle=False keeps the image order identical across kernel restarts, so the
# Ids (positions in x_test) that the user tagged in the browser keep pointing
# at the same digits when this cell is re-run.
x_test, y_test = mnist.test.next_batch(2500, shuffle=False)
x_test = x_test.reshape((x_test.shape[0], 28, 28, 1))

def array_to_sprite_atlas(image_array, num_sprites_x, num_sprites_y):
    """Tile a stack of grayscale images into one RGB mosaic (sprite atlas).

    Parameters
    ----------
    image_array : array of shape (num_images, img_height, img_width)
        Pixel intensities in the 0-1 range.
    num_sprites_x : int
        Number of sprites per atlas row.
    num_sprites_y : int
        Number of atlas rows.

    Returns
    -------
    PIL.Image.Image
        RGB image of size (num_sprites_x * img_width, num_sprites_y * img_height).
        Sprites are laid out row by row; grid cells beyond the last available
        image are left black.
    """
    # Mnist arrays are in 0-1 range, PIL needs 0-255
    image_array = image_array * 255
    # Array axis 1 is rows (height) and axis 2 is columns (width); PIL sizes and
    # paste offsets are expressed as (x, y) = (column, row).
    num_images = image_array.shape[0]
    image_height, image_width = image_array.shape[1], image_array.shape[2]
    atlas_width  = num_sprites_x * image_width
    atlas_height = num_sprites_y * image_height
    # We paste the samples to get indices arranged in the following way:
    # | 0 | 1 | 2 | 3 |
    # | 4 | 5 | 6 | 7 |
    atlas  = Image.new("RGB", (atlas_width, atlas_height), (0, 0, 0))
    for i in range(num_sprites_y):
        for j in range(num_sprites_x):
            index = num_sprites_x * i + j
            if index >= num_images:
                # Fewer images than grid cells: keep the remaining cells black
                # instead of failing with an IndexError.
                return atlas
            image = Image.fromarray(image_array[index, :, :])
            atlas.paste(image, (j*image_width, i*image_height))
    return atlas
# Drop the trailing channel axis, tile the images into a 50 x 50 grid and
# write the mosaic to disk as a JPEG.
flat_images = x_test.reshape(x_test.shape[0], 28, 28)
atlas = array_to_sprite_atlas(flat_images, 50, 50)
atlas.save("atlas.jpg", "JPEG")

# One row per image; Id is the image's position in x_test.
df = pd.DataFrame({'Id': list(range(len(x_test)))})
# Record-oriented JSON dump of the frame.
jsonstr = df.to_json(orient='records')

# Class names label_0 ... label_9, one per digit.
class_names = ['label_' + str(digit) for digit in range(10)]

fc = Facets()
fc.create_classes(labels=class_names)
fc.define_atlas(df, sprite_width=28, sprite_height=28, atlas_url='atlas.jpg')
fc.render_html('testing.html')

  from ._conv import register_converters as _register_converters


Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


# Initial Labels
## [Click to Label](testing.html)
* Select group you want to label <img src="screenshots/pick_group.png">


* Have 2-3 good examples for each group by clicking on the digits <img src="screenshots/click_examples.png">



* Must have at least one good example for each digit for this notebook to work

# Step 2: Initialize Dictionary  (do not need to repeat)

In [31]:
# Start with an empty-string placeholder under each of the ten class keys
# (label_0 ... label_9).
d = dict(('label_' + str(digit), '') for digit in range(10))

In [33]:
# Display the current contents of the label dictionary.
d

{'label_0': ['2259'],
 'label_1': ['2157'],
 'label_2': ['2442'],
 'label_3': ['332'],
 'label_4': ['234'],
 'label_5': ['972'],
 'label_6': ['721'],
 'label_7': ['736'],
 'label_8': ['410'],
 'label_9': ['176']}

# Step 3: Prepare Training Set  (repeat after step 5 )

In [48]:
# Ask the Facets helper to fill the notebook variable named 'd'.
# NOTE(review): the helper's implementation is not visible here; per the
# notebook introduction it presumably copies the user's tags from the browser
# cache into `d` — confirm.
fc.create_labeled_variables('d')

<IPython.core.display.Javascript object>

In [49]:
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(d)

{'label_4': ['216', '1269', '1268', '1093', '1824', '234', '1911', '1910', '1395', '1897', '284', '1917', '804'], 'label_5': ['2454', '760', '2436', '1127', '1858', '2203', '972', '695', '1779', '1856', '345', '1177', '2200', '1799', '1215', '1130', '1151', '1796', '1865', '1867', '2189', '724', '1682', '2416', '1413', '359'], 'label_6': ['1704', '1828', '896', '1847', '1153', '1671', '1875', '1102', '721', '829', '1508', '821', '1005', '1879'], 'label_7': ['1234', '1226', '1553', '887', '1051', '1120', '1218', '1273', '1274', '1125', '736', '1676'], 'label_0': ['1160', '1983', '1188', '2463', '2259', '1822', '380', '353', '1451', '2400', '2258', '2014', '770', '1882', '1172'], 'label_1': ['2136', '2395', '2413', '2157', '2393', '1376', '2399', '2106', '2409'], 'label_2': ['2442', '975', '1556', '581', '2180', '1533', '2021', '1686', '1016', '72'], 'label_3': ['1961', '332', '498', '1473', '2226', '1778', '2111', '423', '2261', '1390', '449', '1098', '1912', '1598', '481', '1042', '227

In [50]:
# Peek at the first rows of the current frame.
df.head()

Unnamed: 0,Id,Labels,ProbOne,ProbZeros,ProbTwos,ProbThrees,ProbFours,ProbFives,ProbSixs,ProbSevens,ProbEights,ProbNines,IsTraining
0,0,label_3,0.18,0.0,0.0,0.26,0.04,0.2,0.16,0.08,0.04,0.04,-1
1,1,label_1,0.84,0.0,0.02,0.0,0.02,0.0,0.0,0.06,0.06,0.0,-1
2,2,label_1,0.18,0.08,0.02,0.12,0.16,0.14,0.12,0.06,0.02,0.1,-1
3,3,label_4,0.02,0.02,0.06,0.02,0.46,0.06,0.06,0.04,0.06,0.2,-1
4,4,label_1,0.26,0.0,0.1,0.14,0.04,0.06,0.08,0.02,0.22,0.08,-1


In [51]:
# Invert the label -> [ids] mapping into id -> label. When an id appears
# under several labels, the label visited last wins.
omit_d = {}
for label, sample_ids in d.items():
    omit_d.update((sample_id, label) for sample_id in sample_ids)
omit_d

{'1003': 'label_9',
 '1005': 'label_6',
 '1016': 'label_2',
 '1042': 'label_3',
 '1051': 'label_7',
 '1057': 'label_8',
 '1087': 'label_9',
 '1093': 'label_4',
 '1098': 'label_3',
 '1102': 'label_6',
 '1120': 'label_7',
 '1125': 'label_7',
 '1127': 'label_5',
 '1130': 'label_5',
 '1134': 'label_8',
 '1151': 'label_5',
 '1153': 'label_6',
 '1160': 'label_0',
 '1172': 'label_0',
 '1177': 'label_5',
 '1188': 'label_0',
 '1190': 'label_8',
 '1215': 'label_5',
 '1218': 'label_7',
 '1226': 'label_7',
 '1234': 'label_7',
 '1268': 'label_4',
 '1269': 'label_4',
 '1273': 'label_7',
 '1274': 'label_7',
 '1323': 'label_9',
 '1334': 'label_8',
 '1376': 'label_1',
 '1390': 'label_3',
 '1395': 'label_4',
 '1402': 'label_9',
 '1413': 'label_5',
 '1451': 'label_0',
 '1473': 'label_3',
 '1492': 'label_9',
 '1508': 'label_6',
 '1533': 'label_2',
 '1540': 'label_8',
 '1553': 'label_7',
 '1556': 'label_2',
 '1598': 'label_3',
 '1610': 'label_8',
 '1622': 'label_8',
 '1624': 'label_3',
 '1671': 'label_6',


In [52]:
# Attach the user-supplied label to each row (keys of omit_d are the Ids as
# strings); rows without a label get -1.
df['Omit'] = df.Id.apply(lambda sample_id: omit_d.get(str(sample_id), -1))

In [53]:
# Count rows per label; -1 marks rows the user has not labelled.
df.Omit.value_counts()

-1         2341
label_5      26
label_3      23
label_9      22
label_8      15
label_0      15
label_6      14
label_4      13
label_7      12
label_2      10
label_1       9
Name: Omit, dtype: int64

# Step 4: Training Model  (repeat after step 3)

In [54]:
from sklearn.ensemble import RandomForestClassifier

# Flatten every 28x28 image into a 784-long feature vector and store it next
# to the user labels held in the Omit column.
one_dim = x_test.reshape(x_test.shape[0], 28*28)
df_train = df.copy()
df_train['Features'] = [one_dim[i].tolist() for i in range(x_test.shape[0])]

# Rows with Omit == -1 carry no user label; only labelled rows are used to fit.
df_tmp = df_train[df_train.Omit != -1]
if len(df_tmp) == 0:
    raise ValueError("No labeled examples found; label some digits in "
                     "testing.html and re-run Step 3 first.")
X = df_tmp.Features.values.tolist()
Y = df_tmp.Omit.values

# Fixed random_state so that retraining on the same labels gives the same model.
clf = RandomForestClassifier(n_estimators=50, random_state=0)
clf = clf.fit(X, Y)

# Score every image, labelled or not.
all_features = df_train.Features.values.tolist()
p_arr = clf.predict_proba(all_features)
_labels = clf.predict(all_features)

# predict_proba columns follow clf.classes_, which only contains the labels
# present in Y. Looking columns up by label (instead of by fixed position 0..9)
# keeps every probability in the right column even when a digit has no
# labelled example yet; such digits get probability 0.
prob_by_label = dict(zip(clf.classes_, p_arr.T))
no_prob = [0.0] * len(x_test)

# (output column, class label) pairs, in the column order this notebook has
# always produced (ProbOne ahead of ProbZeros).
prob_columns = [
    ('ProbOne', 'label_1'),
    ('ProbZeros', 'label_0'),
    ('ProbTwos', 'label_2'),
    ('ProbThrees', 'label_3'),
    ('ProbFours', 'label_4'),
    ('ProbFives', 'label_5'),
    ('ProbSixs', 'label_6'),
    ('ProbSevens', 'label_7'),
    ('ProbEights', 'label_8'),
    ('ProbNines', 'label_9'),
]

# Rebuild df from scratch with predictions, per-class probabilities and the
# user label (-1 when unlabelled) in IsTraining.
df = pd.DataFrame()
df['Id'] = list(range(len(x_test)))
df['Labels'] = _labels
for column, label in prob_columns:
    df[column] = prob_by_label.get(label, no_prob)
df['IsTraining'] = df_train.Omit

# Step 5: Visualize With new Predictions (repeat after step 4)
* Improve the predictions by tagging more examples

In [55]:
# Record-oriented JSON dump of the prediction frame.
jsonstr = df.to_json(orient='records')

# Re-render the Facets page from the current frame, reusing the existing atlas.
class_names = ['label_' + str(digit) for digit in range(10)]
fc = Facets()
fc.create_classes(labels=class_names)
fc.define_atlas(df, sprite_width=28, sprite_height=28, atlas_url='atlas.jpg')
fc.render_html('testing.html')

# [Click to Label](testing.html)
* Sort and arrange data by label <img src="screenshots/arrange_data.png">
* Label Images By Clicking
* The sorting will help to find better examples and find where model is making mistakes
* Have 7-9 good examples for each digit
* Must have at least one good example for each digit for this notebook to work
