Deep Learning
=============

Assignment 1
------------

The objective of this assignment is to learn about simple data curation practices, and familiarize you with some of the data we'll be reusing later.

This notebook uses the [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) dataset, which is intended for use in Python experiments. This dataset is designed to look like the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, while looking a little more like real data: it's a harder task, and the data is a lot less 'clean' than MNIST.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.

from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from PIL import Image
from sklearn import cross_validation
# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

# Root names of the training and test data (archive name or extracted folder).
train_filename, test_filename = 'train', 'test'

# Shape of every thumbnail stored in the dataset tensors: 78 rows x 128 columns.
image_size_h, image_size_w = 78, 128
# Bounding box handed to Image.thumbnail(); the aspect ratio is preserved, so
# the resulting thumbnail may be shorter than 128 pixels on one side.
size = (128, 128)
# Number of levels per pixel; used to scale pixel values.
pixel_depth = 255.0



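First, we'll download the dataset to our local machine. The data consists of characters rendered in a variety of fonts on a 28x28 image. The labels are limited to 'A' through 'J' (10 classes). The training set has about 500k and the test set 19000 labelled examples. Given these sizes, it should be possible to train models quickly on any machine.

**Note:** the description above is carried over from the original notMNIST assignment. The code in this notebook has been adapted to a different image dataset: it expects 8 class folders under `train` (ALB, BET, DOL, LAG, NoF, OTHER, SHARK, YFT) and converts each image to a 78x128 grayscale thumbnail.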

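Extract the dataset from the compressed .tar.gz file (extraction is skipped when the extracted folder already exists).
This should give you a set of directories, one per class (labelled A through J in the original notMNIST data; the code below expects `num_classes` = 8 folders).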

In [2]:
# Number of class folders expected under the extracted training root;
# maybe_extract() raises if it finds a different count.
num_classes = 8
# Seed NumPy's global random number generator.
np.random.seed(133)

def maybe_extract(filename, force=False):
  """Extract the archive `filename` if needed and list its class folders.

  The extraction root is `filename` with up to two extensions stripped
  (e.g. 'x.tar.gz' -> 'x'). If that directory already exists and `force`
  is false, extraction is skipped.

  Args:
    filename: Path of the archive (or of the already-extracted folder).
    force: When true, extract even if the root directory already exists.

  Returns:
    Sorted list of the sub-directories of the root, one per class.

  Raises:
    Exception: if the number of sub-directories differs from `num_classes`.
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    # Flush so the progress message is visible before the slow extraction.
    sys.stdout.flush()
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use it on archives from a trusted source.
    with tarfile.open(filename) as tar:
      tar.extractall()
  # Keep directories only; stray files next to the class folders are ignored.
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders
  
# Resolve the per-class training folders (extracting the archive if needed).
train_folders = maybe_extract(train_filename)
# The test images live in a single folder. Build its path with os.path.join
# instead of a hard-coded backslash so the notebook is not Windows-only.
test_folders = [os.path.join(test_filename, 'test_stg1')]

train already present - Skipping extraction of train.
['train\\ALB', 'train\\BET', 'train\\DOL', 'train\\LAG', 'train\\NoF', 'train\\OTHER', 'train\\SHARK', 'train\\YFT']


In [3]:
import glob, os

def load_letter(folder, min_num_images):
  """Load every image of one class folder into a normalized tensor.

  Each image is centered on a 1300x800 canvas, shrunk with
  Image.thumbnail(size, ...), saved next to the original as
  '<name>.thumbnail' (JPEG), then read back as grayscale and scaled to
  (pixel - pixel_depth / 2) / pixel_depth.

  Args:
    folder: Directory containing the images of a single class.
    min_num_images: Minimum number of successfully loaded images required.

  Returns:
    (dataset, file_names): `dataset` is a float32 array of shape
    (num_loaded, image_size_h, image_size_w); `file_names[i]` is the file
    name of the image stored in `dataset[i]`.

  Raises:
    Exception: if a thumbnail does not have shape
      (image_size_h, image_size_w), or if fewer than `min_num_images`
      images could be loaded.
  """
  # Remove thumbnails left behind by a previous run so they are not picked
  # up as source images. os.path.join keeps the pattern portable; the old
  # hard-coded "\*" separator only matched on Windows.
  for stale in glob.glob(os.path.join(folder, "*.thumbnail")):
    os.remove(stale)

  image_files = os.listdir(folder)
  print(len(image_files))
  dataset = np.ndarray(shape=(len(image_files), image_size_h, image_size_w),
                         dtype=np.float32)
  print(folder)
  num_images = 0
  file_names=[]
  for image in image_files:
    image_file = os.path.join(folder, image)
    outfile = image_file + ".thumbnail"
    # `Image` must be PIL's Image here (the notebook also imports an `Image`
    # from IPython.display; the later PIL import takes precedence).
    im = Image.open(image_file)

    # Center the image on a 1300x800 canvas: a negative crop origin extends
    # the image, a positive one trims it.
    # NOTE(review): under Python 3 `/` yields fractional paddings for odd
    # image dimensions - confirm the installed Pillow handles float boxes.
    horizontal_padding = (1300 - im.size[0]) / 2
    vertical_padding = (800 - im.size[1]) / 2
    img1 = im.crop(
        (
            -horizontal_padding,
            -vertical_padding,
            im.size[0] + horizontal_padding,
            im.size[1] + vertical_padding
        )
    )
    # NOTE(review): Image.ANTIALIAS and scipy.ndimage.imread have been
    # removed from recent Pillow / SciPy releases - confirm the pinned
    # versions before upgrading.
    img1.thumbnail(size, Image.ANTIALIAS)
    img1.save(outfile, "JPEG")
    try:
      image_data = (ndimage.imread(outfile,flatten=True).astype(float) - 
                    pixel_depth / 2) / pixel_depth
      if image_data.shape != (image_size_h, image_size_w):
        raise Exception('Unexpected image shape: %s' % str(image_data.shape))
      dataset[num_images, :, :] = image_data
      num_images = num_images + 1
      # Record the name only for images that were stored, so file_names
      # stays aligned with the rows of `dataset` when an image is skipped.
      file_names.append(image)
    except IOError as e:
      print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

  # Drop the unused tail rows reserved for images that were skipped.
  dataset = dataset[0:num_images, :, :]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (num_images, min_num_images))
    
  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  print(dataset.shape)
  return dataset, file_names
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
  """Pickle the image tensor of each folder, reusing existing pickles.

  For every folder the target file is '<folder>.pickle'. An existing target
  is reused unless `force` is true; otherwise the folder is loaded with
  load_letter() and the resulting tensor is dumped to the target.

  Args:
    data_folders: Iterable of folder paths, one per class.
    min_num_images_per_class: Forwarded to load_letter() as the minimum
      number of images each folder must yield.
    force: When true, rebuild pickles even if they already exist.

  Returns:
    (dataset_names, image_names): the pickle path of every folder, and the
    image file names returned by load_letter(). NOTE: folders whose pickle
    was reused contribute no entries to `image_names`.
  """
  dataset_names, image_names = [], []
  print(data_folders)
  for folder in data_folders:
    target = folder + '.pickle'
    dataset_names.append(target)

    if not force and os.path.exists(target):
      # You may override by setting force=True.
      print('%s already present - Skipping pickling.' % target)
      continue

    print('Pickling %s.' % target)
    dataset, file_names = load_letter(folder, min_num_images_per_class)
    image_names += file_names
    try:
      with open(target, 'wb') as out:
        pickle.dump(dataset, out, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
      # A failed save is reported but does not abort the remaining folders.
      print('Unable to save data to', target, ':', e)

  return dataset_names, image_names

# Build (or reuse) one pickle per folder. The thresholds are the minimum
# number of images each folder must yield before load_letter() raises.
train_datasets, train_images = maybe_pickle(
    train_folders, min_num_images_per_class=67)
test_datasets, test_images = maybe_pickle(
    test_folders, min_num_images_per_class=353)

['train\\ALB', 'train\\BET', 'train\\DOL', 'train\\LAG', 'train\\NoF', 'train\\OTHER', 'train\\SHARK', 'train\\YFT']
Pickling train\ALB.pickle.
1719
train\ALB
Full dataset tensor: (1719, 78, 128)
Mean: -0.136433
Standard deviation: 0.230907
(1719, 78, 128)
Pickling train\BET.pickle.
200
train\BET
Full dataset tensor: (200, 78, 128)
Mean: -0.129147
Standard deviation: 0.233692
(200, 78, 128)
Pickling train\DOL.pickle.
117
train\DOL
Full dataset tensor: (117, 78, 128)
Mean: -0.143655
Standard deviation: 0.243055
(117, 78, 128)
Pickling train\LAG.pickle.
67
train\LAG
Full dataset tensor: (67, 78, 128)
Mean: -0.123839
Standard deviation: 0.222084
(67, 78, 128)
Pickling train\NoF.pickle.
465
train\NoF
Full dataset tensor: (465, 78, 128)
Mean: -0.126385
Standard deviation: 0.226744
(465, 78, 128)
Pickling train\OTHER.pickle.
299
train\OTHER
Full dataset tensor: (299, 78, 128)
Mean: -0.148818
Standard deviation: 0.215283
(299, 78, 128)
Pickling train\SHARK.pickle.
176
train\SHARK
Full dataset

In [4]:
# Sanity check: show the pickle path(s) produced for the test folder(s).
print(test_datasets)

['test\\test_stg1.pickle']


In [5]:
def merge_datasets(pickle_files):
  """Stack the per-class pickles into one dataset with integer-valued labels.

  Each pickle is expected to hold an array of shape
  (n, image_size_h, image_size_w). The label of every row is the position
  of its pickle file in `pickle_files`.

  Args:
    pickle_files: Sequence of pickle paths, one per class, in label order.

  Returns:
    (dataset_arr, dataset_label, fish_names):
      dataset_arr: float64 array of shape (total, image_size_h, image_size_w).
      dataset_label: 1-D float64 array with one label per row.
      fish_names: the pickle paths that were merged, in order.

  Raises:
    Re-raises any error hit while reading or validating a pickle, after
    printing which file failed.
  """
  # Collect the pieces and concatenate once at the end; calling np.append
  # inside the loop re-copies everything accumulated so far on every
  # iteration. The float64 seeds keep the result dtypes (and the empty-input
  # result) the same as with repeated np.append.
  arrays = [np.empty(shape=(0, image_size_h, image_size_w))]
  labels = [np.array([])]
  fish_names = []
  for label, pickle_file in enumerate(pickle_files):
    try:
      # NOTE(review): pickle.load can execute arbitrary code; only load
      # pickle files produced by this notebook / a trusted source.
      with open(pickle_file, 'rb') as f:
        fish_set = pickle.load(f)
      print(fish_set.shape)
      # Validate here so a bad file is reported by name before re-raising.
      if fish_set.shape[1:] != (image_size_h, image_size_w):
        raise ValueError('Unexpected dataset shape: %s' % str(fish_set.shape))
      arrays.append(fish_set)
      labels.append(np.full(fish_set.shape[0], label))
      fish_names.append(pickle_file)
    except Exception as e:
      print('Unable to process data from', pickle_file, ':', e)
      raise

  dataset_arr = np.concatenate(arrays, axis=0)
  dataset_label = np.concatenate(labels)
  return dataset_arr, dataset_label, fish_names
            
            
# Merge the per-class pickles into single arrays. Labels are the position of
# each pickle in the list, so for the single test pickle they are all 0.
train_dataset, train_labels, fish_names_train = merge_datasets(train_datasets)
test_dataset, test_labels, fish_names_test = merge_datasets(test_datasets)
# Disabled debugging aids. NOTE(review): valid_dataset / valid_labels are not
# defined anywhere in the visible notebook.
#test_dataset_flat=test_dataset.reshape(test_dataset.shape[0],78*128)
#print('Training:', train_dataset.shape, train_labels.shape)
#print('Validation:', valid_dataset.shape, valid_labels.shape)
#print('Testing:', test_dataset.shape, test_labels.shape)

(1719, 78, 128)
(200, 78, 128)




(117, 78, 128)
(67, 78, 128)
(465, 78, 128)




(299, 78, 128)
(176, 78, 128)




(734, 78, 128)
(1000, 78, 128)




In [6]:
from sklearn.model_selection import StratifiedShuffleSplit
#train_dataset_flat=train_dataset.reshape(train_dataset.shape[0],78*128)
# Hold out 20% of the training data for validation with a single stratified
# shuffle split; random_state is fixed for reproducibility.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
# n_splits=1 yields exactly one (train, test) index pair.
train_index, test_index = next(sss.split(train_dataset, train_labels))
print("TRAIN:", train_index, "TEST:", test_index)
X_train = train_dataset[train_index,:,:]
X_val = train_dataset[test_index,:,:]
y_train = train_labels[train_index]
y_val = train_labels[test_index]

TRAIN: [ 856 1300 3351 ..., 2919 2886  669] TEST: [2867 1636 1020 1331 3471  909  952 1621 2032  180 3379 2289  684  859 2870
 3392 3434 3457  972 1905  387 3174 2708 1539 3362 2746 2554 1447 2159 2022
 3202 1371  537 3070  488 1241 1138 3198  139  388  123 2728 3700 1778   36
  637  423 3199 2649 1152 1947 2977 3093 2448 1221 3492 3002  978 1757  323
  274 1198 2097 3234 2771 2711  770  121 2740 2262 3282 3247 2446  498  510
 1808  290 1094 2098 3360 1868 2978 3646  324 2134 1584 3042 2308 3558 1509
 3652 1828 3079 2325  291 1641  872 3030 1640 2899 3107 1589 2568 2612  701
 3241 2181 3427 3513 3691 1204 3353 1040 1162 2674 3048 3607 3259 1979 1272
  774 1843 1675 1441 2104 3150 1643 3405 1563  756 1496 3519 1873 1776  868
 3140   84 1382 2961 3254  730 1466 1383  600 1946 2323 2223  146 2654 1033
  894 3220  763 3686  256 2193 3303 3515 1922 1278 3411 1782 1653 3058 1925
 2146  556 2545 1791 1395 3018 1216 3277 1570  791 3243 1400  705 2965 1455
 2044  559 1284 2504 2421  860 2950  4

In [7]:
# Shapes of the train / validation / test arrays, one per line.
print(X_train.shape,
      X_val.shape,
      test_dataset.shape,
      y_train.shape,
      y_val.shape,
      sep='\n')

(3021, 78, 128)
(756, 78, 128)
(1000, 78, 128)
(3021,)
(756,)


In [8]:
# Persist the train / validation split and the test data in a single pickle.
pickle_file = 'thumbnail_78_128_12Dec.pickle'

try:
  # Build the payload before opening the file so that a failure here does
  # not leave a truncated pickle behind.
  save = {
    'train_datasets': X_train,
    'train_labels': y_train,
    'val_datasets': X_val,
    'val_labels': y_val,
    'test_datasets' : test_dataset,
    'test_images' : test_images,
    'fish_names_train' : fish_names_train,
    }
  # The context manager closes the file even if pickle.dump raises.
  with open(pickle_file, 'wb') as f:
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise