In [1]:
%matplotlib inline
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC, SVC
from sklearn.grid_search import GridSearchCV
from sklearn.tree import tree

from sklearn.grid_search import ParameterGrid

In [2]:
def unpickle(file):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    file : str
        Path of the pickle file to read.

    Returns
    -------
    object
        The deserialised content of the file.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    """
    try:
        import cPickle as pickle  # Python 2: C implementation
    except ImportError:
        import pickle  # Python 3: cPickle no longer exists as a module
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even when load() raises.
    with open(file, 'rb') as fo:
        return pickle.load(fo)

In [3]:
# Load the training batch, the test batch and the label-name metadata.
# The paths are relative, so the three files must be in the working directory.
# NOTE(review): presumably the CIFAR-100 "python version" files - confirm.
train = unpickle('train')
test = unpickle('test')
meta = unpickle('meta')

In [4]:
# Show which fields the training batch and the metadata dictionary expose.
print(train.keys())
print(meta.keys())

['data', 'batch_label', 'fine_labels', 'coarse_labels', 'filenames']
['fine_label_names', 'coarse_label_names']


In [5]:
# Fine-grained class names first, then the names of the superclasses.
print('CLASSES')
print(meta['fine_label_names'])
print('SUPERCLASSES')
print(meta['coarse_label_names'])

CLASSES
['apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle', 'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel', 'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock', 'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur', 'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster', 'house', 'kangaroo', 'keyboard', 'lamp', 'lawn_mower', 'leopard', 'lion', 'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse', 'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear', 'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine', 'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose', 'sea', 'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table', 'tank', 'telephone', 'television', 'tiger', 'tractor', 'train', 'trout', 'tulip', 'turtle', 'wardrobe', 'whale', 'willo

In [6]:
# Summary of the full training batch: sample count, feature count and the
# number of distinct fine / coarse labels. Each line is "<caption> <value>".
print('%s %s' % ('Nombre de données: ', train['data'].shape[0]))
print('%s %s' % ('Taille d une donnée: ', train['data'].shape[1]))
print('%s %s' % ('Nombre de classes: ', len(set(train['fine_labels']))))
print('%s %s' % ('Nombre de super-classes: ', len(set(train['coarse_labels']))))

Nombre de données:  50000
Taille d une donnée:  3072
Nombre de classes:  100
Nombre de super-classes:  20


- aquatic mammals: dolphin, otter, seal, whale
- large carnivores: leopard, lion, tiger, wolf
- small mammals: hamster, mouse, rabbit, squirrel

In [21]:
# Fine-grained class names kept for each split. Each list takes one class from
# each of the three superclasses (aquatic mammals, large carnivores, small
# mammals), and the two lists share no class.
# Earlier, larger training selection (disabled):
#train_flabels_list = ['dolphin','otter','seal','leopard','lion','wolf']
train_flabels_list = ['seal','leopard','hamster']
test_flabels_list = ['whale','tiger','rabbit']

In [22]:
# Fine-label ids of both splits as NumPy arrays, so that they can be compared
# element-wise against a single label id.
train_flabels, test_flabels = (
    np.asarray(batch['fine_labels']) for batch in (train, test)
)

In [23]:
# Boolean mask over the training samples: True where the fine label is one of
# the classes named in train_flabels_list. The mask starts explicitly all-False
# (no comparison against a sentinel label value) and is OR-ed with one
# per-class comparison at a time.
train_id = np.zeros(train_flabels.shape, dtype=bool)
for lab in train_flabels_list:
    train_id |= train_flabels == meta['fine_label_names'].index(lab)

# Keep the selected rows; the target is the coarse (superclass) label.
X_train = train['data'][train_id]
Y_train = np.asarray(train['coarse_labels'])[train_id]

In [24]:
# Boolean mask over the test samples: True where the fine label is one of the
# classes named in test_flabels_list. The mask starts explicitly all-False
# (no comparison against a sentinel label value) and is OR-ed with one
# per-class comparison at a time.
test_id = np.zeros(test_flabels.shape, dtype=bool)
for lab in test_flabels_list:
    test_id |= test_flabels == meta['fine_label_names'].index(lab)

# Keep the selected rows; the target is the coarse (superclass) label.
X_test = test['data'][test_id]
Y_test = np.asarray(test['coarse_labels'])[test_id]

In [25]:
# Size of the selected training subset and number of distinct target labels.
print('%s %s' % ('Nombre de données d entrainement: ', X_train.shape[0]))
print('%s %s' % ('Taille d une donnée: ', X_train.shape[1]))
print('%s %s' % ('Nombre de labels: ', len(set(Y_train))))

Nombre de données d entrainement:  1500
Taille d une donnée:  3072
Nombre de labels:  3


In [26]:
# Size of the selected test subset and number of distinct target labels.
print('%s %s' % ('Nombre de données de test: ', X_test.shape[0]))
print('%s %s' % ('Taille d une donnée: ', X_test.shape[1]))
print('%s %s' % ('Nombre de labels: ', len(set(Y_test))))

Nombre de données de test:  300
Taille d une donnée:  3072
Nombre de labels:  3


In [27]:
# Hyper-parameter combinations tried for the decision tree.
param_grid = {'max_depth': [10, 100, 1000, 10000], 'max_features': [10, 100, 1000]}

# A fixed seed makes the search repeatable: with max_features set, the tree
# draws a random subset of features at every split.
clf = tree.DecisionTreeClassifier(random_state=0)

# Start below any attainable accuracy and bind best_grid up front, so the
# final report cannot fail with a NameError when every score is 0.
best_score = float('-inf')
best_grid = None
# NOTE(review): candidates are ranked by their score on X_test, so best_score
# is an optimistic estimate; consider selecting on a validation split instead.
for g in ParameterGrid(param_grid):
    clf.set_params(**g)
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    if score > best_score:
        best_score = score
        best_grid = g

print("score: %0.5f" % best_score)
print("Grid: %s" % (best_grid,))

score: 0.53000
Grid: {'max_features': 1000, 'max_depth': 10}


In [28]:
# Baseline: a decision tree with default hyper-parameters. fit() returns the
# estimator itself, so construction and training are chained.
clf = tree.DecisionTreeClassifier().fit(X_train, Y_train)
clf  # echo the fitted estimator as the cell output

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')

In [29]:
# Score of the default decision tree on the selected test subset
# (for scikit-learn classifiers, score() is the mean accuracy).
clf.score(X_test,Y_test)

0.49666666666666665

In [30]:
# Baseline: a linear SVM with default hyper-parameters. fit() returns the
# estimator itself, so construction and training are chained.
clf = LinearSVC().fit(X_train, Y_train)
clf  # echo the fitted estimator as the cell output

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [31]:
# Score of the default LinearSVC on the selected test subset
# (for scikit-learn classifiers, score() is the mean accuracy).
clf.score(X_test,Y_test)

0.5

In [32]:
# Regularisation strengths tried for the linear SVM.
param_grid = {'C': [0.1, 1, 10, 100, 1000, 10000]}

# A fixed seed makes the search repeatable: the underlying solver uses a
# random number generator while fitting.
clf = LinearSVC(random_state=0)

# Start below any attainable accuracy and bind best_grid up front, so the
# final report cannot fail with a NameError when every score is 0.
best_score = float('-inf')
best_grid = None
# NOTE(review): candidates are ranked by their score on X_test, so best_score
# is an optimistic estimate; consider selecting on a validation split instead.
for g in ParameterGrid(param_grid):
    clf.set_params(**g)
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    if score > best_score:
        best_score = score
        best_grid = g

print("score: %0.5f" % best_score)
print("Grid: %s" % (best_grid,))

score: 0.50667
Grid: {'C': 0.1}


In [18]:
# Baseline: a kernel SVM with default hyper-parameters. fit() returns the
# estimator itself, so construction and training are chained.
# NOTE(review): this cell's execution count (18) is lower than that of the
# cells that build X_train (23), so its recorded output may come from an
# earlier data selection - re-run the notebook top to bottom to confirm.
clf = SVC().fit(X_train, Y_train)
clf  # echo the fitted estimator as the cell output

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [19]:
# Score of the default SVC on the selected test subset
# (for scikit-learn classifiers, score() is the mean accuracy).
# NOTE(review): the recorded 0.33 is about chance level for three labels;
# possibly the RBF kernel suffers from the unscaled features - confirm.
clf.score(X_test,Y_test)

0.33000000000000002