### Feature Extraction with a Pretrained Model

Implemented with GraphLab Create

In [None]:
WORKING_DIR = '/home/ubuntu/kaggle/NaiveBees'
import graphlab as gl
import os

In [2]:
train_dir = WORKING_DIR + '/train'
test_dir = WORKING_DIR + '/test'

In [3]:
train_sf = gl.image_analysis.load_images(train_dir, random_order=True)

In [4]:
train_sf['image'] = gl.image_analysis.resize(train_sf['image'], 256, 256)

In [5]:
train_sf['id'] = train_sf['path'].apply(lambda x: x.split('/')[-1])

In [6]:
train_sf['id'] = train_sf['id'].apply(lambda x: int(x.split('.')[-2]))

In [7]:
labels_sf = gl.SFrame.read_csv(WORKING_DIR + '/train.csv')

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[int,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [8]:
train = train_sf.join(labels_sf, on='id')

In [None]:
#mean_image = train['image'].mean()

In [9]:
deep_learning_model = gl.load_model(
    'http://s3.amazonaws.com/GraphLab-Datasets/deeplearning/imagenet_model_iter45')

In [10]:
train['deep_features'] = deep_learning_model.extract_features(train)

In [11]:
train['genus'] = train['genus'].apply(lambda x: int(x))

In [12]:
train_sf, val_sf = train.random_split(0.85)

In [13]:
deep_features_model = graphlab.logistic_classifier.create(train_sf, features=['deep_features'],target='genus',validation_set=None)

In [19]:
results = deep_features_model.evaluate(val_sf, metric='auto')

In [20]:
results

{'accuracy': 0.8626465661641541,
 'auc': 0.9234082024779705,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      1       |        0        |   24  |
 |      0       |        1        |   58  |
 |      0       |        0        |   71  |
 |      1       |        1        |  444  |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.9154639175257733,
 'log_loss': 0.2907253868067259,
 'precision': 0.8844621513944223,
 'recall': 0.9487179487179487,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+-----+-----+
 | threshold | fpr | tpr |  p  |  n  |
 +-----------+-----+-----+-----+-----+
 |    0.0    | 1.0 | 1.0 | 468 | 129 |
 |   1e-05   | 1.0 | 1.0 | 468 | 129 |
 |   2e-05   | 1.0 | 1.0 | 46

In [16]:
graphlab.canvas.set_target('ipynb')
deep_features_model.show(view='Evaluation')

ROC Curve

![ROC curve](https://s3-us-west-1.amazonaws.com/juandoso-naivebees1/line_False+Positive+Rate_True+Positive+Rate.png "ROC Curve")

In [None]:
test_sf = gl.image_analysis.load_images(test_dir, random_order=True)
test_sf['image'] = gl.image_analysis.resize(test_sf['image'], 256, 256)
test_sf['id'] = test_sf['path'].apply(lambda x: x.split('/')[-1])
test_sf['id'] = test_sf['id'].apply(lambda x: int(x.split('.')[-2]))

In [None]:
test_sf['deep_features'] = deep_learning_model.extract_features(test_sf)

In [None]:
deep_features_model = graphlab.logistic_classifier.create(train, features=['deep_features'],target='genus', validation_set=None)

In [None]:
predictions = deep_features_model.predict(test_sf,  output_type='probability')

In [None]:
predictions

In [None]:
test = test_sf.add_column(predictions, name='genus')

In [None]:
submission_format = gl.SFrame.read_csv(WORKING_DIR + '/SubmissionFormat.csv')

In [None]:
submission = submission_format['id'].join(test['id','genus'], on={'id':'id'}, how='inner')

In [None]:
submission

In [None]:
submission.export_csv(WORKING_DIR+'/submission_deepfeatures_logisticregression.csv')