# Photo to Mood

画像データについて、GracenoteのどのMoodに相当するかを判断するModelを作成します。


In [1]:
# Notebook magic so that plots are displayed inline within the document
%matplotlib inline

# Enable the autoreload extension (mode 2)
%load_ext autoreload
%autoreload 2

# Make the project's local packages importable.
# `current_path` is the kernel's working directory; the data-loading cell
# below also builds its file path from it.
import sys
import os
current_path = os.getcwd()
sys.path.append(os.path.join(current_path, "../../"))  # two levels up: the project root


## Load the Data

image_url、moodを左端に設定したファイルから学習データを読み込みます。
なお、今回の値はRekognitionのスコアであり、全項目が同じ範囲の値であるため、正規化は行いません。

In [2]:
import os
import numpy as np

# Training file: a whitespace-separated table whose first line names the columns.
ignore_column = 1  # count of leading columns that are never parsed
data_file = os.path.join(current_path, "./data/training_data.txt")
header, data = [], None

with open(data_file, "rb") as fh:
    # Consume the header line first so that genfromtxt only sees the data rows.
    header = fh.readline().decode("utf-8").split()
    data = np.genfromtxt(
        fh,
        usecols=range(ignore_column, len(header)),
        invalid_raise=False,  # rows with the wrong column count are skipped with a warning
    )

# Of the parsed columns, the first one is the target and the rest are features.
y, X = data[:, 0], data[:, 1:]
header = header[ignore_column + 1:]  # drop the ignored column(s) and the target column name

print(header, X.shape, y.shape, sep="\n")


['alcohol', 'aluminium', 'animal', 'apartment_building', 'art', 'auto', 'baby', 'ball', 'balloon', 'banister', 'beach', 'bean_sprout', 'bed', 'bedroom', 'beverage', 'bicycle', 'bike', 'binder', 'bird_feeder', 'black_hair', 'blanket', 'blossom', 'bobsled', 'bonfire', 'bowl', 'building', 'bulb', 'bulldog', 'bus', 'cable_car', 'campfire', 'camping', 'canopy', 'car', 'car_seat', 'cardboard', 'cash_machine', 'child', 'citrus', 'clothing', 'cloud', 'coast', 'coast_guard', 'coat_rack', 'coca', 'cockpit', 'coconut', 'coffee_cup', 'coffee_table', 'collage', 'collection', 'concrete', 'condo', 'cone', 'coral_reef', 'corner', 'crash_helmet', 'crowd', 'cumulus', 'cup', 'curtain', 'daisy', 'dance', 'dark_hair', 'deck', 'desk', 'diary', 'diner', 'dinner', 'ditch', 'dog', 'drain', 'drink', 'drinking', 'eating', 'emblem', 'enclosure', 'face', 'factory', 'female', 'field', 'figurine', 'fire', 'firework', 'fitness', 'flare', 'flower', 'food', 'football', 'footwear', 'forest', 'fountain', 'freeway', 'frui

    Line #8 (got 1 columns instead of 192)
    Line #14 (got 1 columns instead of 192)


## Create the Model

今回扱うのは画像の分類問題になります。そこで、分類問題でよく使われるSupport Vector Machineを利用します。  
特徴量の数が多いため、有効なものに限って使用します。

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

feature_count = 20  # number of top-scoring features to keep


def get_headers(s):
    """Return the feature names whose entry in the mask ``s`` is truthy.

    ``s`` is indexed by position for every name in the module-level ``header``
    list, so it must provide at least ``len(header)`` entries.
    """
    return [name for position, name in enumerate(header) if s[position]]


# Score every feature against the labels (f_classif) and keep the
# `feature_count` best ones.
# NOTE(review): the selector is fitted on all of X / y. If a held-out test set
# is split off from the same data afterwards, its labels have already
# influenced which features were chosen - confirm this is acceptable.
selector = SelectKBest(f_classif, k=feature_count).fit(X, y)
selected = selector.get_support()  # mask over the columns of X

# Pair each kept feature name with its score, highest score first.
kbests = sorted(
    zip(get_headers(selected), selector.scores_[selected]),
    key=lambda name_score: name_score[1],
    reverse=True,
)
print(kbests)

[('plant', 8.935300536178568), ('flower', 6.0384016976939439), ('blossom', 2.5354977336732385), ('tree', 2.4851731841401596), ('coat_rack', 2.4139941690962101), ('cone', 2.4139941690962101), ('fountain', 2.4139941690962101), ('lighting', 2.4139941690962101), ('sky', 2.4139941690962101), ('sea', 2.4139941690962097), ('sunset', 2.4139941690962097), ('bulb', 2.4139941690962092), ('food', 1.8761871130417345), ('table', 1.8658252169069671), ('night', 1.771587036848467), ('vehicle', 1.6630071299478522), ('outdoors', 1.4693796073922025), ('ball', 1.4396028681742967), ('balloon', 1.4396028681742965), ('hang_out', 1.4318165684838651)]


## Training the Model

データとモデルがそろったため、学習させてみます。  
パラメーターはGrid Searchで探索します。

In [12]:
# scikit-learn 0.18 introduced ``sklearn.model_selection`` and 0.20 removed the
# older ``cross_validation`` / ``grid_search`` modules; support both layouts.
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
# The bundled joblib was removed in scikit-learn 0.23; keep using it where it
# still exists and fall back to the standalone package otherwise.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
from sklearn import svm

# Keep only the columns chosen by the feature selector.
X_c = X[:, selected]

# Hold out 25% of the samples for the final report; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X_c, y, test_size=0.25, random_state=42)

candidates = [{'kernel': ['linear'], 'C': [1, 10, 100]}]

# The mood target in this notebook has more than two classes, so the averaging
# mode has to be explicit: plain "f1" is the binary scorer, which older releases
# silently replaced by weighted averaging (with a warning) and newer releases reject.
clf = GridSearchCV(svm.SVC(C=1), candidates, cv=2, scoring="f1_weighted")
clf.fit(x_train, y_train)

# Gather (params, mean score, std over folds) for every candidate.
# ``cv_results_`` replaced ``grid_scores_`` (removed in scikit-learn 0.20).
if hasattr(clf, "cv_results_"):
    cv_results = clf.cv_results_
    summary = zip(cv_results["params"], cv_results["mean_test_score"], cv_results["std_test_score"])
else:
    summary = ((params, mean_score, scores.std()) for params, mean_score, scores in clf.grid_scores_)

# Best candidate first.
# NOTE(review): the printed "+/-" value is half a standard deviation, as in the
# original cell - confirm that this is the intended spread.
for params, mean_score, std_score in sorted(summary, key=lambda row: row[1], reverse=True):
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, std_score / 2, params))

# Names of the selected features, in the column order of X_c.
columns = get_headers(selected)
model = clf.best_estimator_

# Evaluate the best estimator on the held-out samples.
y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))


0.255 (+/-0.013) for {'kernel': 'linear', 'C': 100}
0.222 (+/-0.011) for {'kernel': 'linear', 'C': 10}
0.140 (+/-0.006) for {'kernel': 'linear', 'C': 1}
             precision    recall  f1-score   support

        0.0       0.00      0.00      0.00         5
        1.0       0.26      1.00      0.41         6
        2.0       0.00      0.00      0.00         3
        3.0       0.00      0.00      0.00         3
        4.0       0.00      0.00      0.00         3
        5.0       0.00      0.00      0.00         1
        6.0       0.00      0.00      0.00         3
        7.0       0.00      0.00      0.00         1

avg / total       0.06      0.24      0.10        25



  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)
  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)
  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)
  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)
  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)
  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Store the Model

最後に、学習させたモデルを保存します。アプリケーション側で、その結果を確認してみてください。

In [14]:
# ``sklearn.externals.joblib`` was removed in scikit-learn 0.23. Keep using the
# bundled copy where it still exists (so the files written stay the same on
# those installs) and fall back to the standalone ``joblib`` package otherwise.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Feature names, in column order, that the saved model was trained on.
print(columns)
# Persist the trained estimator; the list of written file names is the cell's output.
joblib.dump(model, "./machine.pkl")


['ball', 'balloon', 'blossom', 'bulb', 'coat_rack', 'cone', 'flower', 'food', 'fountain', 'hang_out', 'lighting', 'night', 'outdoors', 'plant', 'sea', 'sky', 'sunset', 'table', 'tree', 'vehicle']


['./machine.pkl',
 './machine.pkl_01.npy',
 './machine.pkl_02.npy',
 './machine.pkl_03.npy',
 './machine.pkl_04.npy',
 './machine.pkl_05.npy',
 './machine.pkl_06.npy',
 './machine.pkl_07.npy',
 './machine.pkl_08.npy',
 './machine.pkl_09.npy',
 './machine.pkl_10.npy',
 './machine.pkl_11.npy']