In [1]:
%matplotlib inline
from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab


In [2]:
dataDir='/home/kent/git/aiacademy-learning-notebook/Project/ImageCaption/data/coco'
dataType='train2014'
annFile='{}/annotations/instances_{}.json'.format(dataDir,dataType)

print(annFile)
!ls -alh $annFile

/home/kent/git/aiacademy-learning-notebook/Project/ImageCaption/data/coco/annotations/instances_train2014.json
-rw-rw-r-- 1 kent kent 318M Sep  1 18:56 /home/kent/git/aiacademy-learning-notebook/Project/ImageCaption/data/coco/annotations/instances_train2014.json


In [3]:
# initialize COCO api for instance annotations
coco=COCO(annFile)

loading annotations into memory...
Done (t=13.91s)
creating index...
index created!


In [4]:
# display COCO categories and supercategories
cats = coco.loadCats(coco.getCatIds())
nms=[cat['name'] for cat in cats]
print('COCO categories: \n{}\n'.format(' '.join(nms)))

nms = set([cat['supercategory'] for cat in cats])
print('COCO supercategories: \n{}'.format(' '.join(nms)))

COCO categories: 
person bicycle car motorcycle airplane bus train truck boat traffic light fire hydrant stop sign parking meter bench bird cat dog horse sheep cow elephant bear zebra giraffe backpack umbrella handbag tie suitcase frisbee skis snowboard sports ball kite baseball bat baseball glove skateboard surfboard tennis racket bottle wine glass cup fork knife spoon bowl banana apple sandwich orange broccoli carrot hot dog pizza donut cake chair couch potted plant bed dining table toilet tv laptop mouse remote keyboard cell phone microwave oven toaster sink refrigerator book clock vase scissors teddy bear hair drier toothbrush

COCO supercategories: 
person electronic accessory animal food furniture sports indoor outdoor vehicle kitchen appliance


In [5]:
coco.dataset['info']

{'contributor': 'COCO Consortium',
 'date_created': '2017/09/01',
 'description': 'COCO 2014 Dataset',
 'url': 'http://cocodataset.org',
 'version': '1.0',
 'year': 2014}

In [6]:
coco.dataset.keys()

dict_keys(['categories', 'licenses', 'annotations', 'images', 'info'])

In [7]:
# initialize COCO api for caption annotations
annFile = '{}/annotations/captions_{}.json'.format(dataDir,dataType)
coco_caps=COCO(annFile)

loading annotations into memory...
Done (t=0.99s)
creating index...
index created!


# 載入原始資料

In [8]:
train_set =[]
for img in coco.dataset['images']:
#     print(img)
#     print(img['file_name'])
    
    imgId = img['id']
    
    annIds = coco_caps.getAnnIds(imgIds=imgId)
    anns = coco_caps.loadAnns(annIds)
    
    anns_list = []
    for aann in anns:
#         print(aann['caption'])
        anns_list.append(aann['caption'])
        
    train_set.append((img['file_name'], anns_list))

len(train_set)

82783

# 進行翻譯

In [9]:
import google.auth

credentials, project = google.auth.default()

In [10]:
# Imports the Google Cloud client library
from google.cloud import translate


translate_client = translate.Client(credentials=credentials)

def translate(query):
    target='zh_TW'
    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(
        query,
        target_language=target)

#     print(u'Text: {}'.format(result['input']))
#     print(u'Translation: {}'.format(result['translatedText']))
#     print(u'Detected source language: {}'.format(
#         result['detectedSourceLanguage']))

    return result['translatedText']
translate("good job")

'做得好'

In [11]:
import time
import random
def doTranslate(x):
    retry = 1
    while True:
        try:
            fname,slist = x
            smerge = " @@ ".join(slist)
        #     print(smerge)
            translation = translate_client.translate(
                smerge,
                target_language= 'zh_tw')    
            tslist = translation['translatedText'].split("@@")
            time.sleep(1)
            return (fname,tslist)
        except:
            time.sleep(random.randint(20,30*retry))
            slack_log("retry:{}"%(retry))
            print('*',end='')
            retry += 1
        

In [None]:
from slacker import Slacker
import os
slack = Slacker(os.environ['SLACK_TOKEN'])
def slack_log(message):
    try:
        slack.chat.post_message('#log-traininglog', "Translate {}".format(message))
    except:
        print("slack connection error")

slack_log('Initial Logger')   

In [None]:
import time 
from joblib import Parallel, delayed 
# List of arguments to pass to work():
# Anything returned by work() can be stored:
results = Parallel(n_jobs=3, verbose=2, backend="threading")(map(delayed(doTranslate), train_set))

[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   12.2s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   52.8s
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed:  3.6min
[Parallel(n_jobs=3)]: Done 1007 tasks      | elapsed:  5.7min
[Parallel(n_jobs=3)]: Done 1452 tasks      | elapsed:  8.2min
[Parallel(n_jobs=3)]: Done 1979 tasks      | elapsed: 11.2min
[Parallel(n_jobs=3)]: Done 2586 tasks      | elapsed: 14.6min
[Parallel(n_jobs=3)]: Done 3275 tasks      | elapsed: 18.5min
[Parallel(n_jobs=3)]: Done 4044 tasks      | elapsed: 22.8min
[Parallel(n_jobs=3)]: Done 4895 tasks      | elapsed: 27.6min
[Parallel(n_jobs=3)]: Done 5826 tasks      | elapsed: 35.7min
[Parallel(n_jobs=3)]: Done 6839 tasks      | elapsed: 44.8min
[Parallel(n_jobs=3)]: Done 7932 tasks      | elapsed: 54.5min
[Parallel(n_jobs=3)]: Done 9107 tasks      | elapsed: 65.1min
[Parallel(n_jobs=3)]: Done 10362 tasks      | elapsed: 76.3min
[Parallel(n

In [None]:
results

In [18]:
import pickle
pickle.dump(results,open("./data/coco/chinease_caption.pk",'wb'))

In [21]:
results[600]

('COCO_train2014_000000017153.jpg',
 ['沙灘上的一張毯子上放著兩個比薩餅，蔬菜和蘸醬，以及其他食物@ @野餐午餐和比薩餅和餃子坐在被子上。 ',
  '兩個比薩餅坐在蔬菜旁邊，沾上甜點。 ',
  '各種各樣的食物，包括毯子上的披薩和餃子。 ',
  '一箱兩個比薩餅和一些某種包裝好的蔬菜。'])

In [16]:
!ls

~						       DoPrediction.ipynb
BuildModel.ipynb				       image_list.json
caption_indexed.pk				       MonitorGPU.ipynb
CoCoDatasetPrepare-MakeCapationFeature-Chinease.ipynb  mytokenizer.pk
CoCoDatasetPrepare-MakeCapationFeature.ipynb	       OpenImage.ipynb
CoCoDatasetPrepare-MakeImageFeature.ipynb	       pycocoDemo.ipynb
coco_model_ep_1.h5				       SimpleTrial
coco_model_ep_20.h5				       tokenizer.pk
coco_model_ep_7.h5				       train.json
coco_model_ep_9.h5				       vgg16_feature.hdf5
data


# Word2Vec Model Download

http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/