In [59]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time, datetime
import sklearn, sklearn.metrics, sklearn.decomposition
import collections
import matplotlib.pyplot as plt
import seaborn
from collections import Counter

dev_batch_format = 'D:/Jupyter/DataSets/prv/mobnet_batch{0:03d}.npy'
dev_batches = range(101)


def read_batches(batches):
    res = []
    for x in batches:
        try:
            res.append(np.load(dev_batch_format.format(x)))
        except:
            print('failed load')
    return np.vstack(res)

def split_full(full):
    return full[:, 2:], full[:,0], full[:,1]


In [3]:
dev0_X, dev0_YM, dev0_YF = split_full(read_batches(dev_batches))
dev_X =dev0_X[~np.isnan(dev0_YF)]
dev_YM =dev0_YM[~np.isnan(dev0_YF)]
dev_YF =dev0_YF[~np.isnan(dev0_YF)]
rndval = np.random.uniform(size=len(dev_X))
rndcut = 0.9
train_X = dev_X[rndval < rndcut]
train_YF = dev_YF[rndval < rndcut]
train_YM = dev_YM[rndval < rndcut]
test_X = dev_X[rndval >= rndcut]
test_YF = dev_YF[rndval >= rndcut]
test_YM = dev_YM[rndval >= rndcut]

In [4]:
def build_fcnn_graph(input_shape, fc_arch, num_classes):
    tf.reset_default_graph()

    with tf.name_scope('Input'):
        tf_in_x = tf.placeholder(tf.float32, shape=(None, input_shape))
        tf_in_y = tf.placeholder(tf.int32, shape=(None,))

    tf_temp = tf_in_x
    
    with tf.name_scope('FC'):
        for sz in fc_arch:
            tf_temp = tf.layers.dense(tf_temp, sz, activation=tf.nn.elu)
            
        tf_final = tf.layers.dense(tf_temp, num_classes)
        tf_prob = tf.nn.softmax(tf_final)
        tf_predicted = tf.cast(tf.argmax(tf_prob, axis=1), dtype=tf.int32)

    with tf.name_scope('LOSS'):
        tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_final))
        tf_train = tf.train.AdamOptimizer(1e-3).minimize(tf_loss)
        
        tf_rocauc, tf_upd_rocuac = tf.metrics.auc(labels=tf_in_y, predictions=tf_prob[:,1], num_thresholds=10000)
        tf_gini = tf_rocauc * 2 - 1
        tf_accuracy, tf_upd_accuracy = tf.metrics.accuracy(labels=tf_in_y, predictions=tf_predicted)
        tf_update_metrics = tf.group(tf_upd_rocuac, tf_upd_accuracy)
        
        tfsummary_logloss = tf.summary.scalar('Log-Loss', tf_loss)
        tfsummary_gini = tf.summary.scalar('1-Gini', 1-tf_gini)
        tfsummary_accuracy = tf.summary.scalar('1-Accuracy', 1-tf_accuracy)
        tfsummary = tf.summary.merge([tfsummary_logloss, tfsummary_gini, tfsummary_accuracy])

    return {'in':{'data':tf_in_x, 'label':tf_in_y},
            'out':{'logit':tf_final, 'prob':tf_prob},
            'run':{'loss': tf_loss, 'upd_metrics':tf_update_metrics,
                   'gini':tf_gini, 'accuracy':tf_accuracy,
                   'train': tf_train, 'summary':tfsummary}}

In [31]:
graph_descr = build_fcnn_graph(1001, [100, 20], 2)
model_name = '25EasyFCNN'

tffw_graph = tf.summary.FileWriter('D:/Jupyter/Logs/Graph_{}'.format(model_name), tf.get_default_graph())
model_ckpt_name = '../Models/{0}/model'.format(model_name)+'-{:02d}.ckpt'

print('Graph created')

batch_steps = 1
batch_size  = 64
calc_batch_size = 2048

train_set = (train_X, train_YF.astype(np.int32))
valid_set = (test_X, test_YF.astype(np.int32))

set2dict = lambda x: {graph_descr['in']['data']: x[0], graph_descr['in']['label']: x[1]}

stat_set_train = (train_X[:len(test_YF)], train_YF[:len(test_YF)])
stat_set_valid = valid_set

stat_train_dict = set2dict(stat_set_train)
stat_valid_dict = set2dict(stat_set_valid)
print('Preparation complete')

Graph created
Preparation complete


In [32]:
num_epochs = 50

dt_now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
tffw_train = tf.summary.FileWriter('D:/Jupyter/Logs/Run_{0}-{1}-T'.format(model_name, dt_now), tf.get_default_graph())
tffw_valid = tf.summary.FileWriter('D:/Jupyter/Logs/Run_{0}-{1}-V'.format(model_name, dt_now), tf.get_default_graph())
tfsSaver = tf.train.Saver(max_to_keep=5)

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    tfs.run(tf.local_variables_initializer())
    
    for n in range(num_epochs):
        t0 = time.perf_counter()
        
        modutils.runEpoch(tfs, train_set, batch_size, set2dict, graph_descr['run']['train'],
                     op_loss=graph_descr['run']['loss'], verbatim=True)
        
        tfs.run(graph_descr['run']['upd_metrics'], stat_train_dict)
        train_stats = tfs.run([graph_descr['run']['loss'], graph_descr['run']['gini'],
                             graph_descr['run']['accuracy'], graph_descr['run']['summary']], stat_train_dict)
        tffw_train.add_summary(train_stats[-1], n)
        
        tfs.run(graph_descr['run']['upd_metrics'], stat_valid_dict)
        valid_stats = tfs.run([graph_descr['run']['loss'], graph_descr['run']['gini'],
                             graph_descr['run']['accuracy'], graph_descr['run']['summary']], stat_valid_dict)
        tffw_valid.add_summary(valid_stats[-1], n)
        
        t1 = time.perf_counter()
        
        p = tfsSaver.save(tfs, model_ckpt_name.format(n))
        print('Model saved at checkpoint: {0}'.format(p))        
        print('Epoch {0}: {1:.3f} in {2:.2f} sec, gini={3:.3f}, accur={4:.3f}'.format(n, valid_stats[0], t1-t0,
                                                                                      valid_stats[1], valid_stats[2]))
print('\nDone')



Model saved at checkpoint: ../Models/25EasyFCNN/model-00.ckpt
Epoch 0: 0.188 in 9.97 sec, gini=0.794, accur=0.929
Model saved at checkpoint: ../Models/25EasyFCNN/model-01.ckpt
Epoch 1: 0.168 in 10.31 sec, gini=0.806, accur=0.930
Model saved at checkpoint: ../Models/25EasyFCNN/model-02.ckpt
Epoch 2: 0.192 in 9.98 sec, gini=0.806, accur=0.928
Model saved at checkpoint: ../Models/25EasyFCNN/model-03.ckpt
Epoch 3: 0.164 in 9.69 sec, gini=0.817, accur=0.930


KeyboardInterrupt: 

In [39]:
tfsSaver = tf.train.Saver(max_to_keep=5)
with tf.Session() as tfs:
    tfsSaver.restore(tfs, '../Models/25EasyFCNN/model-03.ckpt')
    #runDataset(tfs, calc_set, batch_size, set2feeddict, ops):
    dev0_P0 = modutils.runDataset(tfs, (dev0_X,), 512,
                       lambda x: {graph_descr['in']['data']: x[0]},
                       graph_descr['out']['prob'])
    train_P0 = modutils.runDataset(tfs, (train_X,), 512,
                       lambda x: {graph_descr['in']['data']: x[0]},
                       graph_descr['out']['prob'])
    test_P0 = modutils.runDataset(tfs, (test_X,), 512,
                       lambda x: {graph_descr['in']['data']: x[0]},
                       graph_descr['out']['prob'])

INFO:tensorflow:Restoring parameters from ../Models/25EasyFCNN/model-03.ckpt


In [40]:
dev0_P = np.vstack([x[2] for x in dev0_P0])[:,1]
train_P = np.vstack([x[2] for x in train_P0])[:,1]
test_P = np.vstack([x[2] for x in test_P0])[:,1]

In [41]:
sklearn.metrics.roc_auc_score(train_YF, train_P)*2-1,\
sklearn.metrics.roc_auc_score(test_YF, test_P)*2-1

(0.87278352525191405, 0.83465372999176313)

In [42]:
src = pd.read_csv('D:/Jupyter/DataSets/prv/raw_image.csv', delimiter=';', encoding='cp1251', quotechar='"')

In [50]:
src['p_f'] = dev0_P
src['score_f'] = -np.log(dev0_P / (1-dev0_P))*36+533
src['score_f10'] = np.clip(np.floor(src.score_f / 10).astype(np.int32) * 10, 500, 800)
src['score_f50'] = np.clip(np.floor(src.score_f / 50).astype(np.int32) * 50, 500, 800)

In [51]:
src.groupby('score_f10').agg({'trg_f':['mean', 'sum', 'count'], 'trg_m':['mean', 'sum', 'count'], 'score_f':'count', 'p_f':'mean'})

Unnamed: 0_level_0,trg_f,trg_f,trg_f,trg_m,trg_m,trg_m,score_f,p_f
Unnamed: 0_level_1,mean,sum,count,mean,sum,count,count,mean
score_f10,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
500,0.772234,356.0,461,0.045553,21.0,461,569,0.734304
510,0.658462,214.0,325,0.058462,19.0,325,406,0.619134
520,0.590717,280.0,474,0.097046,46.0,474,603,0.553833
530,0.536585,352.0,656,0.176829,116.0,656,822,0.483675
540,0.449935,346.0,769,0.271782,209.0,769,958,0.416392
550,0.354802,314.0,885,0.344633,305.0,885,1099,0.351994
560,0.257692,268.0,1040,0.418269,435.0,1040,1305,0.291399
570,0.189939,219.0,1153,0.520382,600.0,1153,1436,0.237126
580,0.130579,158.0,1210,0.538843,652.0,1210,1538,0.190706
590,0.091829,127.0,1383,0.58496,809.0,1383,1714,0.151951


In [56]:
src[src.tags.map(lambda x: type(x) is not str) & (src.score_f < 440)].head()

Unnamed: 0,rid,rdt,req_rid,img_id,img_url,user_id,user_name,user_url,upload_dt,tags,views,rating,cmts,local_url,trg_m,trg_f,p_f,score_f,score_f10,score_f50
49866,49867,2018-03-28 23:46:59.120000000,76073,64227,http://porevo.win/pics.php?q=3vPD0ghVXK8%2FcxA...,30243,666999666,http://porevo.win/index.php?action=user&id=30243,2006-12-06,,914,94000000000000004,14,D:\Jupyter\Datasets\prv\u0030243_000064227.jpg,,,0.930485,439.609955,500,500


In [57]:
def gather_stat(df, cutoff=0.01):
    tmp = dict(Counter(sum(df.tags.map(lambda x: x.split(';') if type(x) is str else []).values.tolist(), [])))
    tsum = sum(tmp.values())
    return sorted({k:v/tsum for (k,v) in tmp.items() if v > cutoff*tsum}.items(), key = lambda x: x[1], reverse=True)

In [60]:
%%time
tmp = [(x, gather_stat(src[src.score_f50==x])) for x in range(500, 801, 50)]

Wall time: 5.15 s
