In [1]:
import os
import math
import numpy as np
import tensorflow as tf
from PIL import Image
import time

# Per-channel means that must be subtracted from every input image; according to
# the original note they are the values hard-coded in the reference VGG16 code.
# The list is indexed in B, G, R order: build() subtracts VGG_MEAN[0] from the
# blue channel, VGG_MEAN[1] from green and VGG_MEAN[2] from red.
VGG_MEAN = [103.939, 116.779, 123.68]

class VGGNet:
    """VGG-16 graph builder that reuses pre-trained parameters.

    Nothing is trained inside this class: every layer reads its parameters
    from ``data_dict`` and wraps them in ``tf.constant``.
    """

    def __init__(self, data_dict):
        """Keep the pre-trained parameters for the layer builders.

        ``data_dict[name][0]`` is read as the weights of layer ``name`` and
        ``data_dict[name][1]`` as its bias.
        """
        self.data_dict = data_dict

    def get_conv_filter(self, name):
        """Return the kernel of convolution layer ``name`` (e.g. 'conv1_2')."""
        kernel = self.data_dict[name][0]
        return tf.constant(kernel, name='conv')

    def get_fc_weight(self, name):
        """Return the weight matrix of fully-connected layer ``name``."""
        weights = self.data_dict[name][0]
        return tf.constant(weights, name='fc')

    def get_bias(self, name):
        """Return the bias of layer ``name``."""
        bias = self.data_dict[name][1]
        return tf.constant(bias, name='bias')

    def conv_layer(self, x, name):
        """Build convolution + bias + ReLU for layer ``name`` on input ``x``."""
        with tf.name_scope(name):
            kernel = self.get_conv_filter(name)
            bias = self.get_bias(name)
            # tf.nn.conv2d is the low-level API: the second argument is the
            # kernel itself and the third gives the stride of every dimension.
            convolved = tf.nn.conv2d(x, kernel, [1, 1, 1, 1], padding='SAME')
            biased = tf.nn.bias_add(convolved, bias)
            return tf.nn.relu(biased)

    def pooling_layer(self, x, name):
        """Build a 2x2 max-pooling op with stride 2 called ``name``."""
        return tf.nn.max_pool(
            x,
            ksize=[1, 2, 2, 1],
            strides=[1, 2, 2, 1],
            padding='SAME',
            name=name,
        )

    def fc_layer(self, x, name, activation=tf.nn.relu):
        """Build fully-connected layer ``name``.

        Computes ``x * w + b`` and passes it through ``activation``; when
        ``activation`` is None the raw affine output is returned.
        """
        with tf.name_scope(name):
            weights = self.get_fc_weight(name)
            bias = self.get_bias(name)
            affine = tf.nn.bias_add(tf.matmul(x, weights), bias)
            if activation is not None:
                return activation(affine)
            return affine

    def flatten_layer(self, x, name):
        """Collapse every non-batch dimension of ``x`` into a single one."""
        with tf.name_scope(name):
            # Multiply together all static dimensions after the batch axis.
            flat_size = 1
            for extent in x.get_shape().as_list()[1:]:
                flat_size *= extent
            # -1 lets TensorFlow fill in the batch size.
            return tf.reshape(x, [-1, flat_size])

    def build(self, x_rgb):
        """Build the VGG16 network structure.

        Parameters:
        - x_rgb: RGB image tensor whose trailing dimensions are
          [224, 224, 3] (checked by the assertion below).

        Every intermediate tensor is stored on the instance: conv1_1 ...
        conv5_3, pool1 ... pool5, flatten5, fc6, fc7, fc8 and prob.
        """
        started = time.time()
        print('building model ...')

        # The pre-trained model expects mean-subtracted input in B, G, R
        # order: split the channels, subtract each mean, re-join on axis 3.
        red, green, blue = tf.split(x_rgb, [1, 1, 1], axis=3)
        centered = [
            channel - mean
            for channel, mean in zip((blue, green, red), VGG_MEAN)
        ]
        x_bgr = tf.concat(centered, axis=3)

        # Guard against inputs of the wrong size before building the stack.
        assert x_bgr.get_shape().as_list()[1:] == [224, 224, 3]

        # Five stages of 2, 2, 3, 3 and 3 convolutions, each closed by a pool.
        net = x_bgr
        for stage, conv_count in enumerate((2, 2, 3, 3, 3), start=1):
            for index in range(1, conv_count + 1):
                conv_name = 'conv%d_%d' % (stage, index)
                net = self.conv_layer(net, conv_name)
                setattr(self, conv_name, net)
            pool_name = 'pool%d' % stage
            net = self.pooling_layer(net, pool_name)
            setattr(self, pool_name, net)

        # Original author's note: most of the time is spent in the
        # fully-connected layers below.
        self.flatten5 = self.flatten_layer(self.pool5, 'flatten')
        self.fc6 = self.fc_layer(self.flatten5, 'fc6')
        self.fc7 = self.fc_layer(self.fc6, 'fc7')
        # fc8 has no activation because softmax is applied right after it.
        self.fc8 = self.fc_layer(self.fc7, 'fc8', activation=None)
        self.prob = tf.nn.softmax(self.fc8, name='prob')

        # Report how long graph construction took.
        print('building model finished: %4ds' % (time.time() - started))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Smoke test: build the network once to see how long graph construction takes.
vgg16_npy_path = 'vgg16.npy'
# NOTE: allow_pickle=True unpickles the file contents, which can execute
# arbitrary code - only load weight files obtained from a trusted source.
# .item() unwraps the 0-d object array into the stored parameter mapping.
data_dict = np.load(vgg16_npy_path, encoding='latin1', allow_pickle=True).item()
vgg16_for_result = VGGNet(data_dict)
content = tf.placeholder(tf.float32, shape=[1, 224, 224, 3])
vgg16_for_result.build(content)

building model ...
building model finished:    4s


In [3]:
# Shell command: delete the run_style_transfer directory (and everything in it)
# so the run below starts with an empty output folder.
!rm -rf run_style_transfer

In [4]:
# Input files.
vgg16_npy_path = 'vgg16.npy'      # pre-trained VGG16 parameters
content_img_path = 'gugong.jpg'   # content image
style_img_path = 'xingkong.jpeg'  # style image

# Optimisation settings.
num_steps = 100      # number of training steps
learning_rate = 10

# Loss weights.
lambda_c = 0.1   # content-loss weight; 0 rebuilds the image from style features only
lambda_s = 500   # style-loss weight; 0 rebuilds the image from content features only
                 # (large because the raw style loss is numerically small, see the training log)

# Folder that receives one result image per training step.
output_dir = './run_style_transfer'

# exist_ok=True avoids the check-then-create race and also creates any
# missing parent directories if output_dir is changed to a nested path.
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Demo: zip walks both tuples in lockstep and yields one pair per position.
left, right = (1, 2, 3), (4, 5, 6)
for i in zip(left, right):
    print(i)

(1, 4)
(2, 5)
(3, 6)


In [7]:
# Initialise the image that will be optimised.
def initial_result(shape, mean, stddev):
    """Create a variable filled with truncated-normal noise.

    Args:
    - shape: dimensions of the tensor to generate.
    - mean: mean of the normal distribution.
    - stddev: standard deviation of the normal distribution.

    Returns a tf.Variable, so the value can be updated by an optimiser.
    """
    return tf.Variable(tf.truncated_normal(shape, mean=mean, stddev=stddev))

# Read image data.
def read_img(img_name):
    """Load an image file as an int32 array with a leading batch axis.

    Args:
    - img_name: path of the image file.

    Returns an array of shape (1, height, width, 3). The image is always
    converted to RGB, so grayscale, palette or RGBA files also yield three
    channels; files that are already RGB are returned unchanged.
    """
    # The context manager closes the underlying file once the pixels are read.
    with Image.open(img_name) as img:
        np_img = np.array(img.convert('RGB'))  # (height, width, 3)
    # Wrap in a list to add the batch axis: (1, height, width, 3).
    return np.asarray([np_img], dtype=np.int32)

def gram_matrix(x):
    """Calculates the gram matrix of a feature map.

    Args:
    - x: features extracted from VGG Net, a 4-D tensor
      [batch, height, width, ch] whose last three dimensions are statically
      known. The batch dimension may be unknown (None).

    Returns a [batch, ch, ch] tensor: the pairwise channel similarities,
    divided by ch * height * width to keep the values small.
    """
    # Only the spatial and channel sizes are needed; the two spatial sizes
    # are merely multiplied together, so their order does not matter.
    _, height, width, ch = x.get_shape().as_list()
    # Flatten the spatial grid: every position becomes one row of ch values.
    # -1 keeps this valid when the static batch size is unknown.
    features = tf.reshape(x, [-1, height * width, ch])
    # [ch, h*w] * [h*w, ch] -> [ch, ch]; adjoint_a transposes the first operand.
    gram = tf.matmul(features, features, adjoint_a=True)
    return gram / tf.constant(ch * width * height, tf.float32)
    

# Randomly initialised target image; it is the variable being optimised.
result = initial_result((1, 224, 224, 3), 127.5, 20)

content_val = read_img(content_img_path)  # pixel values of the content image
style_val = read_img(style_img_path)      # pixel values of the style image

# Placeholders (TF 1.x) that are fed with the two images at run time.
content = tf.placeholder(tf.float32, shape=[1, 224, 224, 3])
style = tf.placeholder(tf.float32, shape=[1, 224, 224, 3])

# NOTE: allow_pickle=True unpickles the file contents - only load weight
# files obtained from a trusted source.
data_dict = np.load(vgg16_npy_path, encoding='latin1', allow_pickle=True).item()

# One VGG network per input: content image, style image and result image.
vgg_for_content = VGGNet(data_dict)
vgg_for_style = VGGNet(data_dict)
vgg_for_result = VGGNet(data_dict)

vgg_for_content.build(content)
vgg_for_style.build(style)
vgg_for_result.build(result)

# Which layers feed each loss is a hyper-parameter; using several layers is
# reported to work better. Candidates tried in this notebook:
# 'conv1_2', 'conv2_2', 'conv3_3', 'conv4_3', 'conv5_3'.
# Each list drives BOTH sides of its loss, so the reference features and the
# result features can never get out of sync (zip would otherwise silently
# drop unmatched entries).
content_layer_names = ['conv1_2']
style_layer_names = ['conv4_3']

content_features = [
    getattr(vgg_for_content, layer) for layer in content_layer_names]
result_content_features = [
    getattr(vgg_for_result, layer) for layer in content_layer_names]

# Each feature map has shape [1, height, width, channel].
style_features = [
    getattr(vgg_for_style, layer) for layer in style_layer_names]
result_style_features = [
    getattr(vgg_for_result, layer) for layer in style_layer_names]

# Gram matrices hold the pairwise similarity between channels.
style_gram = [gram_matrix(feature) for feature in style_features]
result_style_gram = [gram_matrix(feature) for feature in result_style_features]

# Content loss: mean squared error between feature maps, summed over layers.
# reduce_mean averages over axes 1, 2 and 3 and keeps the batch axis.
content_loss = tf.zeros(1, tf.float32)
for c, c_ in zip(content_features, result_content_features):
    content_loss += tf.reduce_mean((c - c_) ** 2, [1, 2, 3])

# Style loss: mean squared error between gram matrices, summed over layers.
# Only two axes are averaged because gram_matrix already merged the spatial ones.
style_loss = tf.zeros(1, tf.float32)
for s, s_ in zip(style_gram, result_style_gram):
    style_loss += tf.reduce_mean((s - s_) ** 2, [1, 2])

# Total loss is the weighted sum of content loss and style loss.
loss = content_loss * lambda_c + style_loss * lambda_s
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

Instructions for updating:
Colocations handled automatically by placer.
building model ...
building model finished:    4s
building model ...
building model finished:    2s
building model ...
building model finished:    2s
Instructions for updating:
Use tf.cast instead.


In [9]:
init_op = tf.global_variables_initializer()

# Everything that stays the same for every step is prepared once.
fetches = [loss, content_loss, style_loss, train_op]
feeds = {
    content: content_val,  # content image
    style: style_val,      # style image
}
log_template = 'step: %d, loss_value: %8.4f, content_loss: %8.4f, style_loss: %8.4f'

with tf.Session() as sess:
    sess.run(init_op)
    for step in range(num_steps):
        # One optimisation step; the three loss values are fetched with it.
        loss_value, content_loss_value, style_loss_value, _ = sess.run(
            fetches, feed_dict=feeds)

        # Log total, content and style loss after every step.
        print(log_template % (step + 1,
                              loss_value[0],
                              content_loss_value[0],
                              style_loss_value[0]))

        # Save the current result image after every step.
        result_img_path = os.path.join(
            output_dir, 'result-%05d.jpg' % (step + 1))
        # The variable has shape (1, 224, 224, 3); [0] drops the batch axis.
        # Values are clamped to the valid pixel range 0..255.
        result_val = np.clip(result.eval(sess)[0], 0, 255)
        img_arr = result_val.astype(np.uint8)
        img = Image.fromarray(img_arr)  # ndarray -> PIL image
        img.save(result_img_path)

step: 1, loss_value: 14286.0840, content_loss: 60794.2695, style_loss:  16.4133
step: 2, loss_value: 11867.1758, content_loss: 46524.6992, style_loss:  14.4294
step: 3, loss_value: 9082.7900, content_loss: 37875.5039, style_loss:  10.5905
step: 4, loss_value: 7390.5713, content_loss: 33117.4766, style_loss:   8.1576
step: 5, loss_value: 6972.4214, content_loss: 30414.1855, style_loss:   7.8620
step: 6, loss_value: 6219.8516, content_loss: 28930.2773, style_loss:   6.6536
step: 7, loss_value: 5468.9458, content_loss: 28085.8516, style_loss:   5.3207
step: 8, loss_value: 5163.5078, content_loss: 27752.9590, style_loss:   4.7764
step: 9, loss_value: 4762.9897, content_loss: 27515.2109, style_loss:   4.0229
step: 10, loss_value: 4580.9131, content_loss: 27357.0430, style_loss:   3.6904
step: 11, loss_value: 4371.3115, content_loss: 27250.2891, style_loss:   3.2926
step: 12, loss_value: 4282.2114, content_loss: 27105.4863, style_loss:   3.1433
step: 13, loss_value: 4078.8037, content_loss: 

In [8]:
 # Sanity check: recombine the content loss and style loss logged for one
 # training step into the total loss, using the configured weights instead
 # of repeating them as literals.
 16039.7168 * lambda_c + lambda_s * 1.6228

2415.37168