In [1]:
from __future__ import absolute_import, division, print_function

import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# CIFAR-10 class names. The order matters: this list is passed as `classes`
# to the image generators and is indexed with the argmax of the model output
# to turn predicted indices back into names.
class_names = [
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]

# Locations of the label CSV files and of the image folders they refer to.
train_lables_file = './cifar10/trainLabels.csv'
test_csv_file = './cifar10/sampleSubmission.csv'
train_folder = './cifar10/train'
test_folder = './cifar10/test'

def parse_csv_file(filepath, folder, skip_header=True):
    """Parse an `id,label` CSV file into a list of (image_path, label) tuples.

    Args:
        filepath: Path of the CSV file to read. Every data row must contain
            exactly two comma-separated fields: the image id and its label.
        folder: Directory that holds the images; each id is mapped to
            `<folder>/<id>.png`.
        skip_header: When True (default) the first line of the file is
            treated as a column header and is not returned as a sample.

    Returns:
        A list with one `(image_full_path, label_str)` tuple per data row,
        in file order.

    Raises:
        ValueError: If a non-empty row does not have exactly two fields.
    """
    results = []
    with open(filepath, 'r') as f:
        if skip_header:
            # Drop the column-name row; the default keeps an empty file safe.
            next(f, None)
        for line in f:
            # strip() also removes '\r' so files with Windows line endings work.
            line = line.strip()
            if not line:
                continue
            image_id, label_str = line.split(',')
            image_full_path = os.path.join(folder, image_id + '.png')
            results.append((image_full_path, label_str))
    # Return only after every row has been consumed.
    return results

import pprint

# Load the (image path, label) pairs for the labelled training set and for
# the test set listed in the sample submission.
train_labels_info = parse_csv_file(train_lables_file, train_folder)
# The result used to be bound to the misspelled name below while later cells
# read `train_labels_info`; keep the old spelling as an alias so any code that
# still refers to it keeps working.
train_lables_info = train_labels_info
test_csv_info = parse_csv_file(test_csv_file, test_folder)

# Preview the first few entries and report how many samples were parsed.
pprint.pprint(train_labels_info[0:5])
pprint.pprint(test_csv_info[0:5])
print(len(train_labels_info), len(test_csv_info))

In [None]:
# Split the labelled samples: the first 45000 rows are used for training and
# the remaining rows are held out for validation. The test list is kept whole.
column_names = ['filepath', 'class']

train_df = pd.DataFrame(train_labels_info[:45000])
valid_df = pd.DataFrame(train_labels_info[45000:])
test_df = pd.DataFrame(test_csv_info)

# Give every frame the column names that the image generators look up.
for frame in (train_df, valid_df, test_df):
    frame.columns = column_names

# Preview each frame.
for frame in (train_df, valid_df, test_df):
    print(frame.head())

In [None]:
# Image geometry and batching parameters used by the generators and the model.
height = 32
width = 32
channels = 3
batch_size = 64
num_classes = 10

# Augmentation settings for the training image generator.
augmentation_options = dict(
    rescale=1. / 255,        # multiply every pixel value by 1/255
    rotation_range=40,       # range of random rotation angles
    width_shift_range=0.2,   # horizontal shift: below 1 is a fraction, above 1 is pixels
    height_shift_range=0.2,  # vertical shift: below 1 is a fraction, above 1 is pixels
    shear_range=0.2,         # shear intensity
    zoom_range=0.2,          # zoom intensity
    horizontal_flip=True,    # whether to flip images horizontally
    fill_mode='nearest',     # rule used to fill in newly created pixels
)
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**augmentation_options)

# Training batches: images are read from the paths in `train_df` and pass
# through the random augmentations configured on `train_datagen`.
train_generator = train_datagen.flow_from_dataframe(
    train_df,                      # dataframe holding file paths and labels
    directory='./',                # base directory the file paths are relative to
    x_col='filepath',
    y_col='class',
    classes=class_names,
    target_size=(height, width),
    batch_size=batch_size,
    seed=7,                        # random seed
    shuffle=True,                  # reshuffle the samples
    class_mode='sparse')

# Validation batches must only be rescaled, never augmented, so they are built
# from their own `valid_datagen` rather than from `train_datagen`.
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)
valid_generator = valid_datagen.flow_from_dataframe(
    valid_df,
    directory='./',
    x_col='filepath',
    y_col='class',
    classes=class_names,
    target_size=(height, width),
    batch_size=batch_size,
    seed=7,                        # random seed
    shuffle=False,                 # fixed order keeps validation runs comparable
    class_mode='sparse')

In [None]:
# Sample counts reported by the two generators; the training cell divides them
# by the batch size to get the number of steps.
train_num, valid_num = train_generator.samples, valid_generator.samples
print(train_num, valid_num)

In [None]:
# Pull two batches from the training generator and inspect them.
for _ in range(2):
    x, y = next(train_generator)
    print(x.shape, y.shape)
    print(y)

In [None]:
def _conv_bn(filters, **conv_kwargs):
    """Return a 3x3 'same'-padded ReLU convolution followed by batch norm.

    Extra keyword arguments (e.g. `input_shape` for the first layer of the
    network) are forwarded to the Conv2D constructor.
    """
    return [
        tf.keras.layers.Conv2D(filters, 3, padding='same', activation='relu',
                               kernel_initializer='lecun_normal',
                               bias_initializer='lecun_normal',
                               **conv_kwargs),
        tf.keras.layers.BatchNormalization(),
    ]


# Three convolutional stages (128, 256, 512 filters). Each stage is two
# Conv+BatchNorm pairs followed by a 2x2 max-pool.
model_layers = []
for stage_index, stage_filters in enumerate((128, 256, 512)):
    # Only the very first layer declares the input shape. Images arrive as
    # (height, width, channels), matching target_size=(height, width) used by
    # the generators.
    if stage_index == 0:
        model_layers += _conv_bn(stage_filters,
                                 input_shape=[height, width, channels])
    else:
        model_layers += _conv_bn(stage_filters)
    model_layers += _conv_bn(stage_filters)
    model_layers.append(tf.keras.layers.MaxPool2D(pool_size=2))

# Classifier head: flatten, one hidden dense layer, softmax over the classes.
model_layers += [
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
]

model = tf.keras.Sequential(model_layers)

# The generators use class_mode='sparse' (integer labels), hence the sparse
# categorical cross-entropy loss.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.sparse_categorical_crossentropy,
              metrics=['accuracy'])

model.summary()

In [None]:
epochs = 30
# Model.fit accepts Keras generators directly; fit_generator is deprecated.
# Integer division drops the final partial batch so every step is a full batch.
history = model.fit(train_generator,
                    steps_per_epoch=train_num // batch_size,
                    epochs=epochs,
                    validation_data=valid_generator,
                    validation_steps=valid_num // batch_size)

In [None]:
from matplotlib import pyplot as plt
def plot_learning_curves_acc(history, y_min=0, y_max=1):
    """Plot every recorded metric whose name contains 'acc'.

    Args:
        history: Object whose `history` attribute maps metric names to
            per-epoch value lists.
        y_min: Lower limit of the y axis.
        y_max: Upper limit of the y axis.
    """
    acc_curves = {
        metric_name: values
        for metric_name, values in history.history.items()
        if 'acc' in metric_name
    }
    pd.DataFrame(acc_curves).plot(figsize=(8, 5))
    plt.grid(True)
    axes = plt.gca()
    axes.set_ylim(y_min, y_max)
    plt.show()


def plot_learning_curves_loss(history, y_min=0, y_max=1):
    """Plot every recorded metric whose name contains 'loss'.

    Args:
        history: Object whose `history` attribute maps metric names to
            per-epoch value lists.
        y_min: Lower limit of the y axis.
        y_max: Upper limit of the y axis.
    """
    loss_curves = {
        metric_name: values
        for metric_name, values in history.history.items()
        if 'loss' in metric_name
    }
    pd.DataFrame(loss_curves).plot(figsize=(8, 5))
    plt.grid(True)
    axes = plt.gca()
    axes.set_ylim(y_min, y_max)
    plt.show()
    
%matplotlib inline

plot_learning_curves_acc(history)
plot_learning_curves_loss(history, 0, 3)

In [None]:
# Test images are only rescaled. They must not go through the random
# augmentations of `train_datagen`, so the dedicated `test_datagen` is used.
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)
test_generator = test_datagen.flow_from_dataframe(
    test_df,
    directory='./',
    x_col='filepath',
    y_col='class',
    classes=class_names,
    target_size=(height, width),
    batch_size=batch_size,
    seed=7,                        # random seed
    # Keep the dataframe order: predictions are later written out by position,
    # so shuffling would pair them with the wrong image ids.
    shuffle=False,
    class_mode='sparse')

In [None]:
# Model.predict accepts Keras generators directly; predict_generator is
# deprecated.
test_predict = model.predict(test_generator,
                             workers=10,                # degree of parallelism
                             use_multiprocessing=True)  # True: 10 processes, False: 10 threads

In [None]:
# Show the shape of the prediction array returned for the test generator.
print(test_predict.shape)

In [None]:
# Peek at the first five rows of the model output (softmax scores per class).
print(test_predict[0:5])

In [None]:
# For every prediction row, take the index of the highest-scoring class.
test_predict_class_indices = test_predict.argmax(axis=1)
print(test_predict_class_indices)

In [None]:
# Translate each predicted class index into its name via `class_names`.
test_predict_class = list(map(class_names.__getitem__,
                              test_predict_class_indices))
print(test_predict_class[0:5])

In [None]:
def generate_submissions(filename, predict_class):
    """Write predicted labels to a submission CSV file.

    The file starts with an `id,label` header and is followed by one row per
    prediction. Ids are 1-based positions in `predict_class`.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        predict_class: Sequence of predicted labels, ordered by image id.
    """
    with open(filename, 'w') as f:
        # No space after the comma, so the second column is named exactly 'label'.
        f.write('id,label\n')
        for image_id, label in enumerate(predict_class, start=1):
            f.write('{:d},{}\n'.format(image_id, label))

# Write the predicted class names for the test set to the submission file.
output_file = "./cifar10/submission.csv"
generate_submissions(output_file, test_predict_class)