In [None]:
### 从原始cifar10数据开始，进行图像分类。

# 1. 数据预处理
### 读取标签文件
### 根据标签文件，将文件名和文件label生成一个dict数据结构
### 拆分train, val数据, 加载test数据
### 增广
### 生成train_iter, val_iter, test_iter
### 

# 2. 建立模型
# 3. 训练模型
# 4. 测试


In [2]:
# import相关包

import os 
import shutil
import time
import torchvision
import torch
import numpy as np
import sys
sys.path.append('../d2lzh/')
import d2lzh_pytorch as d2l

In [4]:
### 1.1下载数据集
### 1.2解压数据集
'''
    ../data/kaggle_cifar10/train/[1-50000].png；
    ../data/kaggle_cifar10/test/[1-300000].png；
    ../data/kaggle_cifar10/trainLabels.csv。
'''

# 考虑到原文件数据太多，先抽取500张训练样本，5张测试样本，这样容易上手。
'''
    ../data/kaggle_cifar10/train_tiny/[1-500].png；
    ../data/kaggle_cifar10/test/[1-5].png；
    ../data/kaggle_cifar10/trainLabels.csv。
'''

demo = False
# if demo:
#     import zipfile
#     for f in ['train_tiny.zip', 'test_tiny.zip', 'trainLables.csv.zip']:
#         with zipfile.Zipfile('../data/kaggle_cifar10/' + f, 'r') as z:
#             z.extractall('../data/kaggle_cifar10/')
            

### 1.3整理数据集
def read_label_file(data_dir, label_file, train_dir, valid_ratio):
    """
    Function:
        读取label.csv文件，将数据集的大概信息（总数，每一类平均数等）整理出来。
    Args:
        data_dir: train， val数据集的路經
        label_file: label.csv文件名
        train_dir: train数据集的文件名：train 或 train_tiny
        valid_ratio: 在提供的训练集中，分出用于验证的比例
    Return:
        n_train_per_label: 每一类的平均数。
        idx_label: 字典结构，“文件名”及其对应的"label"
    """
    
    with open(os.path.join(data_dir, label_file), 'r') as f:
        lines = f.readlines()[1:]
        tokens = [l.rstrip().split(',') for l in lines]
        idx_label = dict((int(idx), label) for idx, label in tokens)
    
    labels = set(idx_label.values())
    n_train_valid = len(os.listdir(os.path.join(data_dir, train_dir)))
    n_train = int(n_train_valid * (1 - valid_ratio))
    n_train_per_label = n_train // len(labels)
    assert 0 < n_train < n_train_valid
    
    return n_train_per_label, idx_label


###　检查某路径下文件名是否存在，如无，在该路径下创建它
def mkdir_if_not_exist(path):
    if not os.path.exists(os.path.join(*path)):
        os.mkdir(os.path.join(*path))
        
def reorg_train_valid(data_dir, train_dir, input_dir, n_train_per_label, idx_label):
    """
    Function:
        将给定的train数据进行整理归类，划分好训练集和测试集．
    Args:%%!
        data_dir: "数据的路径"
        train_dir:　"train" or "train_tiny"
        input_dir: "train_val_test"
        n_train_per_label： 每个label类至少要有多少张图片
        idx_label: 字典结构，“文件名”及其对应的"label"
    Return:
        已经重新整理的文件：
        data_dir/input_dir/train/label/xxx.jpg
        data_dir/input_dir/valid/label/xxx.jpg
    """
    
    label_count = {}
    for train_file in os.listdir(os.path.join(data_dir, train_dir)):
        idx = int(train_file.split('.')[0])  # 因为文件是按１,2,3.png...来命名的
        label = idx_label[idx]
        mkdir_if_not_exist([data_dir, input_dir, 'train_valid', label])
        
        shutil.copy(os.path.join(data_dir, train_dir, train_file),
                   os.path.join(data_dir, input_dir, 'train_valid', label))
        
        if label not in label_count or label_count[label] < n_train_per_label:
            mkdir_if_not_exist([data_dir, input_dir, 'train', label])
            shutil.copy(os.path.join(data_dir, train_dir, train_file),
                       os.path.join(data_dir, input_dir, 'train', label))
            label_count[label] = label_count.get(label, 0) + 1
            
        else: 
            mkdir_if_not_exist([data_dir, input_dir, 'valid', label])
            shutil.copy(os.path.join(data_dir, train_dir, train_file),
                       os.path.join(data_dir, input_dir, 'valid', label))
            
            
def reorg_test(data_dir, test_dir, input_dir):
    """
    Function:
        将给定的test数据放到自己定义的路径下面．
    Args:%%!
        data_dir: "数据的路径"
        test_dir:　"test" or "test_tiny"
        input_dir: "train_val_test""
    Return:
        已经重新整理的文件：
        data_dir/input_dir/test/unknown/xxx.jpg
    """
    mkdir_if_not_exist([data_dir, input_dir, 'test', 'unknown'])
    for test_file in os.listdir(os.path.join(data_dir, test_dir)):
        shutil.copy(os.path.join(data_dir, test_dir, test_file),
                   os.path.join(data_dir, input_dir, 'test', 'unknown'))
        
        
def reorg_cifar10_data(data_dir, label_file, train_dir, test_dir,
                      input_dir, valid_ratio):
    n_train_per_label, idx_label = read_label_file(data_dir,
                                                  label_file,
                                                  train_dir,
                                                  valid_ratio)
    reorg_train_valid(data_dir, train_dir, input_dir, n_train_per_label, idx_label)
    reorg_test(data_dir, test_dir, input_dir)

In [5]:
if demo:
    train_dir, test_dir =  'train_tiny', 'test_tiny'
    batch_size = 1
    label_file = 'trainLablesTiny.csv'
else:
    train_dir, test_dir, batch_size = 'train', 'test', 64
    label_file = 'trainLabels.csv'

data_dir = '../Datasets/cifar-10/'
input_dir = 'train_valid_test'
valid_ratio = 0.1
# reorg_cifar10_data(data_dir, label_file, train_dir, test_dir, input_dir, valid_ratio)

In [None]:
### 数据增广

In [6]:
transform_train = torchvision.transforms.Compose([
        torchvision.transforms.Resize(40),
        torchvision.transforms.RandomResizedCrop(32, 
                                                scale=(0.64, 1.0),
                                                ratio=(1.0, 1.0)),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.4914, 0.4822, 0.4465],
                                        [0.2023, 0.1994, 0.2010])        
    ])

transform_test = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            [0.4914, 0.4822, 0.4465],
            [0.2023, 0.1994, 0.2010]
        )
    ])

In [7]:
### 数据集准备

# 读取数据集
# train_data_path = os.path.join(data_dir, input_dir, 'train')
# train_data = torchvision.datasets.ImageFolder(root=train_data_path, transform=transform_train)

# valid_data_path = os.path.join(data_dir, input_dir, 'valid')
# valid_data = torchvision.datasets.ImageFolder(root=valid_data_path, transform=transform_train)

train_val_data_path = os.path.join(data_dir, input_dir, 'train_valid')
train_val_data = torchvision.datasets.ImageFolder(root=train_val_data_path, transform=transform_train)

test_data_path = os.path.join(data_dir, input_dir, 'test')
test_data = torchvision.datasets.ImageFolder(root=test_data_path, transform=transform_test)

# 根据batch size大小进行封装
# train_iter = torch.utils.data.DataLoader(train_data, batch_size, shuffle=True)
# valid_iter = torch.utils.data.DataLoader(valid_data, batch_size, shuffle=True)
train_valid_iter = torch.utils.data.DataLoader(train_val_data, batch_size, shuffle=True)
test_iter  = torch.utils.data.DataLoader(test_data, batch_size, shuffle=False)

### 定义模型（resnet18)

In [8]:
net = d2l.resnet18_cifar10(output=10, in_channels=3)
# print(net)

### 训练函数

In [15]:
def train(net, train_iter, valid_iter, loss, optimizer, device, num_epochs):
    """
    Function: 
        提供训练数据，进行训练。每训练完一个Epoch, 使用验证集进行验证    
    """
    net = net.to(device)
    print("training on:", device)
    batch_count = 0
   
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            ls = loss(y_hat, y)
            optimizer.zero_grad()
            ls.backward()
            optimizer.step()
            
            train_l_sum += ls.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
            
        if valid_iter is not None:            
            valid_acc = d2l.evaluate_accuracy(valid_iter, net)
            valid_output_str = ("train acc %.5f, valid_acc %.5f," % (train_acc_sum / n, valid_acc))
        else:
            valid_output_str = ("train acc %.5f, " % (train_acc_sum / n))

        print("Epoch %d, loss %.5f, " % (epoch + 1, train_l_sum / batch_count) + 
              valid_output_str + "time %.2f." % (time.time() - start))
            

In [31]:
# 训练

loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.005)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_epochs = 500

# train(net, train_iter, valid_iter, loss, optimizer, device, num_epochs)

In [32]:
# 进入测试环节（用所有训练集训练，所有的测试集进行测试）

import pandas as pd

train(net, train_valid_iter, None, loss, optimizer, device, num_epochs)

preds = []
for X, _ in test_iter:
    X = X.to(device)
    net = net.to(device)
    y_hat = net(X)
#     print(y_hat.argmax(dim=1).cpu())
    preds.extend(y_hat.argmax(dim=1).cpu())   # 将每个样本的预测值计算出来。预测结果如 0,1,2,...,9

sorted_ids = list(range(1, len(test_data) + 1))
sorted_ids.sort(key=lambda x: str(x))

df = pd.DataFrame({'id': sorted_ids, 'label':preds})
# df['label'] = df['label'].apply(lambda x: train_val_data.synsets[x])
df['label'] = df['label'].apply(lambda x: train_val_data.classes[x])
df.to_csv('submission.csv', index=False)


training on: cuda
Epoch 1, loss 0.33452, train acc 0.88432, time 31.57.
Epoch 2, loss 0.16476, train acc 0.88608, time 30.97.
Epoch 3, loss 0.10463, train acc 0.89214, time 30.94.
Epoch 4, loss 0.07745, train acc 0.89298, time 31.10.
Epoch 5, loss 0.06032, train acc 0.89594, time 31.11.
Epoch 6, loss 0.04882, train acc 0.89900, time 32.34.
Epoch 7, loss 0.04012, train acc 0.90282, time 31.05.
Epoch 8, loss 0.03450, train acc 0.90422, time 31.10.
Epoch 9, loss 0.02963, train acc 0.90816, time 31.12.
Epoch 10, loss 0.02582, train acc 0.91020, time 31.07.
Epoch 11, loss 0.02341, train acc 0.91218, time 31.10.
Epoch 12, loss 0.02107, train acc 0.91262, time 31.21.
Epoch 13, loss 0.01899, train acc 0.91472, time 31.24.
Epoch 14, loss 0.01701, train acc 0.91788, time 31.23.
Epoch 15, loss 0.01547, train acc 0.91962, time 30.84.
Epoch 16, loss 0.01445, train acc 0.92138, time 31.42.
Epoch 17, loss 0.01358, train acc 0.92094, time 30.94.
Epoch 18, loss 0.01227, train acc 0.92492, time 30.95.
E

In [29]:
import pandas as pd

preds = [0, 2, 3, 4, 5, 2, 9, 9, 8, 6]

sorted_ids = list(range(1, 10+1))
print(sorted_ids)
sorted_ids.sort(key=lambda x: str(x))
print(sorted_ids)

df = pd.DataFrame({'id': sorted_ids, 'label':preds})
print(df)


unk = train_val_data.classes
print(unk)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[1, 10, 2, 3, 4, 5, 6, 7, 8, 9]
   id  label
0   1      0
1  10      2
2   2      3
3   3      4
4   4      5
5   5      2
6   6      9
7   7      9
8   8      8
9   9      6
['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']


In [None]:
# My first submission. Just a baseline. I will submit in later as the epochs set in a large number, at this time, the epoch is 2.