In [1]:
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import metrics
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torchvision import datasets, models, transforms

### Load Data

In [2]:
def load_train(path):
    train_set = pd.read_csv('train_labels.csv')
    train_label = np.array(train_set['invasive'].iloc[: ])
    train_files = []
    for i in range(len(train_set)):
        train_files.append(path + str(int(train_set.iloc[i][0])) +'.jpg')
    train_set['name'] = train_files
    return train_files, train_set, train_label

train_files, train_set, train_label = load_train("train/")

train_set.head()

Unnamed: 0,name,invasive
0,train/1.jpg,0
1,train/2.jpg,0
2,train/3.jpg,1
3,train/4.jpg,0
4,train/5.jpg,1


In [37]:
def load_test(path):
    test_set = pd.read_csv('sample_submission.csv')
    test_files = []
    for i in range(len(test_set)):
        test_files.append(path + str(int(test_set.iloc[i][0])) +'.jpg')
    test_set["name"] = test_files
    return test_files, test_set

test_files, test_set = load_test("test/")

In [4]:
def load_img(filename):
    with open(filename, 'rb') as f:
        with Image.open(f) as img_f:
            return img_f.convert('RGB').resize((320, 320))

In [40]:
class MyDataset(Dataset):
    def __init__(self, df, training=True, validating=False, transforms=None):
        if training:
            df = df.sample(frac=1)
            split_index = int(df.shape[0] * 0.8)
            if validating:
                split_data = df.values[split_index:]
            else:
                split_data = df.values[:split_index]
            imgs = [None] * split_data.shape[0]
            labels = [None] * split_data.shape[0]
            for i, row in enumerate(split_data):
                fn, labels[i] = row
                imgs[i] = load_img(fn)
        else:
            imgs = [None]*df.values.shape[0]
            for i, row in enumerate(df.values):
                fn, _ = row
                imgs[i] = load_img(fn)
        self.imgs = imgs
        self.training = training
        self.transforms = transforms
        self.num = len(imgs)
        if self.training:
            self.labels = np.array(labels, dtype=np.float32)
                 
    def __len__(self):
        return self.num
    
    def __getitem__(self, idx):
        img = self.transforms(self.imgs[idx])
        if self.training:
            img = self.transforms(self.imgs[idx])
            return img, self.labels[idx]
        else:
            return img
            

In [18]:
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomSizedCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.Scale(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
}

  "please use transforms.RandomResizedCrop instead.")
  "please use transforms.Resize instead.")


In [19]:
def get_data_loader(dataset, training=True, validating=False, shuffle=True):
    if training and not validating:
        transkey = 'train'
    else:
        transkey = 'test'
    ds = MyDataset(dataset, training=training, validating=validating, transforms=data_transforms[transkey])
    loader = DataLoader(ds, batch_size=10, shuffle=shuffle)
    loader.num = ds.num
    return loader

In [20]:
def get_optimizer(net, lr=0.01):
    parameters = filter(lambda p: p.requires_grad, net.parameters())
    optimizer = torch.optim.SGD(parameters, lr=lr, momentum=0.9)
    return optimizer

In [27]:
weight_file = 'best_model.pth'
def train(net, criterion, optimizer, epochs=5):
    data_loaders = {'train': get_data_loader(train_set), 
                    'valid': get_data_loader(train_set, validating=True)}
    best_model = net
    best_acc = 0
    for epoch in range(epochs):
        print("Epoch {} / {}".format(epoch, epochs))
        for phase in ['train', 'valid']:
            if phase == 'train':
                optimizer = get_optimizer(net)
                net.train()
            else:
                net.train(False)
            running_loss = 0.0
            running_corrects = 0
            for img, label in data_loaders.get(phase):
                img, label = Variable(img.cuda()), Variable(label.cuda())
                optimizer.zero_grad()
                outputs = net(img)
                preds = torch.ge(outputs.data, 0.5).resize_(label.data.size())
                loss = criterion(outputs, label)
                if phase =='train':
                    loss.backward()
                    optimizer.step()
                running_loss += loss.data[0]
                running_corrects += (preds.float() == label.float()).sum().item()
            epoch_loss = running_loss / data_loaders[phase].num
            epoch_acc = running_corrects / data_loaders[phase].num
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                torch.save(net.state_dict(), weight_file)
                best_model = net
    print('Best validation accuracy: {:4f}'.format(best_acc))
    return best_model


In [28]:
def get_dense201():
    net = models.densenet201(pretrained=True)
    net.classifier = nn.Sequential(nn.Linear(net.classifier.in_features, 1), nn.Sigmoid())
    return net.cuda()

In [29]:
def train_net():
    net = get_dense201()
    criterion = nn.BCELoss()
    optimizer = get_optimizer(net)
    train(net, criterion, optimizer)

In [30]:
def predict(net):
    loader = get_data_loader(test_set, training=False, shuffle=False)
    preds = []
    net.eval()
    for i, img in enumerate(loader, 0):
        inputs = Variable(img.cuda())
        outputs = net(inputs)
        pred = outputs.data.cpu().tolist()
        for p in pred:
            preds.append(p)
    return np.array(preds)

In [31]:
def submit(preds, filename):
    df = pd.read_csv('sample_submission.csv')
    df['invasive'] = preds
    print(df.head())
    df.to_csv(filename, index=False)

In [32]:
train_net()

  nn.init.kaiming_normal(m.weight.data)


Epoch 0 / 5


  "Please ensure they have the same size.".format(target.size(), input.size()))
  "Please ensure they have the same size.".format(target.size(), input.size()))


train Loss: 0.0356 Acc: 0.8584


  "Please ensure they have the same size.".format(target.size(), input.size()))


valid Loss: 0.0193 Acc: 0.9107
Epoch 1 / 5
train Loss: 0.0251 Acc: 0.9069
valid Loss: 0.0232 Acc: 0.9477
Epoch 2 / 5
train Loss: 0.0404 Acc: 0.8671
valid Loss: 0.0222 Acc: 0.9020
Epoch 3 / 5
train Loss: 0.0342 Acc: 0.8731
valid Loss: 1.7165 Acc: 0.3725
Epoch 4 / 5
train Loss: 0.0948 Acc: 0.6629
valid Loss: 0.1989 Acc: 0.6710
Best validation accuracy: 0.947712


In [41]:
net = get_dense201()
net.load_state_dict(torch.load(weight_file))
preds = predict(net)
submit(preds, 'submission1.csv')

  nn.init.kaiming_normal(m.weight.data)


   name  invasive
0     1  1.000000
1     2  0.038848
2     3  0.122464
3     4  0.034812
4     5  1.000000
