In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.utils.rnn as rnn
import torch.nn.functional as F
import torch.utils.data as Data
import imgpreprocess
import torchvision.models as md
import pdb

In [2]:

def conv3x3(in_planes, out_planes):
    """Build a bias-free 3x3 convolution that keeps the spatial size.

    Stride 1 combined with one pixel of zero padding leaves height and width
    unchanged; only the channel count goes from ``in_planes`` to
    ``out_planes``.
    """
    conv_options = dict(kernel_size=3, stride=1, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)


class BasicBlock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm stages with ELU activations.

    Args:
        inplanes: number of input channels.
        planes: number of output channels of both convolutions.
        stride: stride of the first convolution (default 1). Previously this
            value was stored but never applied, so a strided block silently
            ran at stride 1 and could not be added to a strided shortcut.
        downsample: optional module applied to the input to produce the
            shortcut branch. It is required whenever the main branch changes
            the tensor shape (``stride != 1`` or ``inplanes != planes``),
            otherwise the residual addition fails on mismatched shapes.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # The convolutions are built directly (instead of through a fixed
        # stride-1 helper) so that the first one can honour `stride`.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.elu = nn.ELU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``ELU(main_branch(x) + shortcut(x))``."""
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.elu(out)
        out = self.conv2(out)
        out = self.bn2(out)

        # Identity shortcut unless a projection was supplied.
        residual = x if self.downsample is None else self.downsample(x)

        # In-place add is safe: `out` is a fresh tensor produced by bn2.
        out += residual
        return self.elu(out)


class ResNet(nn.Module):
    """Small residual encoder for single-channel images.

    The network is four stages; each stage is an unpadded 5x5 stride-2
    convolution, an ELU, and one residual ``block``. Channel widths are
    32, 64, 128 and 256. The stage output is average-pooled with a 5x5
    window, flattened, L2-normalised per sample and scaled by 10.

    Because the pooling window is 5x5, the last stage must produce at least
    a 5x5 map; a 125x125 input reduces to exactly 5x5 and therefore yields a
    256-dimensional embedding.

    Args:
        block: residual block class. It is called as ``block(inplanes,
            planes)`` and must expose a class attribute ``expansion``.
            The stage convolution emits ``planes`` channels while the block is
            told to expect ``planes * expansion`` channels, so only
            ``expansion == 1`` is consistent with this layout.
    """

    def __init__(self, block):
        super(ResNet, self).__init__()
        # Channel count fed into the next stage; the input image has 1 channel.
        self.inplanes = 1

        # NOTE: this stem (conv1/bn1/elu/maxpool) is constructed but never
        # called by forward(). It is kept so the module's parameter names and
        # state_dict layout stay unchanged.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.elu = nn.ELU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 32)
        self.layer2 = self._make_layer(block, 64)
        self.layer3 = self._make_layer(block, 128)
        self.layer4 = self._make_layer(block, 256)
        self.avgpool = nn.AvgPool2d(5, stride=1)

        # Kaiming init for every convolution (the 'relu' gain is used even
        # though the activations are ELU), unit scale / zero shift for BN.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes):
        """Build one stage: strided 5x5 conv -> ELU -> one residual block."""
        layers = [
            nn.Conv2d(self.inplanes, planes, kernel_size=5, stride=2, padding=0, bias=False),
            nn.ELU(),
        ]
        # Later stages read this to learn their input width.
        self.inplanes = planes * block.expansion
        layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` into one embedding row per sample, each of L2 norm 10.

        A row whose features are all zero is returned as zeros; the previous
        hand-written division by the norm produced NaN in that case.
        """
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # F.normalize clamps the norm from below by a tiny eps, so the
        # division is always well defined.
        x = F.normalize(x, p=2, dim=1)
        return x * 10



def resnet18(**kwargs):
    """Instantiate this notebook's ``ResNet`` with ``BasicBlock`` as its unit.

    Any keyword arguments are passed straight through to the ``ResNet``
    constructor.
    """
    return ResNet(BasicBlock, **kwargs)

In [3]:

class rnnmodel(nn.Module):
    """Token-sequence encoder: embedding followed by a 2-layer LSTM.

    Args:
        charcount: vocabulary size (number of rows in the embedding table).
        max_length: number of time steps the LSTM output is padded to before
            flattening. Defaults to 210, the value that used to be hard-coded
            in ``forward``. A batch containing a longer sequence makes
            ``pad_packed_sequence`` raise.
    """

    def __init__(self, charcount, max_length=210):
        super(rnnmodel, self).__init__()
        self.charcount = charcount
        self.embed_size = 100
        self.hidden_size = 256
        self.max_length = max_length
        self.embedding = nn.Embedding(charcount, self.embed_size)
        self.rnn = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size,
                           num_layers=2, dropout=0.7, batch_first=True)

    def forward(self, x, length):
        """Encode a padded batch of token ids.

        Args:
            x: integer tensor of shape (batch, steps) holding token ids.
            length: true length of every row of ``x``, in descending order
                (``pack_padded_sequence`` is called with its default
                sorted-input requirement).

        Returns:
            Tensor of shape (batch, max_length * hidden_size); steps past a
            sequence's true length are zero.
        """
        batch_size = x.size(0)
        embed = self.embedding(x)

        # pack_padded_sequence requires tensor lengths to live on the CPU,
        # while callers in this notebook hand over CUDA tensors.
        if torch.is_tensor(length):
            length = length.cpu()

        x_packed = rnn.pack_padded_sequence(embed, length, batch_first=True)
        output_lstm, _ = self.rnn(x_packed)
        output_lstm, _ = rnn.pad_packed_sequence(
            output_lstm, batch_first=True, total_length=self.max_length)
        return output_lstm.contiguous().view(batch_size, -1)




In [4]:

class finalmodel(nn.Module):
    """Binary classifier on top of a pretrained torchvision ResNet-18.

    Pipeline implemented by ``forward``:
        1. add a channel axis to the image batch,
        2. ``prepro``: one 64x64 convolution mapping 1 channel to 3
           (with padding 1 this shrinks each spatial side by 61 pixels),
        3. ``cnnmodel``: pretrained ResNet-18, whose output feeds a
           1000-input linear layer,
        4. three linear layers 1000 -> 4096 -> 2048 -> 2 with leaky-ReLU
           between them.

    ``rnnmodel`` is constructed (vocabulary size 1957) but is not used by
    ``forward``; the report branch is currently disabled.
    """

    def __init__(self):
        super(finalmodel, self).__init__()
        self.cnnmodel = md.resnet18(pretrained=True)
        self.rnnmodel = rnnmodel(1957)
        self.linear1 = nn.Linear(1000, 4096)
        self.linear2 = nn.Linear(4096, 2048)
        self.linear3 = nn.Linear(2048, 2)
        self.prepro = nn.Conv2d(1, 3, kernel_size=64, stride=1, padding=1, bias=False)

        # Xavier-initialise only the freshly created head. Walking
        # self.modules() also reaches nn.Linear layers inside submodules,
        # i.e. the final `fc` layer of the pretrained backbone, and would
        # replace its pretrained weights with random ones.
        for head_layer in (self.linear1, self.linear2, self.linear3):
            nn.init.xavier_uniform_(head_layer.weight)

    def forward(self, input_pic, input_rep, length):
        """Return class logits of shape (batch, 2).

        Args:
            input_pic: image batch without a channel axis; one is inserted at
                dim 1 before the first convolution.
            input_rep: padded report token ids. Accepted for interface
                compatibility; not used while the report branch is disabled.
            length: report lengths. Not used, for the same reason.
        """
        input_pic = input_pic.unsqueeze(1)
        input_pic = self.prepro(input_pic)
        output_pic = self.cnnmodel(input_pic)

        out = self.linear1(output_pic)
        out = F.leaky_relu(out)
        out = self.linear2(out)
        out = F.leaky_relu(out)
        out = self.linear3(out)
        return out


In [5]:
# Load the three pre-built arrays from disk, in this order:
#   data   - image stack; later cells treat its last axis (shape[2]) as the
#            sample axis
#   report - per-sample report entries, indexed along axis 0
#   labels - per-sample labels, indexed along axis 0
# NOTE(review): later cells read a separate length from every report entry,
# so report.npy may hold an object array; recent NumPy releases only load
# those with allow_pickle=True - confirm against the installed version.
data, report, labels = (
    np.load(npy_path)
    for npy_path in (
        'data/picture_data.npy',
        'data/report.npy',
        'data/labels.npy',
    )
)

In [10]:
# Run configuration:
#   validation - presumably the fraction of samples meant for the hold-out set
#   epoch      - number of passes over the training set
#   batch_size - training mini-batch size
validation, epoch, batch_size = 0.1, 4, 32

In [11]:

# Random train/validation split along the sample axis (last axis of `data`).
# The hold-out size comes from the `validation` constant instead of a
# repeated literal 0.1, and the split point is computed once.
# NOTE(review): the shuffle is unseeded, so every run produces a different
# split; seed NumPy beforehand if the split has to be reproducible.
n_samples = data.shape[2]
n_val = int(n_samples * validation)

index_whole = np.arange(n_samples)
np.random.shuffle(index_whole)
val_sample_ids = index_whole[:n_val]
train_sample_ids = index_whole[n_val:]

val_data = data[:, :, val_sample_ids]
val_report = report[val_sample_ids]
val_labels = torch.Tensor(labels[val_sample_ids])

train_data = data[:, :, train_sample_ids]
train_report = report[train_sample_ids]
train_labels = torch.Tensor(labels[train_sample_ids])

# Mean label value of the validation set (the positive rate if labels are 0/1).
distribution = val_labels.sum() / val_labels.shape[0]
print(distribution)


In [12]:

model = finalmodel()

# Class-weighted cross entropy: class 0 is weighted 1/77 and class 1 is 1/23.
# NOTE(review): 77/23 looks like the class split in percent - confirm.
criterial = nn.CrossEntropyLoss(weight=torch.Tensor([1/77, 1/23]))

# Despite its name this optimizer is Adam with default hyper-parameters.
SGDOptimizer = torch.optim.Adam(model.parameters())

# Positions 0..n_train-1 into train_data / train_report / train_labels.
# The length is taken from the split itself rather than recomputed from a
# hard-coded fraction, so it always matches train_labels.
train_index = torch.arange(train_labels.shape[0])


In [13]:

# The loaders yield (position, label) pairs; the training loop uses the
# positions to slice images and reports out of the NumPy arrays.
train_dataset = Data.TensorDataset(train_index, train_labels)
loader = Data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# One position per validation label. The length is derived from the split
# itself rather than recomputed from a hard-coded fraction.
val_index = torch.arange(val_labels.shape[0])
val_dataset = Data.TensorDataset(val_index, val_labels)
# No gradients are kept during validation, so a batch twice as large is used.
val_loader = Data.DataLoader(dataset=val_dataset, batch_size=batch_size * 2, shuffle=False)


In [14]:
# Move the network and the loss module to the GPU together: the loss holds
# its class-weight tensor as a buffer, and that buffer must sit on the same
# device as the logits it is applied to.
model = model.cuda()
criterial = criterial.cuda()

In [None]:

def _prepare_batch(images, reports, idx, label):
    """Assemble one mini-batch, ordered by descending report length.

    Args:
        images: NumPy image stack whose last axis is the sample axis.
        reports: NumPy array of per-sample token sequences.
        idx: tensor of sample positions produced by the DataLoader.
        label: tensor of labels aligned with ``idx``.

    Returns:
        (img_batch, report_batch, length, label), all rows in the same
        (length-sorted) order. ``img_batch``, ``report_batch`` and ``label``
        are on the GPU; ``length`` stays on the CPU because
        ``pack_padded_sequence`` requires CPU lengths.
    """
    rows = idx.long().numpy()
    # (H, W, batch) -> (batch, H, W)
    img_batch = torch.Tensor(images[:, :, rows].transpose(2, 0, 1))
    batch_reports = reports[rows]

    length = torch.tensor([len(tokens) for tokens in batch_reports], dtype=torch.long)
    # Zero-pad every report to the longest one in this batch.
    report_batch = torch.zeros((len(rows), int(length.max())), dtype=torch.long)
    for i, tokens in enumerate(batch_reports):
        report_batch[i, :len(tokens)] = torch.Tensor(tokens).long()

    _, sortid = torch.sort(length, dim=0, descending=True)
    return (
        img_batch.index_select(0, sortid).cuda(),
        report_batch.index_select(0, sortid).cuda(),
        length.index_select(0, sortid),
        label.index_select(0, sortid).cuda().long(),
    )


for e in range(epoch):
    # ---- training pass ----
    model.train()
    epoch_loss = 0.0
    for batch_num, (idx, label) in enumerate(loader):
        img_input, report_batch, length, label = _prepare_batch(
            train_data, train_report, idx, label)

        SGDOptimizer.zero_grad()
        logits = model(img_input, report_batch, length)
        loss = criterial(logits, label)
        loss.backward()
        SGDOptimizer.step()

        # .item() detaches the value; summing the loss tensor itself would
        # keep every batch's autograd graph alive for the whole epoch.
        epoch_loss += loss.item()

        if batch_num % 30 == 0:
            train_acc = (logits.argmax(dim=1) == label).float().mean().item()
            print("batch_num {0},loss {1:.4f}, acc {2:.4f}".format(batch_num, loss.item(), train_acc))
    epoch_loss = epoch_loss / (batch_num + 1)
    print("train_loss {0:.4f}".format(epoch_loss))

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_seen = 0
    with torch.no_grad():
        for idx, label in val_loader:
            # Images must come from val_data: the positions index the
            # validation split, not the training split.
            img_input, report_batch, length, label = _prepare_batch(
                val_data, val_report, idx, label)
            logits = model(img_input, report_batch, length)

            val_loss += criterial(logits, label).item()
            val_correct += (logits.argmax(dim=1) == label).sum().item()
            val_seen += label.shape[0]
    # Loss: mean of per-batch losses. Accuracy: per sample, so a smaller
    # final batch is not over-weighted.
    val_loss = val_loss / max(len(val_loader), 1)
    val_acc = val_correct / max(val_seen, 1)
    print("val_loss {0:.4f}, val_acc: {1:.4f}".format(val_loss, val_acc))




batch_num 0,loss 0.8770, acc 0.2500
batch_num 30,loss 0.5301, acc 0.7812
batch_num 60,loss 0.4621, acc 0.8438
batch_num 90,loss 0.6165, acc 0.6875
batch_num 120,loss 0.4728, acc 0.7812
batch_num 150,loss 0.5895, acc 0.7500
batch_num 180,loss 0.4185, acc 0.8750
batch_num 210,loss 0.4685, acc 0.8438
batch_num 240,loss 0.5262, acc 0.7812
batch_num 270,loss 0.6620, acc 0.6562
batch_num 300,loss 0.4463, acc 0.8438
batch_num 330,loss 0.4934, acc 0.7188
batch_num 360,loss 0.3564, acc 0.9062
batch_num 390,loss 0.5088, acc 0.8125
batch_num 420,loss 0.5991, acc 0.7500
batch_num 450,loss 0.7676, acc 0.6250
batch_num 480,loss 0.4680, acc 0.7812
batch_num 510,loss 0.6815, acc 0.6562
batch_num 540,loss 0.3269, acc 0.8750
batch_num 570,loss 0.4907, acc 0.7500
batch_num 600,loss 0.6806, acc 0.5938
batch_num 630,loss 0.5612, acc 0.7812
batch_num 660,loss 0.5335, acc 0.7812
batch_num 690,loss 0.6104, acc 0.7188
batch_num 720,loss 0.6204, acc 0.6562
batch_num 750,loss 0.6376, acc 0.6562
train_loss 0.5697