In [None]:
"""
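    Distillation: understand the principle of knowledge distillation, and check whether
                  the Teacher model's predictions and the ground-truth labels can be
                  combined to drive the training of the Student model.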

"""

In [None]:
""" 요약 """
def train_knowledge_distillation():
    """Summary sketch: soft-target (temperature-scaled) distillation loss.

    Illustrative only. The body reads ``teacher_logits``, ``student_logits``,
    ``labels``, ``T``, ``ce_loss`` and the two loss weights, none of which are
    parameters of this sketch, so they would have to come from an enclosing
    scope for it to run.
    """
    # log_softmax keeps the logarithms finite even when a probability
    # underflows to exactly 0 (softmax(...).log() would give -inf -> NaN).
    log_soft_targets = nn.functional.log_softmax(teacher_logits / T, dim=-1)
    log_student_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

    # KL(teacher || student) summed over all elements, divided by size(0),
    # then scaled by T**2.
    soft_targets_loss = torch.sum(log_soft_targets.exp() * (log_soft_targets - log_student_prob)) / log_student_prob.size(0) * (T**2)

    label_loss = ce_loss(student_logits, labels)

    loss = soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss


def train_cosine_loss():
    """Summary sketch: cosine-similarity loss between hidden representations.

    Illustrative only. The body reads ``teacher``, ``inputs``,
    ``student_hidden_representation``, ``student_logits``, ``labels``,
    ``cosine_loss``, ``ce_loss`` and the two loss weights, none of which are
    parameters of this sketch.
    """
    _, teacher_hidden_representation = teacher(inputs)
    # One target entry of 1 per sample, created on the same device as the inputs.
    hidden_rep_loss = cosine_loss(student_hidden_representation, teacher_hidden_representation,
                                  target=torch.ones(inputs.size(0), device=inputs.device))
    label_loss = ce_loss(student_logits, labels)

    loss = hidden_rep_loss_weight * hidden_rep_loss + ce_loss_weight * label_loss


def train_mse_loss():
    """Summary sketch: MSE loss between student and teacher feature maps.

    Illustrative only. The body reads ``teacher``, ``inputs``,
    ``regressor_feature_map``, ``student_logits``, ``labels``, ``mse_loss``,
    ``ce_loss`` and the two loss weights, none of which are parameters of this
    sketch.
    """
    _, teacher_feature_map = teacher(inputs)

    hidden_rep_loss = mse_loss(regressor_feature_map, teacher_feature_map)

    label_loss = ce_loss(student_logits, labels)

    loss = feature_map_weight * hidden_rep_loss + ce_loss_weight * label_loss


In [None]:
def train_knowledge_distillation(teacher,
                                 student,
                                 train_loader,
                                 epochs,
                                 learning_rate,
                                 T,  # temperature
                                 soft_target_loss_weight,
                                 ce_loss_weight):
    """Train ``student`` on the true labels plus the teacher's softened outputs.

    For every batch the loss is
    ``soft_target_loss_weight * KD + ce_loss_weight * CE`` where ``CE`` is the
    cross-entropy of the student logits against ``labels`` and ``KD`` is the
    KL divergence between the temperature-softened teacher and student
    distributions, summed over all elements, divided by the size of dim 0 and
    multiplied by ``T**2``.

    Args:
        teacher: Module called as ``teacher(inputs)``; its output is used
            directly as logits. It is put in eval mode and evaluated under
            ``torch.no_grad()``, and it is not passed to the optimizer.
        student: Module called as ``student(inputs)`` returning logits. Only
            its parameters are optimized (Adam).
        train_loader: Iterable of ``(inputs, labels)`` batches; must support
            ``len()`` (used for the scheduler horizon and the epoch average).
        epochs: Number of passes over ``train_loader``.
        learning_rate: Initial learning rate for Adam.
        T: Temperature that divides both sets of logits before the softmax.
        soft_target_loss_weight: Weight of the distillation (KD) term.
        ce_loss_weight: Weight of the hard-label cross-entropy term.

    Returns:
        None. The mean batch loss of every epoch is printed.
    """
    ce_loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)
    # The scheduler is stepped once per batch, so its horizon is the total batch count.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs * len(train_loader))

    # Follow the student's device instead of hard-coding CUDA so the same
    # function also works on CPU-only machines or a non-default device.
    device = next(student.parameters()).device

    teacher.eval()  # Teacher set to evaluation mode
    student.train() # Student to train mode

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            ##################### YOUR CODE STARTS HERE #####################
            # Forward pass with the teacher model - do not save gradients here as we do not change the teacher's weights
            with torch.no_grad():
                teacher_logits = teacher(inputs)

            # Forward pass with the student model
            student_logits = student(inputs)

            # Soften both sets of logits with the temperature. log_softmax is used
            # instead of softmax(...).log(): when a probability underflows to 0 the
            # latter yields -inf and the product 0 * -inf turns the loss into NaN.
            # NOTE(review): assumes the class dimension is last and dim 0 is the batch.
            log_soft_targets = nn.functional.log_softmax(teacher_logits / T, dim=-1)
            log_student_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

            # Calculate the soft targets loss. Scaled by T**2 as suggested by the authors of the paper "Distilling the knowledge in a neural network"
            soft_targets_loss = torch.sum(log_soft_targets.exp() * (log_soft_targets - log_student_prob)) / log_student_prob.size(0) * (T**2)

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss
            ##################### YOUR CODE ENDS HERE #######################

            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

In [None]:
def train_cosine_loss(teacher,
                      student,
                      train_loader,
                      epochs,
                      learning_rate,
                      hidden_rep_loss_weight,
                      ce_loss_weight):
    """Train ``student`` with cross-entropy plus a cosine loss on hidden representations.

    For every batch the loss is
    ``hidden_rep_loss_weight * COS + ce_loss_weight * CE`` where ``CE`` is the
    cross-entropy of the student logits against ``labels`` and ``COS`` is
    ``nn.CosineEmbeddingLoss`` between the student and teacher hidden
    representations with an all-ones target (i.e. similarity is rewarded).

    Args:
        teacher: Module whose ``teacher(inputs)`` returns a 2-tuple; only the
            second element (hidden representation) is used. Evaluated in eval
            mode under ``torch.no_grad()``.
        student: Module whose ``student(inputs)`` returns
            ``(logits, hidden_representation)``. Only its parameters are
            optimized (Adam).
        train_loader: Iterable of ``(inputs, labels)`` batches; must support
            ``len()``.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Initial learning rate for Adam.
        hidden_rep_loss_weight: Weight of the cosine term.
        ce_loss_weight: Weight of the hard-label cross-entropy term.

    Returns:
        None. The mean batch loss of every epoch is printed.
    """
    ce_loss = nn.CrossEntropyLoss()
    cosine_loss = nn.CosineEmbeddingLoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)
    # The scheduler is stepped once per batch, so its horizon is the total batch count.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs * len(train_loader))

    # Follow the student's device instead of hard-coding CUDA so the same
    # function also works on CPU-only machines or a non-default device.
    device = next(student.parameters()).device

    teacher.eval()  # Teacher set to evaluation mode
    student.train() # Student to train mode

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            ##################### YOUR CODE STARTS HERE #####################
            # Forward pass with the teacher model and keep only the hidden representation
            with torch.no_grad():
                _, teacher_hidden_representation = teacher(inputs)

            # Forward pass with the student model
            student_logits, student_hidden_representation = student(inputs)

            # Calculate the cosine loss. Target is a vector of ones: with target 1 the
            # loss decreases as the cosine similarity of the two inputs increases.
            # The target holds one entry per sample of the current batch and is
            # created directly on the training device.
            # NOTE(review): assumes both hidden representations are (batch, features)
            # with the same feature width - confirm against the model definitions.
            hidden_rep_loss = cosine_loss(student_hidden_representation, teacher_hidden_representation,
                                          target=torch.ones(inputs.size(0), device=device))

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = hidden_rep_loss_weight * hidden_rep_loss + ce_loss_weight * label_loss
            ##################### YOUR CODE ENDS HERE #######################

            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

In [None]:
def train_mse_loss(teacher,
                   student,
                   train_loader,
                   epochs,
                   learning_rate,
                   feature_map_weight,
                   ce_loss_weight):
    """Train ``student`` with cross-entropy plus an MSE loss on feature maps.

    For every batch the loss is
    ``feature_map_weight * MSE + ce_loss_weight * CE`` where ``CE`` is the
    cross-entropy of the student logits against ``labels`` and ``MSE`` is
    ``nn.MSELoss`` between the student's second output and the teacher's
    second output.

    Args:
        teacher: Module whose ``teacher(inputs)`` returns a 2-tuple; only the
            second element (feature map) is used. Evaluated in eval mode under
            ``torch.no_grad()``.
        student: Module whose ``student(inputs)`` returns
            ``(logits, regressor_feature_map)``. Only its parameters are
            optimized (Adam).
        train_loader: Iterable of ``(inputs, labels)`` batches; must support
            ``len()``.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Initial learning rate for Adam.
        feature_map_weight: Weight of the MSE term.
        ce_loss_weight: Weight of the hard-label cross-entropy term.

    Returns:
        None. The mean batch loss of every epoch is printed.
    """
    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)
    # The scheduler is stepped once per batch, so its horizon is the total batch count.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs * len(train_loader))

    # Follow the student's device instead of hard-coding CUDA so the same
    # function also works on CPU-only machines or a non-default device.
    device = next(student.parameters()).device

    teacher.eval()  # Teacher set to evaluation mode
    student.train() # Student to train mode

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            ##################### YOUR CODE STARTS HERE #####################
            # Again ignore teacher logits
            with torch.no_grad():
                _, teacher_feature_map = teacher(inputs)

            # Forward pass with the student model
            student_logits, regressor_feature_map = student(inputs)

            # Calculate the feature-map matching loss.
            # NOTE(review): assumes the student's feature map has the same shape as
            # the teacher's - confirm against the model definitions.
            hidden_rep_loss = mse_loss(regressor_feature_map, teacher_feature_map)

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = feature_map_weight * hidden_rep_loss + ce_loss_weight * label_loss
            ##################### YOUR CODE ENDS HERE #######################

            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")