<a href="https://colab.research.google.com/github/husseinfawaz20/QoT/blob/main/QoT_model_distillation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.metrics import r2_score
from statsmodels.api import OLS
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from numpy import mean
from numpy import absolute
from numpy import sqrt
import time

In [None]:
# Load the lightpath dataset and hold out 20% of the rows for testing.
df = pd.read_csv('DS_LPLevel_10100_Het_CFEGN_v3.csv')
target = 'Lightpath GSNR'
# Every column except the target is used as an input feature.
X = df.drop(columns=[target])
y = df[target]
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
from keras import backend as K

def get_results(file='results.csv'):
    """Read the results CSV at ``file`` and return its last ten columns.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A DataFrame holding the trailing ten columns of the file (all
        columns when the file has fewer than ten).
    """
    n_trailing = 10
    table = pd.read_csv(file)
    return table.iloc[:, -n_trailing:]

def addResults(rmse, mae, r2, index, tt, it, file='results.csv'):
    """Append one row of evaluation results to a CSV file.

    Args:
        rmse: Root-mean-squared error to record.
        mae: Mean absolute error to record.
        r2: R^2 value to record.
        index: Label written to the ``type`` column.
        tt: Training time to record.
        it: Inference time per data point. Either a length-3 sequence
            ``(average, max, min)`` or a single value; in the latter case
            the max and min columns are filled with ``'-'``.
        file: CSV path to append to. The file is created with a header row
            when it does not exist yet.
    """
    # A bare number has no len(); treat it as "average only" instead of
    # letting the TypeError escape.
    try:
        is_triple = len(it) == 3
    except TypeError:
        is_triple = False

    if is_triple:
        avg_it, max_it, min_it = it
    else:
        avg_it, max_it, min_it = it, '-', '-'

    row = pd.DataFrame({
        'type': [index],
        'rmse': [rmse],
        'mae': [mae],
        'r2': [r2],
        'training time': [tt],
        'average inference time/data point': [avg_it],
        'max inference time/data point': [max_it],
        'min inference time/data point': [min_it],
    })

    # The DataFrame index is still written as the first column so that new
    # rows line up with files produced by earlier runs.
    write_header = not os.path.isfile(file)
    row.to_csv(file, mode='a', header=write_header)
def calc_inference_time(model, data_points=None):
    """Measure the per-sample prediction latency of ``model``.

    Each row of ``data_points`` is reshaped to ``(1, n_features)`` and passed
    to ``model.predict`` on its own; only the ``predict`` call is timed.

    Args:
        model: Object exposing a ``predict`` method.
        data_points: Array with one sample per row. When omitted, the
            module-level feature frame ``X`` is converted with
            ``X.to_numpy()`` at call time, so the current ``X`` is used
            rather than whatever ``X`` was when this function was defined.

    Returns:
        Tuple ``(average, max, min)`` of the per-sample latency in
        milliseconds.

    Raises:
        ValueError: If ``data_points`` contains no samples.
    """
    if data_points is None:
        data_points = X.to_numpy()
    if len(data_points) == 0:
        raise ValueError("data_points must contain at least one sample")

    inference_times = []
    for data_point in data_points:
        sample = data_point.reshape(1, -1)
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        model.predict(sample)
        elapsed = time.perf_counter() - start_time
        inference_times.append(elapsed * 1000)  # seconds -> milliseconds

    return (
        np.mean(inference_times),
        np.max(inference_times),
        np.min(inference_times),
    )
# R^2 (coefficient of determination) expressed with Keras backend ops
def r2(y_true, y_pred):
    """Return ``1 - SS_res / (SS_tot + epsilon)`` for the given tensors."""
    residual_ss = K.sum(K.square(y_true - y_pred))
    deviation = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(deviation))
    # K.epsilon() keeps the division finite when y_true has no variance.
    return 1 - residual_ss / (total_ss + K.epsilon())


In [None]:
class Distiller(keras.Model):
    """Train a student model on ground truth plus a teacher's predictions.

    The teacher is only run in inference mode; gradients are computed and
    applied for the student's trainable variables only.
    """

    def __init__(self, student, teacher):
        """Store the student (trained) and teacher (queried) models."""
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions. Only used
                when the models emit more than one output per sample.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def _distillation_loss(self, teacher_predictions, student_predictions):
        """Return the loss between teacher and student predictions.

        For single-output (regression) models the raw predictions are
        compared directly. For multi-output models the temperature-softened
        softmax distributions are compared and scaled by T^2.
        """
        if student_predictions.shape[-1] == 1:
            # A softmax over a single value is always 1.0, so softening a
            # scalar regression output would make this loss identically zero
            # and the teacher would contribute no gradient at all.
            return self.distillation_loss_fn(
                teacher_predictions, student_predictions
            )

        # Scaled distillation loss from https://arxiv.org/abs/1503.02531
        # The magnitudes of the gradients produced by the soft targets scale
        # as 1/T^2, multiply them by T^2 when using both hard and soft targets.
        return (
            self.distillation_loss_fn(
                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf.nn.softmax(student_predictions / self.temperature, axis=1),
            )
            * self.temperature**2
        )

    def train_step(self, data):
        """Run one optimisation step on the student.

        Returns:
            Dict with the compiled metrics plus ``student_loss`` and
            ``distillation_loss`` for the current batch.
        """
        # Unpack data
        x, y = data

        # Forward pass of teacher (outside the tape: no teacher gradients)
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = self._distillation_loss(
                teacher_predictions, student_predictions
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        """Evaluate the student on one batch.

        Returns:
            Dict with the compiled metrics plus ``student_loss``.
        """
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [None]:
def _dense_stack(widths, input_shape, name):
    """Build a Sequential MLP ending in a single linear output unit.

    The first layer has no activation and carries the input shape; every
    following width gets a ReLU layer.
    """
    stack = [Dense(widths[0], kernel_initializer='normal', input_shape=input_shape)]
    stack += [
        Dense(width, kernel_initializer='normal', activation='relu')
        for width in widths[1:]
    ]
    stack.append(Dense(1, kernel_initializer='normal', activation='linear'))
    return keras.Sequential(stack, name=name)


# Teacher: the wider and deeper network.
teacher = _dense_stack(
    [32, 64, 128, 256, 256, 256, 256, 256, 256, 256, 128, 64],
    x_train.shape[1:],
    "teacher",
)

# Student: the smaller network that will be distilled.
student = _dense_stack(
    [16, 32, 64, 128, 128, 128, 64, 32],
    x_train.shape[1:],
    "student",
)

# Same architecture as the student, kept for the train-from-scratch baseline.
student_scratch = keras.models.clone_model(student)

In [None]:
# The import was commented out while the callback below was still being
# constructed, which raised NameError on a fresh kernel.
from keras.callbacks import EarlyStopping

# Early-stopping callback.
# NOTE(review): it is not passed to `fit` below, so training always runs for
# the full number of epochs; add `callbacks=[early_stopping]` to enable it.
early_stopping = EarlyStopping(monitor='val_loss',  # Monitor validation loss
                               patience=25,  # Number of epochs with no improvement after which training will be stopped
                               verbose=1,    # To display log messages
                               restore_best_weights=True)  # Restore model weights from the epoch with the best value of the monitored quantity.
teacher.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.MeanSquaredError(),
    metrics=[
        keras.metrics.RootMeanSquaredError(name="rmse"),
        keras.metrics.MeanAbsoluteError(name="mae"),
    ]
)

# Train and evaluate teacher on data.
# `validation_split` was dropped: Keras ignores it whenever `validation_data`
# is supplied, so it only suggested a split that never happened.
start_time = time.time()
teacher.fit(x_train, y_train, epochs=500, validation_data=(x_test, y_test), batch_size=32)
end_time = time.time()
training_time = end_time - start_time
inference_time_per_data_point = calc_inference_time(teacher)

# evaluate() returns [loss, rmse, mae] in compile order.
teacher_evaluate = teacher.evaluate(x_test, y_test)
# teacher.save('teacher_KD_2.keras')
rmse = teacher_evaluate[1]
mae = teacher_evaluate[2]
# Real R^2 on the test set instead of the 'r_2' placeholder string.
r_2 = r2_score(y_test, teacher.predict(x_test).ravel())
print(teacher_evaluate)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r_2}")
# Log the measured training time (previously the literal string
# 'training_time' was written to the results file).
addResults(rmse, mae, r_2, 'Teacher KD', training_time, inference_time_per_data_point)

get_results()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[0.087462417781353, 0.2957404553890228, 0.21915581822395325]
RMSE: 0.2957404553890228
MAE: 0.21915581822395325


Unnamed: 0.1,Unnamed: 0,type,rmse,mae,r2,training time,average inference time/data point,max inference time/data point,min inference time/data point
0,0,Teacher KD,0.335469,0.239037,0.9968141317367554,1464.4296045303345,0.0,0.0,0.0
1,0,Teacher KD,0.29574,0.219156,r_2,923.4169406890869,0.0,0.0,0.0
2,0,Student KD,0.351967,0.290439,r_2,803.0455522537231,0.0,0.0,0.0
3,0,Student KD,0.28358,0.210999,r_2,923.3434698581696,0.0,0.0,0.0
4,0,Student KD,0.28358,0.210999,r_2,923.3434698581696,81.021878,591.324329,48.841715
5,0,Teacher KD,0.29574,0.219156,r_2,training_time,79.508993,552.117586,48.779488


In [None]:
# Initialize and compile distiller for regression
distiller = Distiller(student=student, teacher=teacher)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    student_loss_fn=keras.losses.MeanSquaredError(),
    distillation_loss_fn=keras.losses.MeanSquaredError(), # MSE for distillation
    metrics=[
        keras.metrics.RootMeanSquaredError(name="rmse"),
        keras.metrics.MeanAbsoluteError(name="mae")
    ],
    alpha=0.05,
    temperature=20,  # You might need to adjust this for regression or even remove it
)

# Distill teacher to student
# `validation_split` was dropped: Keras ignores it whenever `validation_data`
# is supplied.
start_time = time.time()
distiller.fit(x_train, y_train, epochs=500, validation_data=(x_test, y_test), batch_size=32)
end_time = time.time()
training_time = end_time - start_time
inference_time_per_data_point = calc_inference_time(student)

# Evaluate student on test dataset.
# The distiller has no compiled loss, so evaluate() returns
# [rmse, mae, student_loss].
eval_results = distiller.evaluate(x_test, y_test)

rmse = eval_results[0]
mae = eval_results[1]
# Real R^2 on the test set instead of the 'r_2' placeholder string.
r_2 = r2_score(y_test, student.predict(x_test).ravel())
print(eval_results)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r_2}")
addResults(rmse, mae, r_2, 'Student KD', training_time, inference_time_per_data_point)

get_results()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[0.283579558134079, 0.2109985500574112, 0.09891972690820694]
RMSE: 0.283579558134079
MAE: 0.2109985500574112


Unnamed: 0.1,Unnamed: 0,type,rmse,mae,r2,training time,average inference time/data point,max inference time/data point,min inference time/data point
0,0,Teacher KD,0.335469,0.239037,0.9968141317367554,1464.429605,0.0,0.0,0.0
1,0,Teacher KD,0.29574,0.219156,r_2,923.416941,0.0,0.0,0.0
2,0,Student KD,0.351967,0.290439,r_2,803.045552,0.0,0.0,0.0
3,0,Student KD,0.28358,0.210999,r_2,923.34347,0.0,0.0,0.0
4,0,Student KD,0.28358,0.210999,r_2,923.34347,81.021878,591.324329,48.841715


In [None]:
# Baseline: the student architecture trained directly on the labels, with no
# teacher involved.
student_scratch.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.MeanAbsoluteError(),
    metrics=[
        keras.metrics.RootMeanSquaredError(name="rmse"),
        keras.metrics.MeanAbsoluteError(name="mae"),
        r2,
    ],
)

# Time the training run, then measure per-sample inference latency.
start_time = time.time()
student_scratch.fit(x_train, y_train, epochs=75)
end_time = time.time()
training_time = end_time - start_time
inference_time_per_data_point = calc_inference_time(student_scratch)

# evaluate() returns [loss, rmse, mae, r2] in compile order.
student_scratch_result = student_scratch.evaluate(x_test, y_test)
rmse, mae, r_2 = student_scratch_result[1:4]

print(student_scratch_result)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r_2}")
addResults(
    rmse,
    mae,
    r_2,
    'Student Scratch',
    training_time,
    inference_time_per_data_point,
)
get_results()

In [None]:
# student.save('student.keras')
# teacher.save('teacher.keras')
# Print the layer/parameter tables, teacher first.
for _net in (teacher, student):
    _net.summary()

Model: "teacher"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_19 (Dense)            (None, 32)                1536      
                                                                 
 dense_20 (Dense)            (None, 64)                2112      
                                                                 
 dense_21 (Dense)            (None, 128)               8320      
                                                                 
 dense_22 (Dense)            (None, 256)               33024     
                                                                 
 dense_23 (Dense)            (None, 256)               65792     
                                                                 
 dense_24 (Dense)            (None, 256)               65792     
                                                                 
 dense_25 (Dense)            (None, 256)               6579

In [None]:
# Replace the in-memory teacher with the model stored on disk.
teacher = keras.models.load_model("teacher_KD.keras")
# To re-check a reloaded model:
# loaded.compile(
#     optimizer=keras.optimizers.Adam(),
#     metrics=[
#         keras.metrics.RootMeanSquaredError(name="rmse"),
#         keras.metrics.MeanAbsoluteError(name="mae"),
#         r2
#     ],
# )
# loaded.evaluate(x_test, y_test)

#KD with FS top 10

In [None]:
# Names of the ten columns used as inputs for the feature-selected models.
# It must stay a list: it is used as a column selector (`x_train[df_sf]`).
df_sf = [
    'Modulation Format Level',
    'Mean Spans Lengths',
    'Max Spans Lengths',
    'Min Spans Lengths',
    'STD Spans Lengths',
    'Ligthpath Total Traffic Load',
    'Mean Links Lengths',
    'Max Links Lengths',
    'Lightpath Lengths',
    'Root Mean Square Link Powers',
]

# Reload the dataset and rebuild the same seeded 80/20 split.
df = pd.read_csv('DS_LPLevel_10100_Het_CFEGN_v3.csv')
target = 'Lightpath GSNR'
X = df.drop(columns=[target])
y = df[target]
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
class Distiller(keras.Model):
    """Train a student model on ground truth plus a teacher's predictions.

    The teacher is only run in inference mode; gradients are computed and
    applied for the student's trainable variables only.
    """

    def __init__(self, student, teacher):
        """Store the student (trained) and teacher (queried) models."""
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions. Only used
                when the models emit more than one output per sample.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def _distillation_loss(self, teacher_predictions, student_predictions):
        """Return the loss between teacher and student predictions.

        For single-output (regression) models the raw predictions are
        compared directly. For multi-output models the temperature-softened
        softmax distributions are compared and scaled by T^2.
        """
        if student_predictions.shape[-1] == 1:
            # A softmax over a single value is always 1.0, so softening a
            # scalar regression output would make this loss identically zero
            # and the teacher would contribute no gradient at all.
            return self.distillation_loss_fn(
                teacher_predictions, student_predictions
            )

        # Scaled distillation loss from https://arxiv.org/abs/1503.02531
        # The magnitudes of the gradients produced by the soft targets scale
        # as 1/T^2, multiply them by T^2 when using both hard and soft targets.
        return (
            self.distillation_loss_fn(
                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf.nn.softmax(student_predictions / self.temperature, axis=1),
            )
            * self.temperature**2
        )

    def train_step(self, data):
        """Run one optimisation step on the student.

        Returns:
            Dict with the compiled metrics plus ``student_loss`` and
            ``distillation_loss`` for the current batch.
        """
        # Unpack data
        x, y = data

        # Forward pass of teacher (outside the tape: no teacher gradients)
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = self._distillation_loss(
                teacher_predictions, student_predictions
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        """Evaluate the student on one batch.

        Returns:
            Dict with the compiled metrics plus ``student_loss``.
        """
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [None]:
def _dense_stack(widths, input_shape, name):
    """Build a Sequential MLP ending in a single linear output unit.

    The first layer has no activation and carries the input shape; every
    following width gets a ReLU layer.
    """
    stack = [Dense(widths[0], kernel_initializer='normal', input_shape=input_shape)]
    stack += [
        Dense(width, kernel_initializer='normal', activation='relu')
        for width in widths[1:]
    ]
    stack.append(Dense(1, kernel_initializer='normal', activation='linear'))
    return keras.Sequential(stack, name=name)


# Teacher fed only the selected feature columns.
teacher_fs = _dense_stack(
    [32, 64, 128, 256, 256, 256, 256, 256, 256, 256, 128, 64],
    x_train[df_sf].shape[1:],
    "teacher_fs",
)

# Student fed only the selected feature columns.
student_fs = _dense_stack(
    [16, 32, 64, 128, 128, 128, 64, 32],
    x_train[df_sf].shape[1:],
    "student_fs",
)

In [None]:
teacher_fs.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.MeanSquaredError(),
    metrics=[
        keras.metrics.RootMeanSquaredError(name="rmse"),
        keras.metrics.MeanAbsoluteError(name="mae"),
    ]
)

# Train and evaluate teacher on the selected feature columns.
# `validation_split` was dropped: Keras ignores it whenever `validation_data`
# is supplied.
start_time = time.time()
teacher_fs.fit(x_train[df_sf], y_train, epochs=500, validation_data=(x_test[df_sf], y_test), batch_size=32)
end_time = time.time()
training_time = end_time - start_time
inference_time_per_data_point = calc_inference_time(teacher_fs, data_points=X[df_sf].to_numpy())

# evaluate() returns [loss, rmse, mae] in compile order.
teacher_fs_evaluate = teacher_fs.evaluate(x_test[df_sf], y_test)

teacher_fs.save('teacher_fs_KD.keras')

rmse = teacher_fs_evaluate[1]
mae = teacher_fs_evaluate[2]
# Real R^2 on the test set instead of the 'r_2' placeholder string.
r_2 = r2_score(y_test, teacher_fs.predict(x_test[df_sf]).ravel())
print(teacher_fs_evaluate)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r_2}")
addResults(rmse, mae, r_2, 'Teacher AFTER FS KD', training_time, inference_time_per_data_point)

get_results()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[0.0839194655418396, 0.2896885573863983, 0.22047081589698792]
RMSE: 0.2896885573863983
MAE: 0.22047081589698792


Unnamed: 0.1,Unnamed: 0,type,rmse,mae,r2,training time,average inference time/data point,max inference time/data point,min inference time/data point
0,0,Teacher KD,0.335469,0.239037,0.9968141317367554,1464.4296045303345,0.0,0.0,0.0
1,0,Teacher KD,0.29574,0.219156,r_2,923.4169406890869,0.0,0.0,0.0
2,0,Student KD,0.351967,0.290439,r_2,803.0455522537231,0.0,0.0,0.0
3,0,Student KD,0.28358,0.210999,r_2,923.3434698581696,0.0,0.0,0.0
4,0,Student KD,0.28358,0.210999,r_2,923.3434698581696,81.021878,591.324329,48.841715
5,0,Teacher KD,0.29574,0.219156,r_2,training_time,79.508993,552.117586,48.779488
6,0,Teacher KD,0.336882,0.261984,r_2,983.6469848155975,0.0,0.0,0.0
7,0,Teacher KD,0.382865,0.294373,r_2,923.4851577281952,0.0,0.0,0.0
8,0,Student fs KD,0.274324,0.209381,r_2,909.4501760005951,0.0,0.0,0.0
9,0,Student FS KD,0.274324,0.209381,r_2,909.4501760005951,90.147741,23878.778219,49.730539


In [None]:

# Distiller for the feature-selected teacher/student pair.
distiller_fs = Distiller(student=student_fs, teacher=teacher_fs)
distiller_fs.compile(
    optimizer=keras.optimizers.Adam(),
    student_loss_fn=keras.losses.MeanSquaredError(),
    distillation_loss_fn=keras.losses.MeanSquaredError(), # MSE for distillation
    metrics=[
        keras.metrics.RootMeanSquaredError(name="rmse"),
        keras.metrics.MeanAbsoluteError(name="mae")
    ],
    alpha=0.05,
    temperature=20,  # You might need to adjust this for regression or even remove it
)

# Distill teacher to student
# `validation_split` was dropped: Keras ignores it whenever `validation_data`
# is supplied.
start_time = time.time()
distiller_fs.fit(x_train[df_sf], y_train, epochs=500, validation_data=(x_test[df_sf], y_test), batch_size=32)
end_time = time.time()
training_time = end_time - start_time
inference_time_per_data_point = calc_inference_time(student_fs, data_points=X[df_sf].to_numpy())

# Evaluate student on test dataset.
# The distiller has no compiled loss, so evaluate() returns
# [rmse, mae, student_loss].
eval_results_fs = distiller_fs.evaluate(x_test[df_sf], y_test)
student_fs.save('student_fs_KD.keras')

rmse = eval_results_fs[0]
mae = eval_results_fs[1]
# Real R^2 on the test set instead of the 'r_2' placeholder string.
r_2 = r2_score(y_test, student_fs.predict(x_test[df_sf]).ravel())
print(eval_results_fs)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r_2}")
# Single-space label so the row matches the existing 'Student FS KD' entries.
addResults(rmse, mae, r_2, 'Student FS KD', training_time, inference_time_per_data_point)

get_results()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[0.2761097550392151, 0.21083307266235352, 0.09392797201871872]
RMSE: 0.2761097550392151
MAE: 0.21083307266235352


Unnamed: 0.1,Unnamed: 0,type,rmse,mae,r2,training time,average inference time/data point,max inference time/data point,min inference time/data point
0,0,Teacher KD,0.335469,0.239037,0.9968141317367554,1464.4296045303345,0.0,0.0,0.0
1,0,Teacher KD,0.29574,0.219156,r_2,923.4169406890869,0.0,0.0,0.0
2,0,Student KD,0.351967,0.290439,r_2,803.0455522537231,0.0,0.0,0.0
3,0,Student KD,0.28358,0.210999,r_2,923.3434698581696,0.0,0.0,0.0
4,0,Student KD,0.28358,0.210999,r_2,923.3434698581696,81.021878,591.324329,48.841715
5,0,Teacher KD,0.29574,0.219156,r_2,training_time,79.508993,552.117586,48.779488
6,0,Teacher KD,0.336882,0.261984,r_2,983.6469848155975,0.0,0.0,0.0
7,0,Teacher KD,0.382865,0.294373,r_2,923.4851577281952,0.0,0.0,0.0
8,0,Student fs KD,0.274324,0.209381,r_2,909.4501760005951,0.0,0.0,0.0
9,0,Student FS KD,0.274324,0.209381,r_2,909.4501760005951,90.147741,23878.778219,49.730539
