#Put your Google Colab link here:
*your link here*

## Important notice: any use of generative AI for completing the assignment is strictly prohibited.

## Get access to a GPU:
To gain access to the GPUs on Colab, navigate to the `Runtime` tab above and select `Change runtime type`.

In [None]:
# use if working in colab
import torch
# Prefer the GPU when the runtime exposes one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print(device)


## Import packages

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

### Warning: to ensure the reproducibility of your results and to achieve the full grade, do not change or remove RANDOM_STATE variables and setting random seed statements. If you remove or change them, you may not get the full grade.

In [None]:
# Single seed shared by every randomized step in this notebook: the NumPy row
# sampling, the Random Forest models and the Python/NumPy/PyTorch global seeds.
random_state = 5

## Part 1: Data Loading and Preprocessing (3 points)

Objectives:
- Load and understand the dataset structure
- Implement data reduction strategy
- Prepare training/validation/test splits


### Part 1.1: Data Loading (1 point)
Load the dataset from Google Drive and check basic statistics

- HINT: Use the provided dataloader function

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
drive.mount('/content/drive')

# you should be added as viewer to shared Google drive "ECE477 datasets"
#  at https://drive.google.com/drive/u/0/folders/0ABIZHKB-QPnRUk9PVA


# the dataloader can load the original size data, and the reduced size data
def dataloader(file_path):
  """Load the six dataset splits stored in an ``.npz`` archive.

  Args:
    file_path: Path of an archive holding the arrays ``X_train``, ``y_train``,
      ``X_validation``, ``y_validation``, ``X_test`` and ``y_test``.

  Returns:
    The tuple ``(X_train, X_validation, X_test, y_train, y_validation,
    y_test)`` -- all feature arrays come before the label arrays.

  Raises:
    KeyError: If one of the six arrays is missing from the archive.
  """
  keys = ('X_train', 'X_validation', 'X_test', 'y_train', 'y_validation', 'y_test')
  # np.load returns a lazy NpzFile that keeps the archive open; use it as a
  # context manager and read every array before the file is closed.
  with np.load(file_path) as data:
    return tuple(data[key] for key in keys)

file_path = '/content/drive/Shared drives/ECE477 datasets/Assignment3/Covertype_original_data.npz'
# Load the dataset
# YOUR CODE HERE


# Report the shape of every split, one per line, in the order: train features,
# train labels, validation features, validation labels, test features, test labels.
print(
  *(split.shape for split in (X_train, y_train, X_validation, y_validation, X_test, y_test)),
  sep='\n',
)


### Part 1.2: Data Reduction (2 points)
Implement data reduction based on compression ratio

In [None]:
# function to reduce the dataset size for our experiments
# We apply the dataset reduction to the training and validation set
# We randomly pick data points from the training and validation sets
def data_reduction(X_train, X_validation, y_train, y_validation, size_train, size_validation):
  """Template for shrinking the training and validation sets by random row selection.

  As written, only the random row indices are drawn; applying them to the
  arrays is left to the ``YOUR CODE HERE`` sections, so the inputs are
  currently returned unchanged.

  Args:
    X_train, y_train: Training features and labels.
    X_validation, y_validation: Validation features and labels.
    size_train: Number of training row indices to draw.
    size_validation: Number of validation row indices to draw.

  Returns:
    The tuple ``(X_train, X_validation, y_train, y_validation)``.
  """

  print('Data size reduction method')

  # selecting random rows from the training set
  # The generator is seeded with the module-level random_state, so the same
  # indices are drawn on every call. randint draws each index independently,
  # so `rows` may contain repeated indices.
  R = np.random.RandomState(random_state)
  rows = R.randint(X_train.shape[0], size=size_train)

  # YOUR CODE HERE

  print(f"X_train.shape: {X_train.shape}")
  print(f"y_train.shape: {y_train.shape}")

  # selecting random rows from the validation set
  # Drawn from the same generator R, i.e. continuing its random stream.
  rows = R.randint(X_validation.shape[0], size=size_validation)

  # YOUR CODE HERE

  return X_train, X_validation, y_train, y_validation

In [None]:
# We can use this part to generate reduced size datasets for various data compression ratios
# setting the data compression ratio
data_compression_ratio = 20

# computing the size of the training and validation set based on the data compression ratio
# int() truncates, so each reduced split keeps floor(N / data_compression_ratio) rows.
size_train = int(X_train.shape[0]/data_compression_ratio)
size_validation = int(X_validation.shape[0]/data_compression_ratio)

# Reduce the dataset size with the data_compression_ratio set above
# The np.savez call below expects this step to define X_train_reduced,
# y_train_reduced, X_validation_reduced and y_validation_reduced.
# YOUR CODE HERE

# Saving the reduced size data - we can name the data to include the data compression ratio
# Do not save to shared drive; save to your drive
# NOTE(review): np.savez appends ".npz" when the path does not already end with
# it, so reduced_file_path should end in ".npz"; the same variable is later
# passed to dataloader(), which opens exactly that path.
reduced_file_path = # YOUR CODE HERE
np.savez(reduced_file_path,
    X_train=X_train_reduced,
    y_train=y_train_reduced,
    X_validation=X_validation_reduced,
    y_validation=y_validation_reduced,
    X_test=X_test, # The test set for the reduced size data is the same as the original data
    y_test=y_test
    )
print("reduced size data successfully saved.")

In [None]:
# load reduced data
# This rebinds X_train, y_train, X_validation and y_validation to the reduced
# arrays read back from reduced_file_path; from here on those names no longer
# refer to the full-size splits. X_test and y_test were saved unchanged.
X_train, X_validation, X_test, y_train, y_validation, y_test = dataloader(reduced_file_path)
print ("Loading complete !")

## Part 2: Synthetic Data Generation (8 points)
Objectives:
- Implement Kernel Density Estimation (KDE) for synthetic data generation
- Validate synthetic data using the semantic integrity classifier
- Label synthetic data with Random Forest

### Part 2.1: KDE Implementation (4 points)
Complete the KDE sampling function

In [None]:
from sklearn.neighbors import KernelDensity

def KDE_sample_generation (X_train, X_validation):

  # search over different bandwidth, find the best one
  bw_list = [0.5, 0.7, 1, 1.5, 2, 3]

  log_like = np.zeros((len(bw_list)))
  for i, bw in enumerate(bw_list):
    print(f"bw = {bw}")
    # create KernelDensity model with: kernel='gaussian', bandwidth=bw
    # fit using training data
    # Compute the total log-likelihood of validation data under the model (check avalible functions in sklearn.neighbors.KernelDensity), and save in log_like
    # YOUR CODE HERE

  bbw = bw_list[np.argmax(log_like)]
  print (f"Best Bandwidth: {bbw}")

  # create model with Best Bandwidth, fit
  # YOUR CODE HERE

  # sample 20000 samples from the model
  X_syn = # YOUR CODE HERE
  print(f"X_syn.shape: {X_syn.shape}")

  for i in range(X_syn.shape[0]):
    wilderness_areas_sample = []
    max_value = float("-inf")
    max_index = 0
    for j in range(10, 14):
      wilderness_areas_sample.append(X_syn[i,j])
      max_index = max (max_index, j)

    for j in range(10, 14):
      if j == max_index:
        X_syn[i,j] = 1
      else:
        X_syn [i,j] = 0

  return X_syn

### Part 2.2: Semantic integrity classifier
Define the semantic integrity classifier. Note that you don't need to run this function at the end.

In [None]:
def softmax(vector):
  """Return the softmax of ``vector``, normalized along its first axis.

  Args:
    vector: Array-like of real numbers (typically 1-D).

  Returns:
    A float array of the same shape whose entries along axis 0 are
    non-negative and sum to 1. An empty input yields an empty float array.
  """
  values = np.asarray(vector, dtype=float)
  if values.size == 0:
    return values
  # Subtracting the maximum does not change the result mathematically, but it
  # keeps np.exp from overflowing to inf (which would turn the output into nan).
  e = np.exp(values - np.max(values, axis=0))
  return e / np.sum(e, axis=0)

# The semantic integrity classifier predicts the value of one categorical feature from the continuous features of the data
# In this case, we had one categorical feature
# The categorical feature is one-hot encoded
def semantic_integrity_classifier(X_syn, X_train, X_validation):
  """Keep only the synthetic rows whose categorical value is plausible.

  A Random Forest is trained to predict the one-hot encoded categorical
  feature (columns 10-13) from the continuous features (columns 0-9 and 14).
  A synthetic row is kept when the category encoded in its own columns 10-13
  (the column holding the largest value) equals the category the forest
  predicts from that row's continuous features.

  Args:
    X_syn: Synthetic feature rows. Rows that are kept have their columns
      10-13 overwritten in place with a clean one-hot vector.
    X_train: Real training features.
    X_validation: Real validation features.

  Returns:
    A float array holding the kept rows of ``X_syn``; it has zero rows when
    no synthetic row passes the check.
  """
  # the one-hot encoding of the categorical feature is in columns 10-11-12-13;
  # with several categorical features this would be repeated per feature
  cat_start, cat_stop = 10, 14

  def category_of(X):
    # The category is the position of the largest entry of the one-hot block.
    # (A softmax is strictly increasing, so it would not change the arg-max.)
    # Labels are kept as floats.
    return np.argmax(X[:, cat_start:cat_stop], axis=1).astype(float)

  X_train_total = np.concatenate((X_train, X_validation))

  # indices of the continuous columns for this dataset: 0-9 and 14
  indices = np.concatenate((np.arange(0, 10), np.arange(14, 15)))
  print(indices)

  # value of the categorical feature in train+validation, train, and validation
  label = category_of(X_train_total)
  label_train = category_of(X_train)
  label_validation = category_of(X_validation)

  # Semantic integrity classifier, seeded with the notebook-wide random_state
  # like the other Random Forests in this file so that runs are reproducible.
  clf = RandomForestClassifier(random_state=random_state)

  # train and validation accuracy of the semantic integrity classifier
  clf.fit(X_train[:, indices], label_train)
  y_pred_train = clf.predict(X_train[:, indices])
  train_acc = accuracy_score(label_train, y_pred_train)
  print('Train accuracy: {})'.format(train_acc))

  y_pred_validation = clf.predict(X_validation[:, indices])
  val_acc = accuracy_score(label_validation, y_pred_validation)
  print('Val accuracy: {})'.format(val_acc))

  # Using all the data to train the final classifier
  clf.fit(X_train_total[:, indices], label)
  print("Shape of X_syn: ", X_syn.shape)

  if X_syn.shape[0] == 0:
    # Nothing to check; predict() rejects an input with zero samples.
    X_syn_final = np.zeros((0, X_syn.shape[1]))
    print ("Shape of final X_syn: ", X_syn_final.shape)
    return X_syn_final

  # Predict the categorical value of every synthetic row from its continuous
  # features in one batched call, and compare with the value the row encodes.
  predicted = clf.predict(X_syn[:, indices])
  claimed = np.argmax(X_syn[:, cat_start:cat_stop], axis=1)
  kept_rows = np.flatnonzero(predicted == claimed)

  # Semantically consistent rows get a clean one-hot categorical block ...
  X_syn[kept_rows, cat_start:cat_stop] = 0
  X_syn[kept_rows, cat_start + claimed[kept_rows]] = 1

  # ... and only those rows are returned; mismatching rows are disregarded.
  X_syn_final = np.asarray(X_syn[kept_rows], dtype=float)
  print ("Shape of final X_syn: ", X_syn_final.shape)

  return X_syn_final

#### Use above functions to generate synthetic data (1 point)

In [None]:
# generating the synthetic data using KDE. This may take several minutes, please be patient
# YOUR CODE HERE

In [None]:
# Use the semantic integrity classifier to verify the semantic integrity of the categorical feature values.
# You don't need to run this cell, because it takes too long. We will provide the synthetic data for later use.
# YOUR CODE HERE

### Part 2.3: Synthetic data labeling (4 points)
- Implement Random Forest for synthetic data labeling
- Note that we will load the synthetic data from ECE477 datasets, since the final step of synthetic data generation, semantic_integrity_classifier(), takes too long to run on Colab

#### Search for best max_depth (2 points)

In [None]:
# load synthetic data
# Pre-generated synthetic feature matrix from the shared drive; X_syn is bound
# to this file's contents regardless of what earlier cells produced.
X_syn = np.load('/content/drive/Shared drives/ECE477 datasets/Assignment3/X_syn_original_1.npy')

In [None]:
# Using the Random forest model to label the synthetic data
# One accuracy entry is expected per tried depth, appended in loop order, so
# entry k of each list corresponds to max_depth = k + 1.
validation_accs = []
train_accs = []

# go over different max_depth of Random forest
# range(1, 25) tries max_depth = 1, 2, ..., 24
for i in range (1, 25):
  print(f"training Random forest max_depth={i}")
  # Every forest uses the same seed, so the runs differ only in max_depth.
  clf = RandomForestClassifier(n_estimators=350, max_depth=i, criterion='gini', random_state=random_state)

  # fit the model, and save training accuracy in train_accs, validation accuracy in validation_accs
  # YOUR CODE HERE

In [None]:
# Plot accuracy against the actual max_depth values. Entry k of train_accs and
# validation_accs holds the result for max_depth = k + 1, so the x axis starts
# at 1 instead of at the list index 0.
plt.title('Training and validation accuracy vs. Tree depth')
plt.plot(range(1, len(train_accs) + 1), train_accs, label='Training accuracy')
plt.plot(range(1, len(validation_accs) + 1), validation_accs, label='Validation accuracy')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# best_tree_id is a 0-based index into train_accs / validation_accs.
best_tree_id = # YOUR CODE HERE
# The tried depths start at 1, so list index k corresponds to max_depth k + 1.
best_tree_depth = best_tree_id + 1
print(f"Baseline Random Forest depth={best_tree_depth}")
print(f"train acc={train_accs[best_tree_id]}")
print(f"validation acc={validation_accs[best_tree_id]}")

#### Which max_depth is the best? Explain the reason. (1 point)
Your answer here

#### Fit a Random Forest with the best max_depth, and use it to label synthetic data (1 point)

In [None]:
# after finding the best depth, we will retrain a random forest with both training and validation data

# Stack training rows first, then validation rows, for features and labels alike.
X_train_total = np.concatenate((X_train, X_validation), axis=0)
y_train_total = np.concatenate((y_train, y_validation), axis=0)

# Same hyper-parameters and seed as in the depth search, with the selected depth.
clf = RandomForestClassifier(n_estimators=350, max_depth=best_tree_depth, criterion='gini', random_state=random_state)
# fit the model with total data, and report training accuracy
# The print below expects this step to define train_acc.
# YOUR CODE HERE
print(f"train acc={train_acc}")

# label the synthetic data with current model
y_syn = # YOUR CODE HERE
print(f"y_syn.shape: {y_syn.shape}")



## Part 3: Model Training (9 points)
Objective:
- Build and train a neural network with Scheme A

Note that training may take 3-10 minutes to finish. You could train for fewer epochs when debugging

### Part 3.1: Neural Network Architecture (2 points)
Complete the DynamicNet class definition

In [None]:
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

# DNN model definition
class DynamicNet(torch.nn.Module):
  """Fully connected network with two hidden layers.

  The layers are D_in -> H_1 -> H_2 -> D_out. The forward pass is a template
  to be completed.
  """

  def __init__(self, D_in, H_1=200, H_2=100, D_out=7):
    """Create the three linear layers.

    Args:
      D_in: Number of input features.
      H_1: Width of the first hidden layer.
      H_2: Width of the second hidden layer.
      D_out: Number of outputs (one per class).
    """
    super(DynamicNet, self).__init__()
    self.input_linear = nn.Linear(D_in, H_1)
    self.middle_linear = nn.Linear(H_1, H_2)
    self.output_linear = nn.Linear(H_2, D_out)

  def forward(self, x):
    # Forward pass (to be completed): map a batch of inputs x to the D_out
    # outputs. Documented with comments only, so that the unfinished body is
    # not mistaken for a complete one.
    # Add F.leaky_relu as the activation function after self.input_linear and self.middle_linear.
    # input --> input_linear, activation, middle_linear, activation, output_linear --> output
    # YOUR CODE HERE


### Part 3.2: Baseline model training (3 points)
Train a DNN with reduced original data.

In [None]:
# Set a fixed random seed for reproducibility
import random
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
# `device` is a torch.device, which does not compare equal to the plain string
# 'cuda'; test its `.type` so the GPU-specific settings are actually applied.
if torch.device(device).type == 'cuda':
  torch.cuda.manual_seed(random_state)
  torch.cuda.manual_seed_all(random_state)
  # Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
  # otherwise pick different algorithms from run to run.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

# training the baseline DNN model with reduced original data
# this may take several minutes
print ("Baseline NN")

# Full training set as tensors: float32 features, and labels flattened to 1-D
# and cast to int64 (LongTensor).
# NOTE(review): Variable is a legacy wrapper; recent PyTorch versions accept
# plain tensors directly -- confirm against the installed version.
x = Variable(torch.from_numpy(X_train).type(torch.FloatTensor))
y = Variable(torch.from_numpy(y_train.reshape(-1)).type(torch.LongTensor))

# put x and y to device
# YOUR CODE HERE

# initialize model only with the input dimension (match the data)
# put model to device
# The optimizer and the evaluation code below expect this step to define `model`.
# YOUR CODE HERE

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2 )

# x and y hold the whole training set, so each iteration is one full-batch epoch.
for t in range(1500):
  # train for 1500 epoch
  # YOUR CODE HERE


# Test accuracy of `model`, computed with one batched forward pass over the
# whole test set instead of one forward pass (and device transfer) per row.
total = X_test.shape[0]
# Labels as a flat integer vector, whatever the stored shape or dtype.
y_true = np.asarray(y_test).reshape(-1).astype(np.int64)
with torch.no_grad():
  x_t = torch.from_numpy(X_test).type(torch.FloatTensor).to(device)
  output = model(x_t)
  # Predicted class = index of the largest output in each row.
  y_pred = np.argmax(output.cpu().numpy(), axis=1)
correct = int((y_pred == y_true).sum())
print ('Test Accuracy:', 100* correct/total)

### Part 3.3: Learning Scheme A (3 points)
Implement the two-phase training process: Scheme A uses the synthetic data to pretrain the network architecture. Pretraining is followed by use of the real training dataset to fine-tune.

In [None]:
# Set a fixed random seed for reproducibility
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
# `device` is a torch.device, which does not compare equal to the plain string
# 'cuda'; test its `.type` so the GPU-specific settings are actually applied.
if torch.device(device).type == 'cuda':
  torch.cuda.manual_seed(random_state)
  torch.cuda.manual_seed_all(random_state)
  # Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
  # otherwise pick different algorithms from run to run.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

# This method uses the synthetic data for DNN pretraining, and uses the real data for final training
print ("Schema A")

# Loading the synthetic data
# Stage-1 inputs: synthetic features as float32, synthetic labels flattened to
# 1-D and cast to int64 (LongTensor).
x= Variable(torch.from_numpy(X_syn).type(torch.FloatTensor))
y = Variable(torch.from_numpy(y_syn.reshape(-1)).type(torch.LongTensor))

# put data to device
# YOUR CODE HERE

# initialize model only with the input dimension (match the data)
# put model to device
# The optimizer and the evaluation code below expect this step to define `model`.
# YOUR CODE HERE

criterion = nn.CrossEntropyLoss()
# The optimizer is created once here; unless it is re-created below, the same
# instance is used for both training stages.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2 )

# stage 1: pretraining with the synthetic data
for t in range(500):
  # train for 500 epoch
  # YOUR CODE HERE


# Stage-2 inputs: real training rows followed by the real validation rows.
# NOTE(review): the baseline cell builds its inputs from X_train only, so this
# stage sees more real data than the baseline -- keep that in mind when
# comparing the two test accuracies.
X_all = np.concatenate((X_train, X_validation))
y_all = np.concatenate((y_train, y_validation))

# x and y are rebound to the real data; the synthetic tensors are no longer used.
x= Variable(torch.from_numpy(X_all).type(torch.FloatTensor))
y = Variable(torch.from_numpy(y_all.reshape(-1)).type(torch.LongTensor))

# put data to device
# YOUR CODE HERE

# stage 2: final training with real data
for t in range(1500):
  # train for 1500 epoch
  # YOUR CODE HERE


# Test accuracy of `model`, computed with one batched forward pass over the
# whole test set instead of one forward pass (and device transfer) per row.
total = X_test.shape[0]
# Labels as a flat integer vector, whatever the stored shape or dtype.
y_true = np.asarray(y_test).reshape(-1).astype(np.int64)
with torch.no_grad():
  x_t = torch.from_numpy(X_test).type(torch.FloatTensor).to(device)
  output = model(x_t)
  # Predicted class = index of the largest output in each row.
  y_pred = np.argmax(output.cpu().numpy(), axis=1)
correct = int((y_pred == y_true).sum())
print ('Test Accuracy:', 100* correct/total)


## Compare Scheme A with the baseline and comment on the performance difference. (1 point)
Your answer here