In [3]:
import matplotlib.pyplot as plt
import numpy as np
import os

def load_gmm_dataset(csv_path):
    """Load dataset for Gaussian Mixture Model.

    The first line of the file is treated as a header row. Every column whose
    header starts with 'x' is loaded as a feature, and every column whose
    header is exactly 'z' is loaded as a label.

    Args:
         csv_path: Path to CSV file containing dataset.

    Returns:
        x: NumPy array shape (n_examples, dim)
        z: NumPy array shape (n_examples, 1)

    NOTE: You do not need to edit this function.
    """

    # Load headers
    with open(csv_path, 'r') as csv_fh:
        headers = csv_fh.readline().strip().split(',')

    # Load features and labels
    x_cols = [i for i, header in enumerate(headers) if header.startswith('x')]
    z_cols = [i for i, header in enumerate(headers) if header == 'z']

    # ndmin=2 stops loadtxt from squeezing singleton axes, so the documented
    # 2-D shapes also hold for a file with a single data row or a single
    # feature column (previously these came back as 1-D / 0-D arrays).
    x = np.loadtxt(csv_path, delimiter=',', skiprows=1, usecols=x_cols,
                   dtype=float, ndmin=2)
    z = np.loadtxt(csv_path, delimiter=',', skiprows=1, usecols=z_cols,
                   dtype=float, ndmin=2)

    return x, z


In [None]:
# Read the training set from train.csv in the current working directory
train_path = os.path.join('.', 'train.csv')
x_all, z_all = load_gmm_dataset(csv_path=train_path)

# Sanity check: there must be exactly one label row per example
assert len(z_all) == len(x_all)


In [26]:
# Create randomised permutation index.
# A seeded generator makes the shuffle, and every fold derived from it,
# identical after a kernel restart; change the seed to draw a different split.
# Shuffling data and labels with one shared index follows:
# https://stackoverflow.com/questions/43229034/randomly-shuffle-data-and-labels-from-different-files-in-the-same-order
rng = np.random.default_rng(0)
randomised_idx = rng.permutation(len(x_all))
print(randomised_idx)

[619  88 675 668 661 184 407 285 791 109 485 476 270 647 899 326 153 404
 900 226 155 695 777 568 237  93  69 677 985 157 784 673 870 552 635 766
 389 903 443 709 163 704 398 686 788 559 479 995 730 782 530 514 606 396
 543 974 142 722 620 483 360 331 221  42 971 509 776 804 691   0 633 461
 906 195 139 355 747 423 283 409 667 526 961 569   6 223 537 718 923 138
 325 371 507 452 215 956 499 848 374 228 651 282 315 150  66 300 456 276
 889 981 115 239 598 755 734 124 357 795 884 317 397 203 654 590 663 587
 291  22 290 107 588 292 786 666 649 301 105 850 799 180 591 126 690 487
 574 330 943 436  55 749 272 498 963 992 696 350 287 934 932 553 339 865
 173  64 827 122 980 477 348 478  40   1 736 135 286 429 522 334 201 197
 425 265 462 599 815 299 596 681 855 603  14 345 523 586 131 646 753 393
 230 813 196 390  18 136 836 849 492 162 368 674 108  48 931 756 954 513
 281 854 908 146  24  34 296 207 232 822 939 807 742  81 441 313 565 445
 168 448 821 469 615 244 205 342 891 496 557  49 22

In [31]:
# Shuffle: index features and labels with the same permutation so that
# each row of x stays aligned with its row of z
x = x_all[randomised_idx]
z = z_all[randomised_idx]

# Split: cut each shuffled array into four consecutive chunks along axis 0
x1, x2, x3, x4 = np.array_split(x, indices_or_sections=4)
z1, z2, z3, z4 = np.array_split(z, indices_or_sections=4)


In [None]:
# Process labelled data
# NOTE(review): x_tilde, z_tilde and K are not defined in any cell visible
# here — confirm they are created before this cell runs. The four-way
# unpacking below only succeeds when K == 4.
randomised_idx = np.random.permutation(len(x_tilde))
# Apply the same permutation to both arrays so rows stay paired
x_tilde, z_tilde = x_tilde[randomised_idx], z_tilde[randomised_idx]
# Second fold is named x_tilde2 (was the typo `xx_tilde`) to pair with z_tilde2
x_tilde1, x_tilde2, x_tilde3, x_tilde4 = np.array_split(x_tilde, K)
z_tilde1, z_tilde2, z_tilde3, z_tilde4 = np.array_split(z_tilde, K)

# Process unlabelled data
randomised_idx = np.random.permutation(len(x))
x = x[randomised_idx]
x1, x2, x3, x4 = np.array_split(x, K)

# NOTE(review): np.mean over a list of two arrays has to stack them first,
# which needs equal shapes; x_tilde (whole labelled set) and x1 (one
# unlabelled fold) presumably differ in length — confirm what data1 is
# meant to hold before relying on it.
data1 = np.mean([x_tilde, x1])