In [29]:
import numpy as np 

from sklearn.datasets import load_iris
from sklearn.datasets import load_wine


seed_value = 42   ## An arbitrary seed value

In [30]:
wine = load_wine()        ## To make experiments on other datasetsi alter here !!!!

print('Wine dataset shape:', wine.data.shape)
print("wine target shape:", wine.target.shape)
print('Unique classes: ',np.unique(wine.target))  ##These unique classes stands for cultivars which the wine is obtained from.
print("Instances per class:", np.bincount(wine.target))


Wine dataset shape: (178, 13)
wine target shape: (178,)
Unique classes:  [0 1 2]
Instances per class: [59 71 48]


In [31]:
from sklearn.preprocessing import StandardScaler
## Extracting features and labels

X = wine.data
y = wine.target

#  scaler = StandardScaler()
#  X = scaler.fit_transform(X)

## After scaling, the error rates went even lower than those in the original paper for some datasets.


print("X:  ", X)
print("y: " , y)

X:   [[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]
y:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [32]:
def select_5_percent_samples(X, y, seed):

    np.random.seed(seed)  ## Seed has been fixed to 42 in the beginning of the code.

    # Determine the number of samples to select from each class (5%)
    num_samples_per_class = {label: int(np.ceil(0.05 * np.sum(y == label))) for label in np.unique(y)}

    ## For every unique class, I choose the (ceiling_%5) amount of instances. 
    ## I tried using floor function but the instances per class came out to be too low to get meaningful results.

    # Initialize arrays to store the indices of labeled and unlabeled data
    labeled_indices = np.array([], dtype=int)
    unlabeled_indices = np.array([], dtype=int)

    # Select 5% from each class as labeled data
    for label in np.unique(y):
        indices = np.where(y == label)[0] 
        
## This will give me the indices of the instances which belong to the class 'label', which is a number from 0 to 2.
        np.random.shuffle(indices)
        label_indices = indices[:num_samples_per_class[label]]
        unlabeled_indices = np.concatenate((unlabeled_indices, indices[num_samples_per_class[label]:]))
        labeled_indices = np.concatenate((labeled_indices, label_indices))

    # Split the data into labeled and unlabeled sets
    X_labeled = X[labeled_indices]
    y_labeled = y[labeled_indices]
    
    X_unlabeled = X[unlabeled_indices]
    y_unlabeled = y[unlabeled_indices]
    
    return X_labeled, y_labeled, X_unlabeled, y_unlabeled

#### comments on   select_5_percent_samples(X, y, seed)
    The function above selects 5% of the samples from each class as labeled data and the remaining as unlabeled data. 
    (As suggested in the paper for building the experiments)
    This is because we are working on semi-supervised area, we need to convert our dataset accordingly
    
    Parameters:

    - X: numpy array, input features.
    - y: numpy array, input labels.
    - seed: int, seed value for reproducibility.
    
    Returns:
    
    - X_labeled: numpy array, labeled data.
    - y_labeled: numpy array, labels for the labeled data.
    - X_unlabeled: numpy array, unlabeled data.
    - y_unlabeled: numpy array, labels for the unlabeled data.
    

In [33]:
from collections import Counter
from scipy.spatial.distance import cdist
from sklearn.metrics import accuracy_score



def euclidean_distance_classifier(X_labeled, y_labeled, X_unlabeled, y_unlabeled, seed):
    distances = cdist(X_unlabeled, X_labeled, metric='euclidean')
    
## = the Euc. dist between each instance in the unlabeled data and each instance in the labeled data.
    
    k = 1 # Assign labels based on majority vote of k-nearest neighbors (k=1)
    y_pred_unlabeled_euc = []

    for dist_row in distances:
        nearest_indices = np.argsort(dist_row)[:k]
        nearest_labels = y_labeled[nearest_indices]
        majority_label = Counter(nearest_labels).most_common(1)[0][0]
        y_pred_unlabeled_euc.append(majority_label)

    error_rate_euc = 1 - accuracy_score(y_unlabeled, y_pred_unlabeled_euc)
    ## Error rate is 1 - accuracy score of predicted unlabeled data.
        
    return error_rate_euc

#### comments of euclidean_distance_classifier :

    For every unlabeled instance, it finds the nearest labeled instance and assigns the label of the nearest labeled instance to that.

     This process comes from the assumption of "what is closest to you must be similar to you".
  
     After the process, it computes the error rate of the Euclidean distance classifier on unlabeled data.
    
    Parameters:
    - X_labeled: numpy array, labeled data.
    - y_labeled: numpy array, labels for the labeled data.
    - X_unlabeled: numpy array, unlabeled data.
    - y_unlabeled: numpy array, labels for the unlabeled data.
    - seed: int, seed value for reproducibility.
    
    Returns:
    - error_rate_euc: float, error rate of the Euclidean distance classifier.

In [34]:
from metric_learn import LMNN 

def apply_lmnn(X_labeled, y_labeled, K=3, max_iter=1000):
    
    ## After max_iter=1000, the error rate was very similar to max_iter=10000, so I chose 1000 to save time.
    lmnn = LMNN(k=K, max_iter=max_iter)
    lmnn.fit(X_labeled, y_labeled)

    return lmnn


#### comments on apply_lmnn : 

    This function creates an instance lmnn from the LMNN class which is imported from the metric_learn library.
    The fit method of the LMNN class trains the model using labeled datas.

    To summerize, this applies the Large Margin Nearest Neighbor (LMNN) algorithm to learn a distance metric.
    
    Parameters:
    - X_labeled: numpy array, labeled data.
    - y_labeled: numpy array, labels for the labeled data.
    - K: int, number of nearest neighbors to consider.
    - max_iter: int, maximum number of iterations for convergence.
        -set to 1000 since it was in a good spot between time efficiency and convergence to limit value. 
    
    Returns:
    - lmnn: an LMNN object, trained LMNN model.



In [35]:
from sklearn.neighbors import KNeighborsClassifier

def evaluate_lmnn(lmnn, X_labeled, y_labeled, X_unlabeled, y_unlabeled):

    # Transform labeled data using the learned metric
    X_labeled_transformed = lmnn.transform(X_labeled)
    
    # Train a k-nearest neighbors (k=1) classifier on transformed labeled data
    knn_classifier = KNeighborsClassifier(n_neighbors=1)
    knn_classifier.fit(X_labeled_transformed, y_labeled)

    # Transform unlabeled data using the learned metric
    X_unlabeled_transformed = lmnn.transform(X_unlabeled)
    
    # Predict labels for transformed unlabeled data
    y_pred_unlabeled = knn_classifier.predict(X_unlabeled_transformed)
    
    # Calculate error rate on unlabeled data
    unlabeled_error_rate = 1 - accuracy_score(y_unlabeled, y_pred_unlabeled)
    
    return unlabeled_error_rate 

    Evaluate the error rate of LMNN on labeled and unlabeled data.
    

    The LMNN algorithm learns a transformation of the feature space that brings instances of the same class closer together while pushing instances of different classes fUrther apart. This applies the transformation to the labeled data, making it more suitable for classification with KNN.

    Parameters:
    - lmnn: LMNN object, trained LMNN model.
    - X_labeled: numpy array, labeled data.
    - y_labeled: numpy array, labels for the labeled data.
    - X_unlabeled: numpy array, unlabeled data.
    - y_unlabeled: numpy array, labels for the unlabeled data.
    
    Returns:
    - unlabeled_error_rate: float, error rate of LMNN on unlabeled data.

In [36]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

## Metric_learn library used to print a future warning every single time for each experiment ...
## ... so I had to add this for clearer looking output.


* ##  Two compared methods' experiments are done. 

* ###  Now lets put S3ML (Semi Supervised Sparse Metric Learning) algorithm into code!



##         S3ML:   (p.7)
     Input: 
     Three sets X, S, D, an integer k, 
     four real-valued parameters 0 < α < 1, θ > 0, β > 0 and ρ > 0, 
     and input metric matrix M0.

     Output: The sparse metric matrix M

## Step 1: k-NN Search
#### comments on construct_neighborhood_indicator_matrix:
    
    Construct the neighborhood indicator matrix P for labeled samples.
    
    Parameters:
    - X: numpy array, input features.
    
    Returns:
    - P: numpy array, neighborhood indicator matrix.

     the Output, P matrix, it looks like a transition probability matrix which is familiar to us from Markov chains.
     Sum of all individual rows are always equal to 1. 
    


In [37]:
from sklearn.neighbors import NearestNeighbors

k = 6 ## As sugested in the paper, page 7

def construct_neighborhood_indicator_matrix(X):

    # Initialize the nearest neighbors model and fit it to the labeled data
    nn = NearestNeighbors(algorithm='auto', metric="euclidean").fit(X)

    distances, indices = nn.kneighbors(X) ## For this matrix, we are not interested in distances, only indices.

    # Initialize the neighborhood indicator matrix P with zeros
    P = np.zeros((len(X), len(X)))

    # Fill in the neighborhood indicator matrix P for the labeled samples
    for i in range(len(X)):
        P[i, indices[i]] = 1 / k
    
    return P


#### For demonstration:

In [38]:
# Select 5% of the samples from each class as labeled
X_labeled, y_labeled, X_unlabeled, y_unlabeled = select_5_percent_samples(X, y, seed_value)

# Construct neighborhood indicator matrix P for the labeled data
P = construct_neighborhood_indicator_matrix(X)

print("P: ", P)  # Show the constructed matrix P for verification

P:  [[0.16666667 0.         0.         ... 0.         0.         0.        ]
 [0.         0.16666667 0.         ... 0.         0.         0.        ]
 [0.         0.         0.16666667 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.16666667 0.16666667 0.        ]
 [0.         0.         0.         ... 0.16666667 0.16666667 0.        ]
 [0.         0.         0.         ... 0.         0.         0.16666667]]


# Step 2: Affinity Propogation 

#### comments on affinity_propogation:  
     Interestingly, the proposed S3ML algorithm can also work under supervised settings when simply setting W = W0 
     so S3ML is appropriate for various metric learning problems. (paper, p.6)


In [39]:
alpha = 0.5  ## Damping factor
teta = 0.01  ## Treshold value to determine the strong affinities.
## These two paramerer values are also mentioned in the paper, p.7 


def affinity_propagation(P, y_labeled, alpha, teta):
    # Initialize the affinity matrix W_0

    n_samples = P.shape[0]
    W_0 = np.zeros((n_samples, n_samples)) ## np.eye(n_samples) ## da olabilir. Tekrar iyi bak.
    
    ## Initially creates a matix full of zeros with the respective dimensions
    
    labeled_indices = np.where(y_labeled != -1)[0]

    # Set affinities for similar and dissimilar pairs
    for i, idx_i in enumerate(labeled_indices):
        for j, idx_j in enumerate(labeled_indices):
            if y_labeled[i] == y_labeled[j]: ## Initially we mark similar points as if in 
                W_0[idx_i, idx_j] = 1  ## Similar pairs, as written in the paper
            else:
                W_0[idx_i, idx_j] = -1  ## Dissimilar pairs
    
    ## W0 has been created using %5 labeled indices. 
    ## Now we need to propagate the affinities through the neighborhood structure.

    # Propagate affinities through the neighborhood structure
    W = np.zeros_like(W_0) ##Creates an empty matrix with the same dimensions as W_0.


    for _ in range(n_samples):
        W = (1 - alpha) * W_0 + alpha * np.dot(P, W)
        
        # Apply the threshold to determine strong affinities
        W[np.abs(W) < teta] = 0
    
    return W

#### comments on def affinity_propagation:

    Perform affinity propagation based on the neighborhood structure and labeled data.
    
    Parameters:
    - P: numpy array, neighborhood indicator matrix.
    - y_labeled: numpy array, labels for the labeled samples.
    - alpha: float, damping factor for affinity propagation.
    - threshold: float, threshold to determine strong affinities.
    
    Returns:
    - W: numpy array, the final affinity matrix after propagation.


In [40]:
def step_2(W,X):

    W1 = np.sum(W, axis=1)  # This computes the sum of each row (axis=1 for rows)
    D = np.diag(W1)  # This creates a diagonal matrix D from the vector W1

    # Compute the graph Laplacian L
    L = D - W

    # Compute the matrix T
    T = X.T @ L @ X  ##  @ stands for matrix multiplication in numpy

    return T


## Now set, Σ = np.linalg.inv(M0) + beta * T  

#### For demonstration of W and D

In [41]:
# Use the function to perform affinity propagation

alpha = 0.5  # Example value for the damping factor
threshold = 0.01  # Example threshold value for strong affinities

W = affinity_propagation(P, y_labeled, alpha, teta)
T = step_2(W,X) ## It is basically an (n * n) square matrix where n = #features of X


## Verify the affinity matrix

print("W: ",W)  
print("T: ", T)

print("P shape: ", P.shape)
print("W shape: ", W.shape)
print("T shape: ", T.shape)



W:  [[0.55961542 0.55961542 0.55961542 ... 0.         0.         0.        ]
 [0.42764579 0.42764579 0.42764579 ... 0.         0.         0.        ]
 [0.56818182 0.56818182 0.56818182 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
T:  [[-9.54568428e+00 -6.08554032e+00  1.12907058e+01 -9.22562531e+01
   6.81648480e+02 -1.03504556e+00  7.21936637e+00  2.76313960e-01
   1.21207111e+01 -2.04612111e+01 -4.03095201e+00  4.08978234e+01
  -5.11712340e+03]
 [ 2.21622812e+00 -8.23761702e+00  4.16420269e-01 -2.79527470e+01
   8.79313623e+01  4.52387227e-01  1.45524621e+00 -1.51501332e-01
   9.39567509e-01  2.95012153e+00 -2.66628867e-01  5.92544342e+00
  -4.69749019e+02]
 [ 5.28707754e-01 -2.95295538e+00  1.51490911e-01 -2.98445964e+01
   8.17996735e+01 -9.18279040e-01  9.85603175e-0

In [42]:
def h_sigma(M, rho, sigma):
    # Apply Nesterov's smoothing technique
    ## to smooth the l1 term
    U_star = np.minimum(rho, np.maximum(M / sigma, -rho))
    
    return np.trace(U_star.T @ M) - (sigma / 2) * np.linalg.norm(U_star, 'fro')**2

* #### Comments on h_σ :
    The smoothed h function using Nesterov's technique.
    
    Parameters:
    - M: The matrix variable of the optimization problem.
    - rho: The sparsity parameter.
    - sigma: The smoothness parameter.
    
    Returns:
    - The value of the smoothed h function.

In [43]:
def grad_h_sigma(M, rho, sigma):

    return np.minimum(rho, np.maximum(M / sigma, -rho))

* #### Comments on grad_hσ :
    Gradient of the smoothed h function.
    
    Parameters:
    - M: The matrix variable of the optimization problem.
    - rho: The sparsity parameter.
    - sigma: The smoothness parameter.
    
    Returns:
    - The gradient of the smoothed h function.

In [44]:

def f(M, Sigma):
    fM = -np.log(np.linalg.det(M)) + np.trace(Sigma @ M)

    return fM

def grad_f(M,Σ):
    grad_fM = -np.linalg.inv(M) + Σ

    return grad_fM

def Σ(X,y_labeled, alpha, beta):
    M0 = np.eye(X.shape[1])  # Initialize M0 as the identity matrix
    P = construct_neighborhood_indicator_matrix(X)
    W = affinity_propagation(P, y_labeled, alpha, teta)
    T = step_2(W)
    Σ = np.linalg.inv(M0) + beta * T  

    return Σ


    Since we defined the neccessary functions, we can move forward to Step 3

## Step 3: ALM

In [45]:
def ALM(M_init, Y_init, mu, rho, sigma, Σ, max_iter=500):

    M = M_init
    Y = Y_init
    
    for _ in range(max_iter):

        # Step 1: Update M
        grad_h_Y = grad_h_sigma(Y, rho, sigma)
        M_next = M - mu * (grad_h_Y + grad_f(M,Σ))
        
        # Step 2: Update Y
        grad_f_M_next = grad_f(M_next,Σ)
        Y_next = Y - mu * (grad_f_M_next + grad_h_sigma(M_next, rho, sigma))
        
        
        if np.linalg.norm(M_next - M, 'fro') < 1e-6:
            break
        
        # Prepare for next iteration
        M = M_next
        Y = Y_next
    
    return M
    

### For Iris, tuning parameter should be rho, beta = 100,   800
### For wine ;                           rho, beta = 10000, 2 
###### For wine ; rho, beta = 100000,11 === %10.7 (initial)

## Tuning parameters below (rho and beta )

In [46]:

rho = 10000 ## Tuning (smoothness parameter)
beta = 2   ## Tuning parameter


In [47]:
mu = 0.000001  # Update step size
sigma = 0.000001   # Smoothness parameter


def run_s3ml(X_labeled, y_labeled, X_unlabeled, y_unlabeled, M0, Y0, mu, rho, sigma, beta):

    # Construct neighborhood indicator matrix P for the labeled data
    P = construct_neighborhood_indicator_matrix(X)
    # Apply affinity propagation to obtain the affinity matrix W
    W = affinity_propagation(P, y_labeled, alpha, teta)
    # Compute matrix T
    T = step_2(W,X)

    # Set Σ = np.linalg.inv(M0) + beta * T  
    Σ = np.linalg.inv(M0) + beta * T

    # Run ALM to obtain the sparse metric matrix M
    M = ALM( M0, Y0, mu, rho, sigma, Σ)
    
    knn = KNeighborsClassifier(n_neighbors=1, metric="mahalanobis" , metric_params={'VI': np.linalg.inv(M)})
    knn.fit(X_labeled, y_labeled) ## Model training phase with the help of labeled data points.

    # Predict labels for unlabeled data
    y_pred = knn.predict(X_unlabeled)
    # Calculate accuracy score and error rate
    accuracy = accuracy_score(y_unlabeled, y_pred)
    error_rate = 1 - accuracy
    print("S3ML Error rate: ", error_rate)

    return error_rate



In [48]:
def experiment_LMNN(experiment_number, LMNN_avg_error_rate, seed_value):

    for _ in range(experiment_number):
        X_labeled, y_labeled, X_unlabeled, y_unlabeled = select_5_percent_samples(X, y, seed_value)


        # Train LMNN on labeled data
        lmnn_model = apply_lmnn(X_labeled, y_labeled)

        # Evaluate LMNN on both labeled and unlabeled data
        unlabeled_error_rate = evaluate_lmnn(lmnn_model, X_labeled, y_labeled, X_unlabeled, y_unlabeled)

        LMNN_avg_error_rate += unlabeled_error_rate
        seed_value += 1 
        
        print("LMNN Error rate (trial ", _+1 , "):",  unlabeled_error_rate)
        ## In every iteration we need to increase seed by one to maintain both randomness and stability.
    print("Average error rate of LMNN method after " , experiment_number ,"tests: ", LMNN_avg_error_rate/experiment_number)


#### comments on experiment_LMNN    
    Perform experiments to evaluate the error rates of LMNN and Euclidean distance classifiers.
    
    Parameters:
    - experiment_number: int, number of experiments to perform.
    - LMNN_avg_error_rate: float, average error rate of LMNN.
    - EU_avg_error_rate: float, average error rate of Euclidean distance classifier.


In [49]:
def experiment_EU(experiment_number, EU_avg_error_rate, seed_value):

    for _ in range(experiment_number):
        X_labeled, y_labeled, X_unlabeled, y_unlabeled = select_5_percent_samples(X, y, seed_value)
        error_rate_euc = euclidean_distance_classifier(X_labeled, y_labeled, X_unlabeled, y_unlabeled,seed_value)
        EU_avg_error_rate += error_rate_euc
        seed_value += 1
        print("EU Error rate (trial", _+1 , "):", error_rate_euc)


    print("Average error rate of EU method after " , experiment_number ,"tests:   ", EU_avg_error_rate/experiment_number)


In [50]:
experiment_number =50 ## In the paper, 50 experiments are made from every algorithm.

X_labeled, y_labeled, X_unlabeled, y_unlabeled = select_5_percent_samples(X, y, seed_value)
print("Experiment is going to occur with those valuees accordingly.")
print(" X labeled: ", X_labeled.shape,"\n", "y labeled: ", y_labeled.shape,"\n",
       "X unlabeled: ", X_unlabeled.shape, "\n", "y unlaneled: ",y_unlabeled.shape, "\n")


print("EXPERIMENT RESULTS:")
experiment_LMNN(experiment_number, 0, seed_value) 


Experiment is going to occur with those valuees accordingly.
 X labeled:  (10, 13) 
 y labeled:  (10,) 
 X unlabeled:  (168, 13) 
 y unlaneled:  (168,) 

EXPERIMENT RESULTS:


LMNN Error rate (trial  1 ): 0.13095238095238093
LMNN Error rate (trial  2 ): 0.26190476190476186
LMNN Error rate (trial  3 ): 0.10119047619047616
LMNN Error rate (trial  4 ): 0.20833333333333337
LMNN Error rate (trial  5 ): 0.24404761904761907
LMNN Error rate (trial  6 ): 0.1964285714285714
LMNN Error rate (trial  7 ): 0.19047619047619047
LMNN Error rate (trial  8 ): 0.10119047619047616
LMNN Error rate (trial  9 ): 0.1964285714285714
LMNN Error rate (trial  10 ): 0.32738095238095233
LMNN Error rate (trial  11 ): 0.11309523809523814
LMNN Error rate (trial  12 ): 0.0535714285714286
LMNN Error rate (trial  13 ): 0.20833333333333337
LMNN Error rate (trial  14 ): 0.32738095238095233
LMNN Error rate (trial  15 ): 0.22619047619047616
LMNN Error rate (trial  16 ): 0.18452380952380953
LMNN Error rate (trial  17 ): 0.24404761904761907
LMNN Error rate (trial  18 ): 0.2678571428571429
LMNN Error rate (trial  19 ): 0.23809523809523814
LMNN Error rate (trial  20 ): 0.20238095238095233
LMNN Error ra

In [51]:
experiment_EU(experiment_number, 0, seed_value + experiment_number) 
## I added the # experiments made so far to seed value to maintain randomness.


## The results are very close to the results in the paper.
## The last two steps takes 30 secs on average (for Wine dataset)
##      ""         ""       2 secs on average (for Iris dataset)


EU Error rate (trial 1 ): 0.3035714285714286
EU Error rate (trial 2 ): 0.3392857142857143
EU Error rate (trial 3 ): 0.3214285714285714
EU Error rate (trial 4 ): 0.27380952380952384
EU Error rate (trial 5 ): 0.32738095238095233
EU Error rate (trial 6 ): 0.2857142857142857
EU Error rate (trial 7 ): 0.34523809523809523
EU Error rate (trial 8 ): 0.33333333333333337
EU Error rate (trial 9 ): 0.38095238095238093
EU Error rate (trial 10 ): 0.3035714285714286
EU Error rate (trial 11 ): 0.38095238095238093
EU Error rate (trial 12 ): 0.31547619047619047
EU Error rate (trial 13 ): 0.2678571428571429
EU Error rate (trial 14 ): 0.38690476190476186
EU Error rate (trial 15 ): 0.2678571428571429
EU Error rate (trial 16 ): 0.34523809523809523
EU Error rate (trial 17 ): 0.35119047619047616
EU Error rate (trial 18 ): 0.35119047619047616
EU Error rate (trial 19 ): 0.29166666666666663
EU Error rate (trial 20 ): 0.3035714285714286
EU Error rate (trial 21 ): 0.29166666666666663
EU Error rate (trial 22 ): 0.3

In [52]:
def experiment_S3ML(experiment_number,  seed_value,mu, rho, sigma, beta, avg_S3ML_error_rate):
    for _ in range(experiment_number):
        X_labeled, y_labeled, X_unlabeled, y_unlabeled = select_5_percent_samples(X, y, seed_value)
        M0 = np.identity(X.shape[1])
        Y0 = np.zeros_like(M0)

        avg_S3ML_error_rate += run_s3ml(X_labeled, y_labeled, X_unlabeled, y_unlabeled, M0, Y0, mu, rho, sigma, beta)
        seed_value += 1
        
    avg_S3ML_error_rate = avg_S3ML_error_rate/experiment_number

    print("Average error rate of S3ML after ", experiment_number ,"tests: ", avg_S3ML_error_rate)


experiment_S3ML(50,  seed_value, mu, rho, sigma, beta,0)
## 7 seconds on Wine and 0.5 seconds on Iris.


S3ML Error rate:  0.07738095238095233
S3ML Error rate:  0.13095238095238093
S3ML Error rate:  0.1071428571428571
S3ML Error rate:  0.1071428571428571
S3ML Error rate:  0.24404761904761907
S3ML Error rate:  0.11309523809523814
S3ML Error rate:  0.1607142857142857
S3ML Error rate:  0.07738095238095233
S3ML Error rate:  0.1785714285714286
S3ML Error rate:  0.15476190476190477
S3ML Error rate:  0.11904761904761907
S3ML Error rate:  0.0714285714285714
S3ML Error rate:  0.13690476190476186
S3ML Error rate:  0.1071428571428571
S3ML Error rate:  0.11904761904761907
S3ML Error rate:  0.125
S3ML Error rate:  0.13095238095238093
S3ML Error rate:  0.17261904761904767
S3ML Error rate:  0.1785714285714286
S3ML Error rate:  0.1964285714285714
S3ML Error rate:  0.1785714285714286
S3ML Error rate:  0.11309523809523814
S3ML Error rate:  0.24404761904761907
S3ML Error rate:  0.125
S3ML Error rate:  0.1428571428571429
S3ML Error rate:  0.20238095238095233
S3ML Error rate:  0.11904761904761907
S3ML Error r

####    The algorithm below is for searching optimal tuning parameters (rho and beta). 
####    I limited the domain to [1, 1000] for simplicity, the real domain is 1 to 1 million and step size multiplier is x1.2
####    (That's why the error rate is higher than our findings)


In [55]:
###     Grid search (extra):

def search_optimal_parameters(X, y, seed_value, mu, sigma, start_value=1, max_value=1000, experiment_number=3):
    best_rho = start_value
    best_beta = start_value
    best_error_rate = float('inf')

    rho = start_value
    beta = start_value

    while rho <= max_value or beta <= max_value:
        avg_error_rate = 0
        for _ in range(experiment_number):
            X_labeled, y_labeled, X_unlabeled, y_unlabeled = select_5_percent_samples(X, y, seed_value)
            M0 = np.identity(X.shape[1])
            Y0 = np.zeros_like(M0)

            error_rate = run_s3ml(X_labeled, y_labeled, X_unlabeled, y_unlabeled, M0, Y0, mu, rho, sigma, beta)
            seed_value += 1
            avg_error_rate += error_rate

        avg_error_rate /= experiment_number

        print(f"rho: {rho}, beta: {beta}, avg error rate: {avg_error_rate}")

        if avg_error_rate < best_error_rate:
            best_error_rate = avg_error_rate
            best_rho = rho
            best_beta = beta

        if rho <= max_value:
            rho *= 2
        elif beta <= max_value:
            beta *= 2
            rho = 1  # Reset rho to 1 when beta increases

        if rho > max_value and beta > max_value:
            break

    return best_rho, best_beta, best_error_rate

# Initial parameters
mu = 1e-6
sigma = 1e-6
seed_value = 42


best_rho, best_beta, best_error_rate = search_optimal_parameters(X, y, seed_value, mu, sigma)
print(f"Best rho: {best_rho}, Best beta: {best_beta}, Best error rate: {best_error_rate}")



S3ML Error rate:  0.27380952380952384
S3ML Error rate:  0.3571428571428571
S3ML Error rate:  0.25595238095238093
rho: 1, beta: 1, avg error rate: 0.29563492063492064
S3ML Error rate:  0.27380952380952384
S3ML Error rate:  0.3392857142857143
S3ML Error rate:  0.20238095238095233
rho: 2, beta: 1, avg error rate: 0.2718253968253968
S3ML Error rate:  0.1964285714285714
S3ML Error rate:  0.30952380952380953
S3ML Error rate:  0.375
rho: 4, beta: 1, avg error rate: 0.29365079365079366
S3ML Error rate:  0.36904761904761907
S3ML Error rate:  0.14880952380952384
S3ML Error rate:  0.2857142857142857
rho: 8, beta: 1, avg error rate: 0.26785714285714285
S3ML Error rate:  0.22023809523809523
S3ML Error rate:  0.29166666666666663
S3ML Error rate:  0.27380952380952384
rho: 16, beta: 1, avg error rate: 0.2619047619047619
S3ML Error rate:  0.22619047619047616
S3ML Error rate:  0.25595238095238093
S3ML Error rate:  0.2857142857142857
rho: 32, beta: 1, avg error rate: 0.25595238095238093
S3ML Error rate: 