In [1]:
import warnings
# Silence every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this blanket filter presumably also hides scikit-learn warnings
# (e.g. convergence or undefined-metric warnings) raised by later cells — consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Model Selection and Evaluation

## Use test set to evaluate the algorithm performance
Set aside a portion of the initially labeled data as test data to evaluate the performance of two algorithms.

We can apply the standard 80-20 train-test split. But since the initial labeled dataset has only 40 data points, we can also apply k-fold cross-validation.

I will apply a modified version of k-fold: split the initial labeled dataset into k folds, keep one fold as the validation set, and use the remaining folds together with all the unlabeled data as the training set.

In [2]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.semi_supervised import LabelPropagation
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# Read the sensor dataset; rows with a missing 'Label' form the unlabeled pool
df = pd.read_csv("../data/data_sensors.csv")

# Partition the rows by whether a label is present
has_label = df['Label'].notna()
labeled_data = df[has_label]
unlabeled_data = df[~has_label]

# Split the labeled rows into a target vector and a feature table
labeled_labels = labeled_data['Label']
labeled_features = labeled_data.drop(columns=['Label'])

# Feature table for the unlabeled rows
unlabeled_features = unlabeled_data.drop(columns=['Label'])

# Min-max scale the features: the scaler learns its range from the unlabeled
# rows only and is then applied unchanged to the labeled rows
scaler = MinMaxScaler()
scaler.fit(unlabeled_features)
unlabeled_features_scaled = scaler.transform(unlabeled_features)
labeled_features_scaled = scaler.transform(labeled_features)

# 5-fold cross-validation splitter, shuffled with a fixed seed
kf = KFold(shuffle=True, random_state=42, n_splits=5)

### Label Propagation with RBF Kernel

In [3]:
f1_scores = []

# Every unlabeled sample is tagged -1 for label propagation; the vector is the
# same for all folds, so build it once up front
unlabeled_targets = np.full(len(unlabeled_features_scaled), -1)

# Cross-validate over the labeled samples only
for train_index, val_index in kf.split(labeled_features_scaled):
    # Labeled samples used for fitting in this fold
    X_train_labeled = labeled_features_scaled[train_index]
    y_train_labeled = labeled_labels.iloc[train_index]

    # Held-out labeled samples used for scoring
    X_val = labeled_features_scaled[val_index]
    y_val = labeled_labels.iloc[val_index]

    # Training set = this fold's labeled samples followed by all unlabeled samples
    X_train = np.concatenate([X_train_labeled, unlabeled_features_scaled], axis=0)
    y_train = np.concatenate([y_train_labeled, unlabeled_targets])

    # Fit label propagation with an RBF kernel
    label_prop_rbf = LabelPropagation(kernel='rbf', gamma=20)
    label_prop_rbf.fit(X_train, y_train)

    # Score the held-out predictions with macro-averaged F1
    y_pred = label_prop_rbf.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='macro')
    f1_scores.append(f1)

# Report the mean F1 over all folds
average_f1 = np.mean(f1_scores)
print(f"Average F1 Score across {kf.n_splits} folds: {average_f1:.2f}")

Average F1 Score across 5 folds: 0.40


### Label Propagation with KNN Kernel

In [4]:
f1_scores = []

# Every unlabeled sample is tagged -1 for label propagation; the vector is the
# same for all folds, so build it once up front
unlabeled_targets = np.full(len(unlabeled_features_scaled), -1)

# Cross-validate over the labeled samples only
for train_index, val_index in kf.split(labeled_features_scaled):
    # Labeled samples used for fitting in this fold
    X_train_labeled = labeled_features_scaled[train_index]
    y_train_labeled = labeled_labels.iloc[train_index]

    # Held-out labeled samples used for scoring
    X_val = labeled_features_scaled[val_index]
    y_val = labeled_labels.iloc[val_index]

    # Training set = this fold's labeled samples followed by all unlabeled samples
    X_train = np.concatenate([X_train_labeled, unlabeled_features_scaled], axis=0)
    y_train = np.concatenate([y_train_labeled, unlabeled_targets])

    # Fit label propagation with a k-nearest-neighbors kernel
    label_prop_knn = LabelPropagation(kernel='knn', n_neighbors=7)
    label_prop_knn.fit(X_train, y_train)

    # Score the held-out predictions with macro-averaged F1
    y_pred = label_prop_knn.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='macro')
    f1_scores.append(f1)

# Report the mean F1 over all folds
average_f1 = np.mean(f1_scores)
print(f"Average F1 Score across {kf.n_splits} folds: {average_f1:.2f}")

Average F1 Score across 5 folds: 0.16


### Nearest Centroid

In [5]:
f1_scores = []

# Cross-validate over the labeled samples only (the unlabeled data is not used here)
for train_index, val_index in kf.split(labeled_features_scaled):
    # Labeled samples used for fitting in this fold
    X_train = labeled_features_scaled[train_index]
    y_train = labeled_labels.iloc[train_index]

    # Held-out labeled samples used for scoring
    X_val = labeled_features_scaled[val_index]
    y_val = labeled_labels.iloc[val_index]

    # Fit a nearest-centroid classifier on this fold's training samples
    nearest_centroid = NearestCentroid()
    nearest_centroid.fit(X_train, y_train)

    # Score the held-out predictions with macro-averaged F1
    y_pred = nearest_centroid.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='macro')
    f1_scores.append(f1)

# Report the mean F1 over all folds
average_f1 = np.mean(f1_scores)
print(f"Average F1 Score across {kf.n_splits} folds: {average_f1:.2f}")

Average F1 Score across 5 folds: 0.34


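From the above results, we can see that Label Propagation with the RBF kernel performs best (average macro F1 of 0.40, versus 0.34 for Nearest Centroid and 0.16 for Label Propagation with the KNN kernel).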