In [None]:
# Fetch the project repository (git creates it as a sub-directory of the current working directory).
! git clone https://github.com/igemiracle/02620_final_project.git

In [None]:
# Move the working directory up one level.
# NOTE(review): later cells read relative paths such as clean_data_pca/... and
# clean_data_kmeans/...; confirm they resolve from this directory.
%cd ..

## SVM

We first run a grid search to find the best hyperparameters for the SVM models, then use the best setting to predict and evaluate the results.

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np

# List of datasets for processing
datasets = ['baron', 'cellBench', 'zheng']
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['auto', 'scale'],
    'kernel': ['rbf', 'linear']
}

# Score every candidate on both metrics inside the inner 5-fold CV so that the
# reported F1 is a held-out estimate, just like the accuracy it is printed
# next to. Model selection (refit) is still driven by accuracy only.
scoring = {'accuracy': 'accuracy', 'f1_macro': 'f1_macro'}

for dataset in datasets:
    print(f"Processing {dataset} dataset")

    # Track the best model across folds together with its cross-validated
    # accuracy and macro F1. Start from -inf so the first fold always wins and
    # best_model can never remain None.
    best_model = None
    highest_valid_acc = float('-inf')
    best_f1_score = 0

    for fold in range(1, 11):
        # Load the training split of this fold (no separate validation file is
        # read; validation happens inside GridSearchCV).
        train_features = pd.read_csv(f"clean_data_pca/{dataset}/fold_{fold}/train_features.csv")
        train_labels = pd.read_csv(f"clean_data_pca/{dataset}/fold_{fold}/train_labels.csv")
        y_train = train_labels.values.ravel()

        # Grid search with cross-validation; refit='accuracy' keeps best_score_,
        # best_params_ and best_estimator_ defined under multi-metric scoring.
        grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring=scoring, refit='accuracy', n_jobs=-1)
        grid_search.fit(train_features, y_train)

        # Mean held-out macro F1 of the selected parameter combination.
        # (Previously this was computed on the training data itself, which
        # overstates performance.)
        f1 = grid_search.cv_results_['mean_test_f1_macro'][grid_search.best_index_]

        # Keep the fold whose selected model has the highest validation accuracy
        if grid_search.best_score_ > highest_valid_acc:
            highest_valid_acc = grid_search.best_score_
            best_model = grid_search.best_estimator_
            best_f1_score = f1

        print(f"Fold {fold}: Best params = {grid_search.best_params_}, Best validation accuracy = {grid_search.best_score_}, F1-Score = {f1}")

    print(f"Best model parameters for {dataset} dataset: {best_model.get_params()}")
    print(f"Highest validation accuracy: {highest_valid_acc}, Best F1-score: {best_f1_score}")


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# List of datasets for processing
datasets = ['baron', 'cellBench', 'zheng']
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['auto', 'scale'],
    'kernel': ['rbf', 'linear']
}

# Number of folds read per dataset (the loop reads fold_1 ... fold_<n_folds>)
n_folds = 10

# Initialize lists to store the average errors for plotting
average_train_errors = []
average_test_errors = []

for dataset in datasets:
    print(f"Processing {dataset} dataset")

    # Store the cumulative error over the folds to calculate the average later
    cumulative_train_error = 0
    cumulative_test_error = 0

    for fold in range(1, n_folds + 1):
        # Load the training split of this fold
        train_features = pd.read_csv(f"clean_data_pca/{dataset}/fold_{fold}/train_features.csv")
        train_labels = pd.read_csv(f"clean_data_pca/{dataset}/fold_{fold}/train_labels.csv").values.ravel()

        # Grid search with cross-validation
        grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(train_features, train_labels)

        # Training error: the refit best model scored on the very data it was
        # fitted on. (This quantity was previously reported as the test error.)
        best_model = grid_search.best_estimator_
        train_error = 1 - accuracy_score(train_labels, best_model.predict(train_features))
        cumulative_train_error += train_error

        # Held-out error: mean accuracy over the inner CV validation folds for
        # the selected parameters. No separate test file is loaded in this
        # cell, so this is the estimate on unseen data. (This quantity was
        # previously reported as the training error.)
        test_error = 1 - grid_search.best_score_
        cumulative_test_error += test_error

    # Calculate the average error across all folds
    average_train_error = cumulative_train_error / n_folds
    average_test_error = cumulative_test_error / n_folds

    # Append the average errors for the current dataset to the list
    average_train_errors.append(average_train_error)
    average_test_errors.append(average_test_error)

    # Print the average errors for the current dataset
    print(f"{dataset} - Average training error: {average_train_error}")
    print(f"{dataset} - Average test error: {average_test_error}")

# Plotting the average training and test errors for each dataset
x = np.arange(len(datasets))
width = 0.35

fig, ax = plt.subplots()
ax.bar(x - width/2, average_train_errors, width, label='Training Error')
ax.bar(x + width/2, average_test_errors, width, label='Test Error')

ax.set_xlabel('Datasets')
ax.set_ylabel('Error')
ax.set_title('Average Training and Test Errors for SVM')
ax.set_xticks(x)
ax.set_xticklabels(datasets)
ax.legend()

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hard-coded error rates for SVM and XGBoost, one entry per dataset
# (same order as the `datasets` list below).
svm_train_errors = [0.08979592802470161, 0.0, 0.09768055555555553]
svm_test_errors = [0.07345808879184441, 0.0, 0.07024999999999999]
xgb_train_errors = [0.0311086797957695, 0.0, 0.062671875]
xgb_test_errors = [0.07221006564551424, 0.0029583657419410825, 0.1434375]

# Tick labels for the x axis
datasets = ['Baron', 'CellBench', 'Zheng']

# Geometry of the grouped bars: four bars of equal width per dataset
bar_width = 0.2
index = np.arange(len(datasets))

# One colour per bar series, in plotting order
colors = ['#e76f51', '#f4a261', '#e9c46a', '#2a9d8f']  # Adjust the colors to match the image

plt.figure(figsize=(6, 4))

# Draw the four series side by side; series k is shifted right by k bar widths
bar_series = [
    (svm_train_errors, 'SVM Train Error'),
    (svm_test_errors, 'SVM Test Error'),
    (xgb_train_errors, 'XGBoost Train Error'),
    (xgb_test_errors, 'XGBoost Test Error'),
]
for slot, (errors, legend_name) in enumerate(bar_series):
    plt.bar(index + bar_width * slot, errors, bar_width, label=legend_name, color=colors[slot])

# Axis labels, title, and ticks centred under each group of four bars
plt.xlabel('Datasets')
plt.ylabel('Error')
plt.title('Training and Test Errors for SVM and XGBoost')
plt.xticks(index + bar_width * 1.5, datasets)
plt.legend()

# Render the figure
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import KFold
import os
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
import random

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# Feature/label CSV pairs, one entry per dataset
datasets_info = [
    {"features": "imm_adata_fselected_107.csv", "labels": "imm_Labels.csv"},
    {"features": "lc_adata_fselected_31.csv", "labels": "lc_Labels.csv"},
    {"features": "pan_adata_fselected_8.csv", "labels": "pan_Labels.csv"}
]

for dataset in datasets_info:
    print(f"Processing dataset with features {dataset['features']}")

    # Read the feature table and the label table
    features = pd.read_csv(f"{dataset['features']}")
    labels = pd.read_csv(f"{dataset['labels']}")

    # Map the values of the first label column to integer class codes
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels.iloc[:, 0])

    # Hold out 30% of the rows for evaluation (fixed seed for a repeatable split)
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels_encoded,
        test_size=0.3,
        random_state=42,
    )

    # Configure the multi-class XGBoost classifier
    xgb_settings = dict(
        objective='multi:softprob',
        num_class=len(label_encoder.classes_),
        n_estimators=100,
        learning_rate=0.05,
        max_depth=5,
        random_state=42,
    )
    model = xgb.XGBClassifier(**xgb_settings)

    # Fit on the training part and predict the held-out part
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the held-out predictions
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    # Report; the dataset tag is the file-name prefix before the first underscore
    print(f"Results for {dataset['features'].split('_')[0]} dataset:")
    print(f"Accuracy: {accuracy}")
    print(f"F1-score: {f1}\n")


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

datasets_info = [
    {"features": "imm_adata_fselected_107.csv", "labels": "imm_Labels.csv"},
    {"features": "lc_adata_fselected_31.csv", "labels": "lc_Labels.csv"},
    {"features": "pan_adata_fselected_8.csv", "labels": "pan_Labels.csv"}
]

for dataset in datasets_info:
    print(f"Processing dataset with features {dataset['features']}")

    # Load features and labels
    features = pd.read_csv(f"{dataset['features']}")
    labels = pd.read_csv(f"{dataset['labels']}")

    # Use LabelEncoder to convert categorical labels into integers
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels.iloc[:, 0])

    # Create an instance of SVM classifier.
    # probability=True was dropped: accuracy and F1 only need hard predictions
    # from predict(), and enabling probability estimates makes fit() run an
    # extra internal 5-fold cross-validation without changing predict().
    model = SVC(
        C=1.0,  # Regularization parameter
        kernel='rbf',  # Specifies the kernel type to be used in the algorithm
        gamma='scale',  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        random_state=42  # Only consulted when probability estimates are enabled; kept for parity
    )

    # Split data into training and testing sets (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(features, labels_encoded, test_size=0.2, random_state=42)

    # Train model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Calculate accuracy and macro-averaged F1-score on the held-out rows
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    # Output the results
    print(f"Results for {dataset['features'].split('_')[0]} dataset:")
    print(f"Accuracy: {accuracy}")
    print(f"F1-score: {f1}\n")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Hard-coded F1-scores: one column per dataset, one row per method
data = {
    'baron': [0.495124148777054, 0.5017161996733301, 0.6109106426919281],
    'cellBench': [0.6346396368152479, 0.9972228257593857, 0.9960945557360741],
    'Zheng': [0.3181956174307169, 0.9204406786649237, 0.8555613418858459]
}
df = pd.DataFrame(data, index=['K-Means', 'SVM', 'XGBoost'])

# Draw the annotated heatmap on an explicit axes object
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(df, annot=True, fmt=".3f", cmap='Oranges', ax=ax)

# Title and axis labels
ax.set_title('F1-Score Heatmap')
ax.set_xlabel('Datasets')
ax.set_ylabel('Methods')

# Render the figure
plt.show()


In practice, running grid search with cross-validation on the training data alone is sufficient.

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Datasets to tune an SVM on
datasets = ['baron', 'cellBench', 'zheng']

# Hyperparameter candidates explored by the grid search
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['auto', 'scale'],
    'kernel': ['rbf', 'linear']
}

for dataset in datasets:
    print(f"Processing {dataset} dataset")

    # Read the training features and labels of this dataset
    train_features = pd.read_csv(f"clean_data_kmeans/{dataset}/train_features.csv")
    train_labels = pd.read_csv(f"clean_data_kmeans/{dataset}/train_labels.csv")

    # Exhaustive search over param_grid, scored by 5-fold cross-validated accuracy
    svc = SVC()
    grid_search = GridSearchCV(
        svc,
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(train_features, train_labels.values.ravel())

    # Collect the refit winner, its mean CV accuracy, and its parameters
    best_model, best_score, best_params = (
        grid_search.best_estimator_,
        grid_search.best_score_,
        grid_search.best_params_,
    )

    print(f"{dataset}: Best model parameters: {best_params}")
    print(f"{dataset}: Best cross-validation accuracy: {best_score}\n")


# XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Datasets to evaluate XGBoost on
datasets = ['baron', 'cellBench', 'zheng']

for dataset in datasets:
    print(f"Processing {dataset} dataset")

    # Read the training features and labels of this dataset
    train_features = pd.read_csv(f"clean_data_kmeans/{dataset}/train_features.csv")
    train_labels = pd.read_csv(f"clean_data_kmeans/{dataset}/train_labels.csv")

    # Encode the first label column as integer class codes
    label_encoder = LabelEncoder()
    train_labels_encoded = label_encoder.fit_transform(train_labels.iloc[:, 0])

    # Multi-class XGBoost classifier with fixed hyperparameters
    xgb_settings = dict(
        objective='multi:softprob',
        num_class=len(label_encoder.classes_),
        n_estimators=100,
        learning_rate=0.05,
        max_depth=5,
        random_state=42,
    )
    model = xgb.XGBClassifier(**xgb_settings)

    # Accuracy on each of 5 shuffled folds (seeded so the splits are repeatable)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(
        model,
        train_features,
        train_labels_encoded,
        cv=kfold,
        scoring='accuracy',
    )

    # Per-fold scores followed by their mean
    print(f"{dataset}: Cross-validation scores: {scores}")
    print(f"{dataset}: Mean accuracy: {np.mean(scores)}\n")


In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_validate, KFold
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Datasets to evaluate XGBoost on
datasets = ['baron', 'cellBench', 'zheng']

for dataset in datasets:
    print(f"Processing {dataset} dataset")

    # Read the training features and labels of this dataset
    train_features = pd.read_csv(f"clean_data_kmeans/{dataset}/train_features.csv")
    train_labels = pd.read_csv(f"clean_data_kmeans/{dataset}/train_labels.csv")

    # Encode the first label column as integer class codes
    label_encoder = LabelEncoder()
    train_labels_encoded = label_encoder.fit_transform(train_labels.iloc[:, 0])

    # Multi-class XGBoost classifier with fixed hyperparameters
    xgb_settings = dict(
        objective='multi:softprob',
        num_class=len(label_encoder.classes_),
        n_estimators=100,
        learning_rate=0.05,
        max_depth=5,
        random_state=42,
    )
    model = xgb.XGBClassifier(**xgb_settings)

    # 5 shuffled folds; return_train_score=True also records the accuracy on
    # the fitting part of every fold, not only on the held-out part
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_validate(
        model,
        train_features,
        train_labels_encoded,
        cv=kfold,
        scoring='accuracy',
        return_train_score=True,
    )

    # Error = 1 - accuracy, averaged over the folds
    train_error = (1 - scores['train_score']).mean()
    test_error = (1 - scores['test_score']).mean()

    print(f"{dataset}: Training error: {train_error}")
    print(f"{dataset}: Test error: {test_error}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hard-coded XGBoost error rates, one entry per dataset
train_errors = [0.0311086797957695, 0.0, 0.062671875]
test_errors = [0.07221006564551424, 0.0029583657419410825, 0.1434375]
datasets = ['baron', 'cellBench', 'zheng']

# Two bars per dataset, each bar_width wide
bar_width = 0.35
index = np.arange(len(datasets))

# Draw both series on an explicit axes; the second one is shifted by one bar width
fig, ax = plt.subplots()
bar_series = [
    (train_errors, 'Train Error', 'blue'),
    (test_errors, 'Test Error', 'orange'),
]
for slot, (errors, legend_name, bar_color) in enumerate(bar_series):
    ax.bar(index + bar_width * slot, errors, bar_width, label=legend_name, color=bar_color)

# Labels, title, and ticks centred between the two bars of each dataset
ax.set_xlabel('Datasets')
ax.set_ylabel('Error')
ax.set_title('Training and Test Errors for XGBoost across Datasets')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(datasets)
ax.legend()

# Render the figure
plt.tight_layout()
plt.show()


In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, f1_score
import pandas as pd
import numpy as np

# List of datasets for processing
datasets = ['baron', 'cellBench', 'zheng']

for dataset in datasets:
    print(f"Processing {dataset} dataset")

    # Load training data
    train_features = pd.read_csv(f"clean_data_kmeans/{dataset}/train_features.csv")
    train_labels = pd.read_csv(f"clean_data_kmeans/{dataset}/train_labels.csv")

    # Use LabelEncoder to convert categorical labels into integers
    label_encoder = LabelEncoder()
    train_labels_encoded = label_encoder.fit_transform(train_labels.iloc[:, 0])  # Assuming labels are in the first column

    # Create an instance of XGBoost classifier
    model = xgb.XGBClassifier(
        objective='multi:softprob',  # Using softmax for multi-class classification
        num_class=len(label_encoder.classes_),  # Number of classes
        n_estimators=100,  # Number of trees
        learning_rate=0.05,  # Learning rate
        max_depth=5,  # Maximum depth of the trees
        random_state=42  # Random seed for reproducibility
    )

    # Define a custom scorer for F1-score that calculates the macro average F1-score
    f1_scorer = make_scorer(f1_score, average='macro')

    # Use 5-fold cross-validation to evaluate the model on both metrics.
    # A single cross_validate call fits each fold once and scores it with both
    # scorers, instead of repeating the whole cross-validation per metric.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_validate(
        model,
        train_features,
        train_labels_encoded,
        cv=kfold,
        scoring={'accuracy': 'accuracy', 'f1_macro': f1_scorer},
    )
    accuracy_scores = cv_results['test_accuracy']
    f1_scores = cv_results['test_f1_macro']

    # Output the results of cross-validation
    print(f"{dataset}: Cross-validation accuracy scores: {accuracy_scores}")
    print(f"{dataset}: Mean accuracy: {np.mean(accuracy_scores)}")
    print(f"{dataset}: Cross-validation F1-scores: {f1_scores}")
    print(f"{dataset}: Mean F1-score: {np.mean(f1_scores)}\n")


# Active Learning: Random Sampling v.s. Uncertainty Sampling

We use AL on the first dataset to see if AL works. We start from 20% of data and end at 33% of data. Since the data has a very big volume, random batch sampling would be faster.

In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
import xgboost as xgb

dataset = 'baron'

# Load feature and label data
X = pd.read_csv(f"clean_data_kmeans/{dataset}/train_features.csv")
y = pd.read_csv(f"clean_data_kmeans/{dataset}/train_labels.csv")

# Convert categorical labels to integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y.iloc[:, 0])  # This will encode the original classes to integers
y_encoded = pd.DataFrame(y_encoded, columns=['label'])  # Create a DataFrame from the encoded labels
df = pd.concat([X, y_encoded], axis=1)

unique_y_classes = np.unique(y)
label_encoder_classes = label_encoder.classes_

# Check if all unique classes in 'y' are present in the LabelEncoder's classes
if set(unique_y_classes) - set(label_encoder_classes):
    print(f"Classes in 'y' that are not present in LabelEncoder classes: {set(unique_y_classes) - set(label_encoder_classes)}")

# Check if there are any classes in LabelEncoder that are not in 'y'
if set(label_encoder_classes) - set(unique_y_classes):
    print(f"Classes in LabelEncoder that are not present in 'y': {set(label_encoder_classes) - set(unique_y_classes)}")

# One trial per seed
seed_num = [40, 41, 42]

# Number of samples moved from the pool into the training set per iteration
batch_size = 3

# Per-seed lists of accuracies (one entry per iteration)
result_train_random_batch = []
result_rest_random_batch = []

# Repeat the experiment once for every seed
for seed in range(len(seed_num)):
    result_train_random_batch.append([])
    result_rest_random_batch.append([])

    # Seed BOTH generators: numpy drives df.sample (the shuffle), while the
    # batch selection below uses Python's `random` module. Seeding only numpy
    # left the selected batches different on every run.
    np.random.seed(seed_num[seed])
    random.seed(seed_num[seed])
    sdf = df.sample(frac=1).reset_index(drop=True)

    origin_len = sdf['label'].size // 5  # Start from 20% of the data
    half_len = sdf['label'].size // 3  # Grow up to one third (~33%) of the data
    iter_num = (half_len - origin_len) // batch_size

    dff = sdf.iloc[0:origin_len]
    rest_dataset = sdf.iloc[origin_len:]

    score = []
    score_rest = []
    for i in range(iter_num):
        # Split the current training set and remaining pool into features / labels
        # (the label is the last column)
        X = dff[dff.columns[0:-1]]
        X_rest = rest_dataset[rest_dataset.columns[0:-1]]
        y = dff[dff.columns[-1]]
        y_rest = rest_dataset[rest_dataset.columns[-1]]

        xgt = xgb.XGBClassifier(n_estimators=10, max_depth = 10)

        # Cross-validated predictions on the current training set
        y_pred = cross_val_predict(xgt, X, y, cv=3)
        # Fit on the whole current training set and predict the remaining pool
        xgt.fit(X, y)
        y_rest_pred = xgt.predict(X_rest)

        # Calculate accuracy
        acc = metrics.accuracy_score(y, y_pred)
        rest_acc = metrics.accuracy_score(y_rest, y_rest_pred)

        # Store results
        score.append(acc)
        score_rest.append(rest_acc)
        result_train_random_batch[seed].append(acc)
        result_rest_random_batch[seed].append(rest_acc)

        # Choose the next batch uniformly at random (positions within the pool)
        selected_indices = random.sample(range(len(rest_dataset)), batch_size)
        # Move the selected rows from the pool into the training set
        dff = pd.concat([dff, rest_dataset.iloc[selected_indices]], axis=0)
        rest_dataset = rest_dataset.drop(rest_dataset.index[selected_indices])
        rest_dataset = rest_dataset.reset_index(drop=True)

        # Training-set size used for the accuracy just computed
        datasize = origin_len + batch_size * i
        print(datasize, acc)

    # Per-iteration scores, then their means for this seed
    online_acc = np.average(score)
    print('-------------------------------------------------')
    print(score, score_rest)
    print(online_acc, np.average(score_rest))


For uncertainty sampling, we also use a more efficient variant: batch sampling.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
import xgboost as xgb
import random

# Fetch dataset
dataset = 'baron'
X = pd.read_csv(f"clean_data_kmeans/{dataset}/train_features.csv")
y = pd.read_csv(f"clean_data_kmeans/{dataset}/train_labels.csv")

# Convert categorical labels into integers using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y.iloc[:, 0])  # Assuming labels are in the first column
y = pd.DataFrame(y, columns=['label'])
df = pd.concat([X, y], axis=1)

# Define seed numbers (one trial per seed)
seed_num = [40, 41, 42]

# Number of samples moved from the pool into the training set per iteration
batch_size = 3

# Per-seed lists of accuracies (one entry per iteration)
result_train_uncertain = []
result_rest_uncertain = []

# Loop through each seed to perform randomized trials
for seed in range(len(seed_num)):
    result_train_uncertain.append([])
    result_rest_uncertain.append([])

    # Set random seed so the shuffle below is reproducible
    np.random.seed(seed_num[seed])
    sdf = df.sample(frac=1).reset_index(drop=True)

    # Initial training size (20% of data) and target size (one third of data)
    origin_len = sdf['label'].size // 5
    half_len = sdf['label'].size // 3
    iter_num = (half_len - origin_len) // batch_size

    # Split initial data into training and remaining datasets
    dff = sdf.iloc[0:origin_len]
    rest_dataset = sdf.iloc[origin_len:]

    score = []
    score_rest = []

    # Iteratively train model and evaluate
    for i in range(int(iter_num)):
        # Separate features and labels (the label is the last column)
        X = dff[dff.columns[0:-1]]
        X_rest = rest_dataset[rest_dataset.columns[0:-1]]
        y = dff[dff.columns[-1]]
        y_rest = rest_dataset[rest_dataset.columns[-1]]

        # Cross-validated predictions on the training set, then a fit on all of
        # it to predict the remaining pool
        xgt = xgb.XGBClassifier(n_estimators=10, max_depth = 10)
        y_pred = cross_val_predict(xgt, X, y, cv=5)
        xgt.fit(X, y)
        y_rest_pred = xgt.predict(X_rest)

        # Calculate accuracy
        acc = metrics.accuracy_score(y, y_pred)
        rest_acc = metrics.accuracy_score(y_rest, y_rest_pred)

        # Store accuracy results
        score.append(acc)
        score_rest.append(rest_acc)
        result_train_uncertain[seed].append(acc)
        result_rest_uncertain[seed].append(rest_acc)

        # Least-confidence uncertainty: 1 - highest predicted class probability
        proba = xgt.predict_proba(X_rest)
        uncertainty = 1 - proba.max(axis=1)
        # Positions (within the pool) of the batch_size most uncertain samples
        max_positions_batch = np.argpartition(uncertainty, -batch_size)[-batch_size:]
        dff = pd.concat([dff, rest_dataset.iloc[max_positions_batch]], axis=0)

        # Update the remaining dataset
        rest_dataset = rest_dataset.drop(rest_dataset.index[max_positions_batch])
        rest_dataset = rest_dataset.reset_index(drop=True)

        # Training-set size used for the accuracy just computed
        datasize = origin_len + batch_size * i
        print(datasize, acc)

    # Per-iteration scores, then their means for this seed. The "Mean test
    # accuracy" field previously printed the raw list instead of its average.
    online_acc = np.average(score)
    print('-------------------------------------------------')
    print('Training scores:', score)
    print('Test scores:', score_rest)
    print('Mean training accuracy:', online_acc, 'Mean test accuracy:', np.average(score_rest))

Finally, we compare the two methods. With uncertainty sampling we can either reach the same level of accuracy with less data, or obtain a better-performing model with the same amount of data.

In [None]:
# Mean and standard deviation across seeds, per iteration.
# result_rest_uncertain / result_rest_random_batch are lists of per-seed lists
# produced by the two active-learning cells above.
avg_rest_uncertain = np.mean(result_rest_uncertain, axis=0)
std_rest_uncertain = np.std(result_rest_uncertain, axis=0)

avg_rest_random = np.mean(result_rest_random_batch, axis=0)
std_rest_random = np.std(result_rest_random_batch, axis=0)

# Training-set size at each iteration: the initial size used by the
# active-learning cells (origin_len, previously hard-coded here as 1371)
# plus 3 samples added per iteration.
start_size = origin_len
samples_per_round = 3
x1 = start_size + samples_per_round * np.arange(len(avg_rest_uncertain))

# dpi lowered from 1000: an 18x8 inch figure at dpi=1000 is an 18000x8000 pixel
# image, which is very slow to render and bloats the notebook output.
plt.figure(figsize=(18, 8), dpi=150)

plt.errorbar(x1, avg_rest_uncertain, yerr=std_rest_uncertain, fmt='s', linestyle='--', capsize=5, label='Rest_uncertain_batch')
plt.errorbar(x1, avg_rest_random, yerr=std_rest_random, fmt='s', ecolor='seagreen', linestyle='--', capsize=5, label='Rest_random_batch')

plt.legend()

# One x-tick every 100 training samples across the plotted range
plt.xticks(np.arange(start_size, len(avg_rest_uncertain) * samples_per_round + start_size, step=100))
plt.xlabel('Train Data Size')
plt.ylabel('Accuracy')
plt.title('Test Accuracy Between Uncertainty Batch Sampling and Random Batch Sampling')
plt.grid(True, linestyle='--')

plt.show()
