In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, vstack
from sklearn.base import is_classifier
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [5]:
# Read the tab-separated Kinyarwanda tweet corpus into a DataFrame
data = pd.read_csv('/content/kr_train.tsv', sep='\t')

# The tweet text is the model input, the label column is the target
X, y = data['tweet'], data['label']

# Hold out 20% of the rows for testing, then carve 20% of what remains off
# for validation; both splits use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=11
)

# Learn the TF-IDF vocabulary on the training tweets only, then project the
# validation and test tweets onto that same vocabulary.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_val, X_test = vectorizer.transform(X_val), vectorizer.transform(X_test)

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (3417947838.py, line 2)

In [None]:
def lada_kinyarwanda_tsv(X_train, y_train, model, window_size=5, num_augmented_samples=10,
                         random_state=None, score_fn=None):
    """
    Select high-uncertainty training samples with a look-ahead window (LADA).

    For every requested sample a random start row is drawn, each row in the
    window ``[start, start + window_size)`` (clipped at the last row) is
    scored, and the row with the highest score is copied, together with its
    label, into the output. The same row may be selected more than once.

    Args:
        X_train (csr_matrix): Training Kinyarwanda text data (TF-IDF vectorized).
            Any input accepted by ``scipy.sparse.csr_matrix`` is converted.
        y_train (pandas.Series): Training target, aligned by position with the
            rows of ``X_train``.
        model: The machine learning model passed to the scoring function.
        window_size (int): Size of the look-ahead window; must be >= 1.
        num_augmented_samples (int): Number of samples to select.
        random_state (int or None): Seed for drawing the window starts. With
            ``None`` the draws come from NumPy's global random state.
        score_fn (callable or None): ``score_fn(model, sample) -> float`` where
            ``sample`` is a single-row CSR matrix; higher means more
            informative. Defaults to ``uncertainty_score``.

    Returns:
        csr_matrix: The selected rows only (not the original training data),
            one row per selected sample.
        pandas.Series: The labels of the selected rows, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``window_size`` is smaller than 1, if ``X_train`` and
            ``y_train`` differ in length, or if samples are requested from an
            empty training set.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    X_csr = csr_matrix(X_train)
    labels = pd.Series(y_train)
    n_rows, n_features = X_csr.shape

    if len(labels) != n_rows:
        raise ValueError("X_train and y_train must contain the same number of samples")
    if num_augmented_samples > 0 and n_rows == 0:
        raise ValueError("cannot select samples from an empty training set")

    if score_fn is None:
        score_fn = uncertainty_score

    # Both the global module and a RandomState instance expose `randint`.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    selected_rows = []
    for _ in range(num_augmented_samples):
        start = rng.randint(0, n_rows)
        stop = min(start + window_size, n_rows)  # clip the window at the last row

        scores = [score_fn(model, X_csr[row]) for row in range(start, stop)]

        # Position inside the window -> absolute row number in X_train.
        selected_rows.append(start + int(np.argmax(scores)))

    if not selected_rows:
        return csr_matrix((0, n_features), dtype=X_csr.dtype), pd.Series([], dtype=labels.dtype)

    # Row indexing copies whole rows, so data, column indices and row pointers
    # of the result stay consistent with each other and with the labels.
    row_idx = np.asarray(selected_rows, dtype=np.intp)
    X_augmented = X_csr[row_idx]
    y_augmented = pd.Series(labels.iloc[row_idx].to_numpy())

    return X_augmented, y_augmented

In [None]:
def benchmark_model(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """
    Fit a model and report accuracy / weighted F1 on validation and test data.

    Classifiers are evaluated on their predictions directly. Any other
    estimator (e.g. KMeans) predicts arbitrary cluster ids, so each cluster is
    first mapped to the most frequent training label among the training
    samples assigned to it; the mapped labels are then scored against the
    true labels.

    Args:
        model: The machine learning model to be evaluated. Must implement
            ``fit(X, y)`` and ``predict(X)``.
        X_train (csr_matrix): Training Kinyarwanda text data.
        y_train (pandas.Series): Training target.
        X_val (csr_matrix): Validation Kinyarwanda text data.
        y_val (pandas.Series): Validation target.
        X_test (csr_matrix): Test Kinyarwanda text data.
        y_test (pandas.Series): Test target.

    Returns:
        dict: A dictionary with the keys ``val_accuracy``, ``val_f1``,
            ``test_accuracy`` and ``test_f1``.
    """
    # Train the model (clusterers such as KMeans ignore y_train here)
    model.fit(X_train, y_train)

    if is_classifier(model):
        val_pred = model.predict(X_val)
        test_pred = model.predict(X_test)
    else:
        # Cluster ids carry no label meaning: learn a cluster -> label mapping
        # from the training data by majority vote.
        train_labels = np.asarray(y_train)
        train_clusters = np.asarray(model.predict(X_train))
        cluster_to_label = (
            pd.Series(train_labels)
            .groupby(train_clusters)
            .agg(lambda members: members.mode().iloc[0])
            .to_dict()
        )
        # Used for a cluster that received no training sample.
        fallback_label = pd.Series(train_labels).mode().iloc[0]

        def _clusters_to_labels(cluster_ids):
            """Translate predicted cluster ids into training-label space."""
            return np.array(
                [cluster_to_label.get(cluster, fallback_label) for cluster in cluster_ids],
                dtype=train_labels.dtype,
            )

        val_pred = _clusters_to_labels(model.predict(X_val))
        test_pred = _clusters_to_labels(model.predict(X_test))

    return {
        'val_accuracy': accuracy_score(y_val, val_pred),
        'val_f1': f1_score(y_val, val_pred, average='weighted'),
        'test_accuracy': accuracy_score(y_test, test_pred),
        'test_f1': f1_score(y_test, test_pred, average='weighted'),
    }

In [None]:
def uncertainty_score(model, X):
    """
    Compute the entropy-based uncertainty score for a given sample.

    If the model offers ``predict_proba`` its class probabilities are used.
    Otherwise the model's ``transform`` output is treated as a vector of
    distances (e.g. KMeans distances to the cluster centres) and turned into
    probabilities with a softmax over the negated distances.

    Args:
        model: The machine learning model to be used for uncertainty estimation.
        X (csr_matrix): The input sample. Only the first row of the model's
            output is used, so pass a single sample.

    Returns:
        float: The entropy (in bits) of the probability vector; higher means
            the model is less certain about the sample.
    """
    if hasattr(model, 'predict_proba'):
        y_pred_prob = np.asarray(model.predict_proba(X)[0], dtype=float)
    else:
        # Handle models without predict_proba (e.g., KMeans)
        distances = np.asarray(model.transform(X)[0], dtype=float)
        # Softmax of -distances, shifted by the smallest distance so the
        # largest exponent is exactly 0. Without the shift, large distances
        # underflow to exp(...) == 0 for every entry and the result is 0/0 = NaN.
        weights = np.exp(distances.min() - distances)
        y_pred_prob = weights / weights.sum()

    # The small constant keeps log2 finite for zero probabilities.
    entropy = -np.sum(y_pred_prob * np.log2(y_pred_prob + 1e-10))

    return entropy

In [None]:
# An earlier, distance-based variant of `uncertainty_score` used to be kept in
# this cell as a triple-quoted string. It was removed because a bare string at
# the end of a cell is evaluated and echoed as cell output, and because it fit
# a StandardScaler on the single sample being scored. The idea it captured --
# scoring a sample by its distance to the nearest KMeans centre,
# `model.transform(X)[0].min()` -- can be reintroduced as a separate function
# with its own name if it is needed again.

In [None]:
# lada_kinyarwanda_tsv draws its window starts from NumPy's global random
# state; seed it so the selected samples are the same on every run.
np.random.seed(11)

# Benchmark the model without data augmentation.
# (Swap in LogisticRegression() here and below to benchmark a classifier.)
model = KMeans(n_clusters = 5, random_state=11)
baseline_metrics = benchmark_model(model, X_train, y_train, X_val, y_val, X_test, y_test)
print("Baseline metrics:", baseline_metrics)

# Select the most uncertain samples; `model` was fitted by the baseline run
# above and is used here to score the candidates.
X_train_augmented, y_train_augmented = lada_kinyarwanda_tsv(X_train, y_train, model)

# Augmentation means adding the selected samples to the original training
# set, not training on the handful of selected samples alone.
X_train_combined = vstack([X_train, X_train_augmented], format='csr')
y_train_combined = pd.concat([y_train, y_train_augmented], ignore_index=True)

# Benchmark a fresh model on the augmented training set.
model = KMeans(n_clusters = 5, random_state=11)
augmented_metrics = benchmark_model(model, X_train_combined, y_train_combined, X_val, y_val, X_test, y_test)
print("Augmented metrics:", augmented_metrics)