In [21]:
import os
import pickle
import time
import numpy as np
from scipy.ndimage import gaussian_filter1d
from scipy.stats import zscore
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


In [None]:
def smooth_with_gaussian(data, sigma=2):
    """Smooth `data` along axis 1 with a 1-D Gaussian kernel.

    Parameters
    ----------
    data : array-like
        Input with at least two dimensions; the filter runs along axis 1.
    sigma : float, optional
        Standard deviation of the Gaussian kernel, in samples (default 2).

    Returns
    -------
    numpy.ndarray
        Smoothed array with the same shape as `data`. Floating/complex input
        keeps its dtype; integer or boolean input is returned as float64.
    """
    values = np.asarray(data)
    # gaussian_filter1d writes its result in the *input* dtype, so 0/1 integer
    # (or boolean) spike trains would be truncated back to integers. Promote
    # anything that is not already floating/complex before filtering.
    if not np.issubdtype(values.dtype, np.inexact):
        values = values.astype(float)
    return gaussian_filter1d(values, sigma=sigma, axis=1)

def preprocess(data):
    """Z-score each column of a 2-D array across its rows.

    Columns whose standard deviation (over axis 0) exceeds 1e-6 are z-scored.
    Every other column is filled with its own column mean, so a constant
    column keeps its original value rather than being divided by ~0.

    Parameters
    ----------
    data : array-like, 2-D
        Values to normalise; statistics are computed down axis 0.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `data`. Floating input keeps its dtype;
        any other dtype is converted to float64 so z-scores are not truncated.

    Raises
    ------
    ValueError
        If the normalised result contains NaN (for example because a column
        of the input contained NaN).
    """
    values = np.asarray(data)
    # zeros_like() below inherits the dtype, so integer input would silently
    # truncate the z-scores; work in floating point instead.
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(float)

    stds = np.std(values, axis=0)
    non_constant_cols = stds > 1e-6    # columns with usable variance
    # Complement rather than `stds <= 1e-6`: a NaN std fails both comparisons,
    # which would leave that column as zeros and hide the NaN from the check
    # at the end of this function.
    const_cols = ~non_constant_cols

    z = np.zeros_like(values)
    z[:, non_constant_cols] = zscore(values[:, non_constant_cols], axis=0)
    z[:, const_cols] = np.mean(values[:, const_cols], axis=0)

    if np.isnan(z).any():
        raise ValueError("Data contains NaN values after normalization.")

    return z

def _scale_by_spike_count(smoothed, spikes):
    """Divide every (trial, neuron) trace in `smoothed` by that neuron's spike count in the trial."""
    counts = np.count_nonzero(spikes == 1, axis=1)      # one count per (trial, neuron)
    divisors = np.where(counts > 0, counts, 1)          # silent neurons are left unchanged
    smoothed /= divisors[:, np.newaxis, :]
    return smoothed


def load_and_split(file_numbers, data_dir='C:/Users/joshu/PartIIIProject',
                   sigma=2, window=(100, 350)):
    """Load attention-test pickles and split the population traces by input side and attended side.

    For every file number the function:
      1. loads ``{data_dir}/RSNNdale_attention_{n}_attention_test`` with pickle,
      2. keeps the trials where ``data['omit'][0] == 0``,
      3. smooths the left (``data['SP'][0][0]``) and right (``data['SP'][0][1]``)
         spike arrays along axis 1, divides each trace by its spike count and
         sums over axis 2,
      4. normalises the summed traces with ``preprocess``,
      5. splits the trials by ``data['attend'][0]`` (0 vs 1) and keeps the
         columns ``window[0]:window[1]``.

    Parameters
    ----------
    file_numbers : iterable
        Dataset identifiers substituted into the file name.
    data_dir : str, optional
        Directory holding the pickles. Defaults to the previously hard-coded path.
    sigma : float, optional
        Gaussian smoothing width passed to ``smooth_with_gaussian`` (default 2).
    window : tuple of (int, int), optional
        Start/stop column indices kept from each trace (default ``(100, 350)``).

    Returns
    -------
    tuple of four lists ``(l_al, l_ar, r_al, r_ar)``
        One 2-D array per dataset in each list: left-input traces for
        attend == 0, left-input traces for attend == 1, right-input traces for
        attend == 0 and right-input traces for attend == 1.
        NOTE(review): the original comments label attend == 0 as "left";
        confirm that encoding against the data producer.

    Notes
    -----
    Spike arrays are assumed to be 3-D (unpacked as trials, samples, neurons
    in the original code) and floating point — TODO confirm.
    """
    l_al, l_ar, r_al, r_ar = [], [], [], []
    start, stop = window

    total_time = time.time()
    for file_number in file_numbers:
        file_total = time.time()
        file_path = f'{data_dir}/RSNNdale_attention_{file_number}_attention_test'

        load_data_start_time = time.time()
        # SECURITY: pickle.load can execute arbitrary code; only use it on
        # files you produced yourself.
        with open(file_path, 'rb') as handle:
            data = pickle.load(handle)
        elapsed_time = time.time() - load_data_start_time
        print(f"Dataset {file_number} loaded in {elapsed_time:.2f} seconds")

        attend_01 = data['attend'][0]
        omitted = data['omit'][0]
        relevant = np.where(omitted == 0)[0]    # trials that were not omitted

        left_spikes = data['SP'][0][0][relevant]
        right_spikes = data['SP'][0][1][relevant]

        left_sm = _scale_by_spike_count(
            smooth_with_gaussian(left_spikes, sigma=sigma), left_spikes)
        right_sm = _scale_by_spike_count(
            smooth_with_gaussian(right_spikes, sigma=sigma), right_spikes)

        # Collapse the last axis into one trace per trial, then normalise
        # using all relevant trials of this dataset.
        left_trace = preprocess(np.sum(left_sm, axis=2))
        right_trace = preprocess(np.sum(right_sm, axis=2))

        # Row positions (within the already-filtered trials) for each attend value.
        attend_relevant = attend_01[relevant]
        left_indices = np.where(attend_relevant == 0)[0]
        right_indices = np.where(attend_relevant == 1)[0]

        l_al.append(left_trace[left_indices, start:stop])
        r_al.append(right_trace[left_indices, start:stop])

        l_ar.append(left_trace[right_indices, start:stop])
        r_ar.append(right_trace[right_indices, start:stop])

        print(f"Dataset {file_number} processed in {time.time() - file_total:.2f} seconds")
    print(f"All datasets processed in {time.time() - total_time:.2f} seconds")

    # Previously these lists were built and then discarded when the function
    # returned; hand them back so callers can use them.
    return l_al, l_ar, r_al, r_ar


In [28]:
# Load and preprocess datasets 1, 2, 4 and 8 (timings are printed per dataset).
# NOTE(review): the result of this call is not bound to any name here, while the
# next cell reads l_al, l_ar, r_al and r_ar from the notebook namespace — confirm
# where those lists are expected to come from on a fresh kernel.
load_and_split([1, 2, 4, 8])

Dataset 1 loaded in 51.02 seconds
Dataset 1 processed in 65.29 seconds
Dataset 2 loaded in 23.90 seconds
Dataset 2 processed in 36.17 seconds
Dataset 4 loaded in 15.26 seconds
Dataset 4 processed in 31.66 seconds
Dataset 8 loaded in 18.10 seconds
Dataset 8 processed in 28.43 seconds
All datasets processed in 161.70 seconds


In [29]:
# Stack the per-dataset blocks along axis 0 so each of the four groups becomes
# a single array.
l_al_flat, l_ar_flat, r_al_flat, r_ar_flat = [
    np.concatenate(chunks, axis=0) for chunks in (l_al, l_ar, r_al, r_ar)
]

# In-condition: l_al rows followed by r_ar rows.
X_in = np.concatenate((l_al_flat, r_ar_flat), axis=0)
# Out-condition: r_al rows followed by l_ar rows.
X_out = np.concatenate((r_al_flat, l_ar_flat), axis=0)

# Labels follow the row order above: 0 for the first group, 1 for the second.
class_ids = np.array([0, 1], dtype=int)
y_in = np.repeat(class_ids, [l_al_flat.shape[0], r_ar_flat.shape[0]])
y_out = np.repeat(class_ids, [r_al_flat.shape[0], l_ar_flat.shape[0]])


In [30]:
def compare_lda_in_out(X_in, y_in, X_out, y_out, test_size=0.2, random_state=42):
    """Train and score a shrinkage LDA separately on the 'In' and 'Out' data.

    Each condition is split into stratified train/test parts, standardised
    with statistics from the training part, and classified with an LDA
    (``solver='lsqr'``, ``shrinkage='auto'``). Accuracy and ROC AUC on the
    held-out part are printed and collected.

    Parameters:
    - X_in, X_out: 2-D feature arrays, one row per trial
    - y_in, y_out: binary label arrays matching the rows of X_in / X_out
    - test_size: fraction of trials held out for testing
    - random_state: seed for the train/test split

    Returns:
    - dict mapping 'In' and 'Out' to {'accuracy': ..., 'roc_auc': ...}
    """
    conditions = (('In', X_in, y_in), ('Out', X_out, y_out))
    results = {}

    for name, features, targets in conditions:
        train_X, test_X, train_y, test_y = train_test_split(
            features,
            targets,
            test_size=test_size,
            random_state=random_state,
            stratify=targets,
        )

        # Standardise using the training rows only, then reuse on the test rows.
        standardiser = StandardScaler()
        train_scaled = standardiser.fit_transform(train_X)
        test_scaled = standardiser.transform(test_X)

        classifier = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
        classifier.fit(train_scaled, train_y)

        predicted = classifier.predict(test_scaled)
        positive_prob = classifier.predict_proba(test_scaled)[:, 1]

        acc = accuracy_score(test_y, predicted)
        auc = roc_auc_score(test_y, positive_prob)

        results[name] = {'accuracy': acc, 'roc_auc': auc}
        print(f"{name}-condition LDA →  Acc: {acc:.3f},  AUC: {auc:.3f}")

    return results

# Run the In/Out LDA comparison on the arrays assembled in the previous cell.
results = compare_lda_in_out(X_in, y_in, X_out, y_out)
# NOTE(review): the commented-out example below calls `run_lda`, which is not
# defined anywhere in the visible notebook — it looks stale; confirm and remove.
# Example usage:
# X: shape (n_trials, 250, 80)
# y: binary labels array of shape (n_trials,)
# lda_model, scaler, lda_metrics = run_lda(X, y)


In-condition LDA →  Acc: 0.460,  AUC: 0.457
Out-condition LDA →  Acc: 0.473,  AUC: 0.455


In [32]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, roc_auc_score

def compare_lda_in_out(X_in, y_in, X_out, y_out,
                       test_size=0.2, random_state=42,
                       pca_components=(50, 100, 200),
                       shrinkages=('auto', 0.1, 0.5),
                       cv=4):
    """
    Compare LDA performance on the 'In' vs. 'Out' data sets using a
    StandardScaler -> PCA -> shrinkage-LDA pipeline tuned by grid search.

    For each condition the data are split into stratified train/test parts,
    the pipeline is tuned with cross-validated ROC AUC on the training part,
    and the refitted best model is scored on the held-out part.

    Parameters:
    - X_in, X_out: arrays of shape (n_trials, n_features), or 3-D arrays that
      are flattened to (n_trials, -1)
    - y_in, y_out: binary labels arrays of shape (n_trials,)
    - test_size: fraction of trials held out for the final evaluation
    - random_state: seed used for the train/test split AND for PCA
    - pca_components: sequence of PCA n_components to grid-search
    - shrinkages: sequence of shrinkage values for LDA ('auto' or float)
    - cv: cross-validation setting forwarded to GridSearchCV (default 4)

    Returns:
    - results: dict with, for 'In' and 'Out', the keys 'best_estimator',
      'accuracy', 'roc_auc' and 'best_params'
    """
    # Defaults are tuples so no mutable object is shared between calls;
    # GridSearchCV gets fresh lists.
    param_grid = {
        'pca__n_components': list(pca_components),
        'lda__shrinkage':    list(shrinkages),
    }

    results = {}
    for name, (X, y) in [('In', (X_in, y_in)), ('Out', (X_out, y_out))]:
        # If 3D inputs, flatten to (n_trials, everything else)
        X_flat = X.reshape(X.shape[0], -1) if X.ndim == 3 else X

        # Split
        Xtr, Xte, ytr, yte = train_test_split(
            X_flat, y, test_size=test_size,
            random_state=random_state, stratify=y
        )

        # Pipeline: scale -> PCA -> LDA with shrinkage.
        # PCA's default solver selection may use randomized SVD, so it must
        # receive the seed as well; otherwise results can differ between
        # runs even though `random_state` was supplied.
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca',    PCA(random_state=random_state)),
            ('lda',    LinearDiscriminantAnalysis(solver='lsqr'))
        ])

        grid = GridSearchCV(pipe, param_grid, cv=cv,
                            scoring='roc_auc', n_jobs=-1)
        grid.fit(Xtr, ytr)

        # Evaluate the refitted best pipeline on the held-out trials
        yprob = grid.predict_proba(Xte)[:, 1]
        yhat  = grid.predict(Xte)
        acc   = accuracy_score(yte, yhat)
        auc   = roc_auc_score(yte, yprob)

        print(f"{name}-condition LDA → Acc: {acc:.3f}, AUC: {auc:.3f}, "
              f"best PCA={grid.best_params_['pca__n_components']}, "
              f"shrinkage={grid.best_params_['lda__shrinkage']}")

        results[name] = {
            'best_estimator': grid.best_estimator_,
            'accuracy': acc,
            'roc_auc': auc,
            'best_params': grid.best_params_
        }

    return results

# Re-run the comparison with the PCA + shrinkage-LDA grid search defined above;
# this rebinds `results`, replacing the value from the earlier plain-LDA cell.
results = compare_lda_in_out(X_in, y_in, X_out, y_out)



In-condition LDA → Acc: 0.465, AUC: 0.469, best PCA=100, shrinkage=0.5
Out-condition LDA → Acc: 0.488, AUC: 0.478, best PCA=50, shrinkage=0.5
