In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from sklearn.preprocessing import LabelEncoder
import os
import pickle
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [32]:
# Paths to data
data_pkls_path = '/home/tico/Desktop/master_classes/project/orbits_4-5_4'
behavior_path = '/home/tico/Desktop/master_classes/project/behavior/'

# Load file names
file_names = os.listdir(data_pkls_path)
behavior_files = os.listdir(behavior_path)

# Load behavior data
behavior_source = pd.read_csv(os.path.join(behavior_path, behavior_files[0]), sep='\t')
for behavior_file in behavior_files[1:]:
    curr_behavior_source = pd.read_csv(os.path.join(behavior_path, behavior_file), sep='\t')
    behavior_source = pd.concat([behavior_source, curr_behavior_source], axis=0)
behavior_source = behavior_source[["session_id", "Group"]]

# Load pickle files and ensure consistent shape
data_list = []
data_list_cnn = []
labels = []
for file_name in file_names:
    session_id = file_name.split('.')[0]
    with open(os.path.join(data_pkls_path, file_name), 'rb') as file:
        data = pickle.load(file)
    if not isinstance(data, np.ndarray):
        data = np.array(data)
    data_list_cnn.append(data)
    data_sum = data.sum(axis=0)
    data_list.append(data_sum)
    label = behavior_source.loc[behavior_source['session_id'] == session_id, 'Group'].values[0]
    labels.append(label)

# Convert lists to numpy arrays
X = np.array(data_list)
y = np.array(labels)

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Normalize the CNN data
data_list_cnn = np.array(data_list_cnn, dtype='float32')
data_list_cnn /= data_list_cnn.max()  # Normalize to the range [0, 1]

# Split data into training and testing sets for RandomForest, XGBoost, and KNN
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train RandomForest classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Predictions and evaluation for RandomForest
y_train_pred_rf = rf_clf.predict(X_train)
y_test_pred_rf = rf_clf.predict(X_test)
rf_train_accuracy = accuracy_score(y_train, y_train_pred_rf)
rf_test_accuracy = accuracy_score(y_test, y_test_pred_rf)
rf_train_f1 = f1_score(y_train, y_train_pred_rf, average='weighted')
rf_test_f1 = f1_score(y_test, y_test_pred_rf, average='weighted')

# Train XGBoost classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_clf.fit(X_train, y_train)

# Predictions and evaluation for XGBoost
y_train_pred_xgb = xgb_clf.predict(X_train)
y_test_pred_xgb = xgb_clf.predict(X_test)
xgb_train_accuracy = accuracy_score(y_train, y_train_pred_xgb)
xgb_test_accuracy = accuracy_score(y_test, y_test_pred_xgb)
xgb_train_f1 = f1_score(y_train, y_train_pred_xgb, average='weighted')
xgb_test_f1 = f1_score(y_test, y_test_pred_xgb, average='weighted')

# Train K-Nearest Neighbors classifier
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)

# Predictions and evaluation for KNN
y_train_pred_knn = knn_clf.predict(X_train)
y_test_pred_knn = knn_clf.predict(X_test)
knn_train_accuracy = accuracy_score(y_train, y_train_pred_knn)
knn_test_accuracy = accuracy_score(y_test, y_test_pred_knn)
knn_train_f1 = f1_score(y_train, y_train_pred_knn, average='weighted')
knn_test_f1 = f1_score(y_test, y_test_pred_knn, average='weighted')

# Split data into training and testing sets for CNN
data_list_cnn = data_list_cnn[..., np.newaxis]  # Add a channel dimension
X_train_CNN, X_test_CNN, y_train_CNN, y_test_CNN = train_test_split(data_list_cnn, y_categorical, test_size=0.2, random_state=42)

# Build CNN model
input_shape = X_train_CNN.shape[1:]  # Update input_shape to include the channel dimension
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_CNN, y_train_CNN, epochs=10, batch_size=6, validation_split=0.2)

# Evaluate the model
cnn_train_loss, cnn_train_accuracy = model.evaluate(X_train_CNN, y_train_CNN, verbose=0)
cnn_test_loss, cnn_test_accuracy = model.evaluate(X_test_CNN, y_test_CNN, verbose=0)

# Predictions and evaluation for CNN
y_train_pred_cnn = model.predict(X_train_CNN)
y_test_pred_cnn = model.predict(X_test_CNN)
cnn_train_f1 = f1_score(np.argmax(y_train_CNN, axis=1), np.argmax(y_train_pred_cnn, axis=1), average='weighted')
cnn_test_f1 = f1_score(np.argmax(y_test_CNN, axis=1), np.argmax(y_test_pred_cnn, axis=1), average='weighted')

# Print the results
print("RandomForest Classifier:")
print(f"Train Accuracy: {rf_train_accuracy:.4f}, Train F1 Score: {rf_train_f1:.4f}")
print(f"Test Accuracy: {rf_test_accuracy:.4f}, Test F1 Score: {rf_test_f1:.4f}")

print("\nXGBoost Classifier:")
print(f"Train Accuracy: {xgb_train_accuracy:.4f}, Train F1 Score: {xgb_train_f1:.4f}")
print(f"Test Accuracy: {xgb_test_accuracy:.4f}, Test F1 Score: {xgb_test_f1:.4f}")

print("\nK-Nearest Neighbors Classifier:")
print(f"Train Accuracy: {knn_train_accuracy:.4f}, Train F1 Score: {knn_train_f1:.4f}")
print(f"Test Accuracy: {knn_test_accuracy:.4f}, Test F1 Score: {knn_test_f1:.4f}")

print("\nConvolutional Neural Network:")
print(f"Train Accuracy: {cnn_train_accuracy:.4f}, Train F1 Score: {cnn_train_f1:.4f}")
print(f"Test Accuracy: {cnn_test_accuracy:.4f}, Test F1 Score: {cnn_test_f1:.4f}")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.2859 - loss: 1.3859 - val_accuracy: 0.4020 - val_loss: 1.3760
Epoch 2/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2911 - loss: 1.3834 - val_accuracy: 0.3922 - val_loss: 1.3622
Epoch 3/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2837 - loss: 1.3824 - val_accuracy: 0.4118 - val_loss: 1.3437
Epoch 4/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3070 - loss: 1.3636 - val_accuracy: 0.3922 - val_loss: 1.3509
Epoch 5/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3261 - loss: 1.3727 - val_accuracy: 0.4118 - val_loss: 1.3400
Epoch 6/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3116 - loss: 1.3611 - val_accuracy: 0.3922 - val_loss: 1.3416
Epoch 7/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━

In [41]:
def calculate_agreement_arith(orbits1, orbits2):
    # Ensure the orbit counts are in numpy arrays for easier manipulation
    orbits1 = np.array(orbits1, dtype=float)
    orbits2 = np.array(orbits2, dtype=float)

    # Avoid log of zero by replacing zero values with a small number
    orbits1[orbits1 == 0] = 1e-10
    orbits2[orbits2 == 0] = 1e-10

    # Calculate the logarithm of orbit frequencies
    log_orbits1 = np.log(orbits1)
    log_orbits2 = np.log(orbits2)

    # Calculate orbit agreement Ai for each orbit type
    Ai = 1 - np.sqrt(0.5 * np.sum((log_orbits1 - log_orbits2)**2, axis=1))
    # Make sure Ai does not contain invalid values for the geometric mean
    Ai[Ai <= 0] = 1e-10  # Replace non-positive values with a small positive number
    # Arithmetic and geometric means of Ai
    A_arithmetic = np.mean(Ai)

    return 1-A_arithmetic
def calculate_agreement_geom(orbits1, orbits2):
    # Ensure the orbit counts are in numpy arrays for easier manipulation
    orbits1 = np.array(orbits1, dtype=float)
    orbits2 = np.array(orbits2, dtype=float)

    # Avoid log of zero by replacing zero values with a small number
    orbits1[orbits1 == 0] = 1e-10
    orbits2[orbits2 == 0] = 1e-10

    # Calculate the logarithm of orbit frequencies
    log_orbits1 = np.log(orbits1)
    log_orbits2 = np.log(orbits2)

    # Calculate orbit agreement Ai for each orbit type
    Ai = 1 - np.sqrt(0.5 * np.sum((log_orbits1 - log_orbits2)**2, axis=1))
    # Make sure Ai does not contain invalid values for the geometric mean
    Ai[Ai <= 0] = 1e-10  # Replace non-positive values with a small positive number
    # Arithmetic and geometric means of Ai
    A_geometric = np.prod(np.power(Ai, 1/len(Ai)))

    return 1-A_geometric
def custom_distance_wrapper(metric_func):
    def wrapped_metric(x, y):
        # Reshape the flattened arrays back to their original 2D shape
        original_shape = (x.size // 2, 2)
        x_reshaped = x.reshape(original_shape)
        y_reshaped = y.reshape(original_shape)
        # Apply the custom distance function
        return metric_func(x_reshaped, y_reshaped)
    return wrapped_metric


In [44]:
# Paths to data
data_pkls_path = '/home/tico/Desktop/master_classes/project/orbits_4-2_4'
behavior_path = '/home/tico/Desktop/master_classes/project/behavior/'

# Load file names
file_names = os.listdir(data_pkls_path)
behavior_files = os.listdir(behavior_path)

# Load behavior data
behavior_source = pd.read_csv(os.path.join(behavior_path, behavior_files[0]), sep='\t')
for behavior_file in behavior_files[1:]:
    curr_behavior_source = pd.read_csv(os.path.join(behavior_path, behavior_file), sep='\t')
    behavior_source = pd.concat([behavior_source, curr_behavior_source], axis=0)
behavior_source = behavior_source[["session_id", "Group"]]

# Load pickle files and ensure consistent shape
data_list = []
labels = []
for file_name in file_names:
    session_id = file_name.split('.')[0]
    with open(os.path.join(data_pkls_path, file_name), 'rb') as file:
        data = pickle.load(file)
    if not isinstance(data, np.ndarray):
        data = np.array(data)
    data_list.append(data)
    label = behavior_source.loc[behavior_source['session_id'] == session_id, 'Group'].values[0]
    labels.append(label)

# Convert lists to numpy arrays
X = np.array(data_list)
y = np.array(labels)
# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Flatten the 2D data instances into 1D arrays
X_flattened = X.reshape(X.shape[0], -1)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_flattened, y_encoded, test_size=0.2, random_state=42)

# Create wrapped distance functions for KNeighborsClassifier
wrapped_calculate_agreement_arith = custom_distance_wrapper(calculate_agreement_arith)
wrapped_calculate_agreement_geom = custom_distance_wrapper(calculate_agreement_geom)

# Train K-Nearest Neighbors classifier with arithmetic agreement
knn_clf_arith = KNeighborsClassifier(n_neighbors=6, metric=wrapped_calculate_agreement_arith)
knn_clf_arith.fit(X_train, y_train)

# Train K-Nearest Neighbors classifier with geometric agreement
knn_clf_geom = KNeighborsClassifier(n_neighbors=6, metric=wrapped_calculate_agreement_geom)
knn_clf_geom.fit(X_train, y_train)

# Predictions and evaluation for KNN with arithmetic agreement
y_train_pred_knn_arith = knn_clf_arith.predict(X_train)
y_test_pred_knn_arith = knn_clf_arith.predict(X_test)
knn_train_accuracy_arith = accuracy_score(y_train, y_train_pred_knn_arith)
knn_test_accuracy_arith = accuracy_score(y_test, y_test_pred_knn_arith)
knn_train_f1_arith = f1_score(y_train, y_train_pred_knn_arith, average='weighted')
knn_test_f1_arith = f1_score(y_test, y_test_pred_knn_arith, average='weighted')
print("\nK-Nearest Neighbors Classifier (Arithmetic):")
print(f"Train Accuracy: {knn_train_accuracy_arith:.4f}, Train F1 Score: {knn_train_f1_arith:.4f}")
print(f"Test Accuracy: {knn_test_accuracy_arith:.4f}, Test F1 Score: {knn_test_f1_arith:.4f}")

# Predictions and evaluation for KNN with geometric agreement
y_train_pred_knn_geom = knn_clf_geom.predict(X_train)
y_test_pred_knn_geom = knn_clf_geom.predict(X_test)
knn_train_accuracy_geom = accuracy_score(y_train, y_train_pred_knn_geom)
knn_test_accuracy_geom = accuracy_score(y_test, y_test_pred_knn_geom)
knn_train_f1_geom = f1_score(y_train, y_train_pred_knn_geom, average='weighted')
knn_test_f1_geom = f1_score(y_test, y_test_pred_knn_geom, average='weighted')
print("\nK-Nearest Neighbors Classifier (Geometric):")
print(f"Train Accuracy: {knn_train_accuracy_geom:.4f}, Train F1 Score: {knn_train_f1_geom:.4f}")
print(f"Test Accuracy: {knn_test_accuracy_geom:.4f}, Test F1 Score: {knn_test_f1_geom:.4f}")


K-Nearest Neighbors Classifier (Arithmetic):
Train Accuracy: 0.5098, Train F1 Score: 0.4932
Test Accuracy: 0.2656, Test F1 Score: 0.2558

K-Nearest Neighbors Classifier (Geometric):
Train Accuracy: 0.4824, Train F1 Score: 0.4641
Test Accuracy: 0.3281, Test F1 Score: 0.3180
