In [91]:
import os
import re

import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [92]:
# Loading data
# One CSV per seed value is read (no header row) and stored as one slice along
# the last axis of `data`, so data[:, :, k] is the table for seedvals[k].
# seedvals = [58,133,192,339,377,395]
seedvals = [8,17,58,70,71,74,133,136,192,209,216,271,339,376,377,378,395,396,397]

# The depth of the array follows len(seedvals) instead of a literal 19, so
# switching to a different seed list never leaves uninitialised np.empty
# slices behind. Each CSV must hold a 59 x 399 table to fit its slice.
data = np.empty((59, 399, len(seedvals)))
for i, seed in enumerate(seedvals):
    path = f"../midprocessing/correlationz_seed{seed}.csv"
    df = pd.read_csv(path, header=None)
    data[:, :, i] = df.to_numpy()

# One class label (1, 2 or 3) per row of `data`; passed alongside the rows to
# train_test_split further down.
labels = np.array([1,2,1,1,2,2,2,3,2,1,1,2,2,1,1,3,3,3,3,2,2,1,3,2,3,2,2,2,3,3,3,3,3,2,2,2,2,2,2,1,2,1,1,2,1,2,1,1,1,1,2,1,1,1,1,1,1,3,1])

In [93]:
# Build one (X_train, X_test, y_train, y_test) split per seed slice of `data`.
# The same test_size and random_state are reused for every seed.
seed_train_test_data = [
    train_test_split(data[:, :, seed_idx], labels, test_size=0.3, random_state=42)
    for seed_idx in range(len(seedvals))
]
    


In [169]:
# Hyperparameters shared by every per-seed forest.
forest_kwargs = {
    "n_estimators": 60,
    "max_depth": None,
    "max_features": 'sqrt',
    "min_samples_split": 10,
    "min_samples_leaf": 5,
    "bootstrap": False,
    "random_state": 42,
}

# Fit one forest per seed split, keeping them in the same order as
# seed_train_test_data so models[k] belongs to split k.
models = []
for X_train, X_test, y_train, y_test in seed_train_test_data:
    forest = RandomForestClassifier(**forest_kwargs).fit(X_train, y_train)
    models.append(forest)

In [170]:
# Element 1 of every train_test_split result is that seed's X_test.
# (The loop variable is deliberately not called `data`, which is the name of
# the loaded dataset array.)
test_sets = [split[1] for split in seed_train_test_data]  # Extracting X_test sets
num_participants = test_sets[0].shape[0]

# Collect predictions from each model: one row per test sample, one column per
# model, each model predicting on the test set of its own seed.
all_predictions = np.zeros((num_participants, len(models)))
for i, model in enumerate(models):
    all_predictions[:, i] = model.predict(test_sets[i])

# Use majority voting for final prediction.
# keepdims is passed explicitly so the call no longer relies on SciPy's
# changing default (the source of the warning this cell used to emit) and the
# result has the same 1-D shape on every SciPy version that supports the
# argument. Reading `.mode` from the result avoids assigning to `_`, which
# IPython uses for the last cell output.
final_predictions = mode(all_predictions, axis=1, keepdims=False).mode
final_predictions = np.asarray(final_predictions).flatten()

  final_predictions, _ = mode(all_predictions, axis=1)


In [171]:
# Score the majority-vote predictions against the y_test of the first split
# (assuming the same test labels for all seeds).
first_split = seed_train_test_data[0]
y_test_ensemble = first_split[3]
ensemble_accuracy = accuracy_score(y_true=y_test_ensemble, y_pred=final_predictions)
print(f"Ensemble model accuracy: {ensemble_accuracy}")

Ensemble model accuracy: 0.3888888888888889


In [172]:
# Report each individual model's accuracy on the test set of its own split.
# The printed number is the 1-based position in seed_train_test_data.
for i, (X_train, X_test, y_train, y_test) in enumerate(seed_train_test_data):
    seed_model = models[i]
    y_pred = seed_model.predict(X_test)
    seed_accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for seed {i+1}: {seed_accuracy}")

Accuracy for seed 1: 0.5555555555555556
Accuracy for seed 2: 0.3333333333333333
Accuracy for seed 3: 0.4444444444444444
Accuracy for seed 4: 0.3333333333333333
Accuracy for seed 5: 0.5
Accuracy for seed 6: 0.4444444444444444
Accuracy for seed 7: 0.3888888888888889
Accuracy for seed 8: 0.16666666666666666
Accuracy for seed 9: 0.1111111111111111
Accuracy for seed 10: 0.5
Accuracy for seed 11: 0.6111111111111112
Accuracy for seed 12: 0.2222222222222222
Accuracy for seed 13: 0.16666666666666666
Accuracy for seed 14: 0.3333333333333333
Accuracy for seed 15: 0.3333333333333333
Accuracy for seed 16: 0.3888888888888889
Accuracy for seed 17: 0.2222222222222222
Accuracy for seed 18: 0.2777777777777778
Accuracy for seed 19: 0.2222222222222222


In [173]:
# Hyperparameter search for the random forest of a single seed split.
# RandomForestClassifier and GridSearchCV come from the notebook's import cell.

# Define a grid of hyperparameters to search
param_grid = {
    'n_estimators': [50, 60, 75, 90, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 8, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Initialize the classifier
clf = RandomForestClassifier(random_state=42)

# Set up the grid search: 5-fold cross-validated accuracy, all cores
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Position in seed_train_test_data of the split to tune on. Named so it is
# easy to change; it must stay below len(seed_train_test_data).
SEED_INDEX = 18

# Fit the grid search on the training part of that split only
X_train, X_test, y_train, y_test = seed_train_test_data[SEED_INDEX]
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Use the best estimator for further predictions
best_clf = grid_search.best_estimator_

Best parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best score: 0.5277777777777778


In [None]:
# 19 seed regions
# Hyperparameter dictionaries kept as a record, one per line. They use the
# same six keys as `param_grid` in the grid-search cell.
# NOTE(review): presumably each line is a `best_params_` printout pasted from
# a separate run of the grid-search cell with a different seed split - confirm.
# NOTE(review): the header says 19, but only 18 dictionaries are listed here -
# confirm whether one is missing.
# None of these literals is bound to a name, so nothing later can read them.
{'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 75}
{'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 60}
{'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 90}
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
{'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
{'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 75}
{'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 90}
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
{'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 60}
{'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
{'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 50}
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
{'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 75}
{'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 60}
{'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}

In [None]:
# 6 seed regions
# Hyperparameter dictionaries kept as a record, one per line; none of them is
# bound to a name.
# NOTE(review): presumably these are grid-search results for the six-seed
# list that is commented out in the loading cell - confirm.
{'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 75}
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
{'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
{'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 75}
{'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 100}


# Written as a dict literal: the previous ('key'=value, ...) form is not valid
# Python and stopped the cell from running at all.
# NOTE(review): every dictionary recorded above uses 'sqrt' or 'log2' for
# max_features; the 75 here is kept as written but may have been copied from
# n_estimators - confirm the intended value.
params = {'bootstrap': False, 'max_depth': None, 'max_features': 75, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 75}