In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import FeatureAgglomeration

# Read your data
df = pd.read_csv('processed_mrna_zscore.csv')
X = df.iloc[:, :-2]
y = df.iloc[:, -1]

def evaluate_features(X, y, selected_features, classifier, k_folds=10):
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    weighted_accuracies = []

    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the classifier on the selected features
        classifier.fit(X_train.iloc[:, selected_features], y_train)

        # Make predictions on the test set
        X_test_transformed = X_test.iloc[:, selected_features]
        y_pred = classifier.predict(X_test_transformed)

        # Calculate and store the weighted accuracy
        weighted_accuracy = accuracy_score(y_test, y_pred, sample_weight=None)
        weighted_accuracies.append(weighted_accuracy)

        # Print class-wise accuracies for each fold
        class_report = classification_report(y_test, y_pred, output_dict=True)
        print(f"\nFold {i + 1} - Class-wise Accuracies:")
        print(class_report)

        print(f"Fold {i + 1} : {weighted_accuracy}")

    # Calculate the average weighted accuracy
    average_weighted_accuracy = np.mean(weighted_accuracies)
    print("Average Weighted Accuracy:", average_weighted_accuracy)

    return average_weighted_accuracy


def feature_selection_feature_agglomeration(X, y, n_clusters=10, k_folds=10):
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    # Initialize FeatureAgglomeration
    agglo = FeatureAgglomeration(n_clusters=n_clusters)

    selected_features = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit FeatureAgglomeration on the training data
        X_train_transformed = agglo.fit_transform(X_train)
        X_test_transformed = agglo.transform(X_test)

        # Select the indices of the transformed features
        selected_indices = np.arange(X_train_transformed.shape[1])

        # Accumulate selected features across folds
        selected_features.extend(selected_indices)

    # Get the unique selected features
    selected_features = np.unique(selected_features)

    # Print and return the top features
    print(f"Top features selected by Feature Agglomeration:")
    print(selected_features)

    return selected_features

# Select top features using Feature Agglomeration
selected_features_agglo = feature_selection_feature_agglomeration(X, y, n_clusters=10, k_folds=10)

# Evaluate the selected features using RandomForestClassifier
accuracy_agglo = evaluate_features(X, y, selected_features_agglo, RandomForestClassifier(n_estimators=100, n_jobs=-1))
print(f"Feature Agglomeration Accuracy with selected features: {accuracy_agglo}")


Top features selected by Feature Agglomeration:
[0 1 2 3 4 5 6 7 8 9]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Fold 1 - Class-wise Accuracies:
{'long': {'precision': 0.8666666666666667, 'recall': 0.9883040935672515, 'f1-score': 0.9234972677595628, 'support': 171}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 21}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'accuracy': 0.8578680203045685, 'macro avg': {'precision': 0.2888888888888889, 'recall': 0.32943469785575047, 'f1-score': 0.30783242258652094, 'support': 197}, 'weighted avg': {'precision': 0.752284263959391, 'recall': 0.8578680203045685, 'f1-score': 0.801614379628859, 'support': 197}}
Fold 1 : 0.8578680203045685


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Fold 2 - Class-wise Accuracies:
{'long': {'precision': 0.868020304568528, 'recall': 1.0, 'f1-score': 0.9293478260869565, 'support': 171}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 21}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'accuracy': 0.868020304568528, 'macro avg': {'precision': 0.2893401015228427, 'recall': 0.3333333333333333, 'f1-score': 0.30978260869565216, 'support': 197}, 'weighted avg': {'precision': 0.75345924914324, 'recall': 0.868020304568528, 'f1-score': 0.8066927830500993, 'support': 197}}
Fold 2 : 0.868020304568528

Fold 3 - Class-wise Accuracies:
{'long': {'precision': 0.868020304568528, 'recall': 1.0, 'f1-score': 0.9293478260869565, 'support': 171}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 21}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'accuracy': 0.868020304568528, 'macro avg': {'precision': 0.2893401015228427, 'recall': 0.3333333333333333,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Fold 3 : 0.868020304568528

Fold 4 - Class-wise Accuracies:
{'long': {'precision': 0.8629441624365483, 'recall': 1.0, 'f1-score': 0.9264305177111716, 'support': 170}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 22}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'accuracy': 0.8629441624365483, 'macro avg': {'precision': 0.2876480541455161, 'recall': 0.3333333333333333, 'f1-score': 0.3088101725703905, 'support': 197}, 'weighted avg': {'precision': 0.7446726274833158, 'recall': 0.8629441624365483, 'f1-score': 0.7994578071619247, 'support': 197}}
Fold 4 : 0.8629441624365483


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Fold 5 - Class-wise Accuracies:
{'long': {'precision': 0.8673469387755102, 'recall': 1.0, 'f1-score': 0.9289617486338798, 'support': 170}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 22}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}, 'accuracy': 0.8673469387755102, 'macro avg': {'precision': 0.2891156462585034, 'recall': 0.3333333333333333, 'f1-score': 0.3096539162112933, 'support': 196}, 'weighted avg': {'precision': 0.7522907122032486, 'recall': 0.8673469387755102, 'f1-score': 0.8057321289171406, 'support': 196}}
Fold 5 : 0.8673469387755102


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Fold 6 - Class-wise Accuracies:
{'long': {'precision': 0.865979381443299, 'recall': 0.9882352941176471, 'f1-score': 0.9230769230769231, 'support': 170}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 22}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}, 'accuracy': 0.8571428571428571, 'macro avg': {'precision': 0.288659793814433, 'recall': 0.32941176470588235, 'f1-score': 0.3076923076923077, 'support': 196}, 'weighted avg': {'precision': 0.7511045655375553, 'recall': 0.8571428571428571, 'f1-score': 0.8006279434850864, 'support': 196}}
Fold 6 : 0.8571428571428571


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Fold 7 - Class-wise Accuracies:
{'long': {'precision': 0.8717948717948718, 'recall': 1.0, 'f1-score': 0.9315068493150686, 'support': 170}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 22}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}, 'accuracy': 0.8673469387755102, 'macro avg': {'precision': 0.2905982905982906, 'recall': 0.3333333333333333, 'f1-score': 0.31050228310502287, 'support': 196}, 'weighted avg': {'precision': 0.7561486132914704, 'recall': 0.8673469387755102, 'f1-score': 0.8079396142018452, 'support': 196}}
Fold 7 : 0.8673469387755102


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Fold 8 - Class-wise Accuracies:
{'long': {'precision': 0.8666666666666667, 'recall': 0.9941176470588236, 'f1-score': 0.9260273972602739, 'support': 170}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 22}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}, 'accuracy': 0.8622448979591837, 'macro avg': {'precision': 0.2888888888888889, 'recall': 0.33137254901960783, 'f1-score': 0.30867579908675796, 'support': 196}, 'weighted avg': {'precision': 0.7517006802721089, 'recall': 0.8622448979591837, 'f1-score': 0.8031870282359519, 'support': 196}}
Fold 8 : 0.8622448979591837


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Fold 9 - Class-wise Accuracies:
{'long': {'precision': 0.8673469387755102, 'recall': 1.0, 'f1-score': 0.9289617486338798, 'support': 170}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 22}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}, 'accuracy': 0.8673469387755102, 'macro avg': {'precision': 0.2891156462585034, 'recall': 0.3333333333333333, 'f1-score': 0.3096539162112933, 'support': 196}, 'weighted avg': {'precision': 0.7522907122032486, 'recall': 0.8673469387755102, 'f1-score': 0.8057321289171406, 'support': 196}}
Fold 9 : 0.8673469387755102

Fold 10 - Class-wise Accuracies:
{'long': {'precision': 0.8666666666666667, 'recall': 0.9941176470588236, 'f1-score': 0.9260273972602739, 'support': 170}, 'medium': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 21}, 'short': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'accuracy': 0.8622448979591837, 'macro avg': {'precision': 0.2888888888888889, 'recal

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

def feature_selection_rfe(X, y, n_features_to_select=15, k_folds=10):
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    # Initialize RandomForestClassifier
    rf_classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Initialize RFE with the classifier
    rfe = RFE(estimator=rf_classifier, n_features_to_select=n_features_to_select)

    selected_features = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit RFE on the training data
        rfe.fit(X_train, y_train)

        # Get the indices of the selected features
        selected_indices = np.where(rfe.support_)[0]

        # Accumulate selected features across folds
        selected_features.extend(selected_indices)

    # Get the unique selected features
    selected_features = np.unique(selected_features)

    # Print and return the top features
    top_features = X.columns[selected_features]
    print(f"Top {n_features_to_select} features selected by RFE:")
    print(top_features)

    return top_features

# Select top features using RFE
selected_features_rfe = feature_selection_rfe(X, y, n_features_to_select=100, k_folds=10)

# Evaluate the selected features using RandomForestClassifier
accuracy_rfe = evaluate_features(X, y, selected_features_rfe, RandomForestClassifier(n_estimators=100, n_jobs=-1))
print(f"RFE Accuracy with selected features: {accuracy_rfe}")
