In [2]:
import sys
# Install rfpimp by running pip as a module of the interpreter that backs this
# kernel (sys.executable), so the package lands in the kernel's own environment.
# NOTE(review): the version is not pinned, so a re-run may pull a different release.
!{sys.executable} -m pip install rfpimp



In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from rfpimp import *
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Read the expression matrix and the label table; both are indexed by SampID.
gene_expression = pd.read_csv(
    '../Data/Processed/NSCLC_expression_model_training.csv',
    index_col='SampID',
)
clinical_data = pd.read_csv(
    '../Data/Processed/NSCLC_labels_model_training.csv',
    index_col='SampID',
)

# Expression joined with every label column (index-to-index join).
expression_clincov = gene_expression.merge(
    clinical_data, left_index=True, right_index=True
)
# Expression joined with only the 'Condition' column; the left join keeps every
# row of gene_expression.
expression_condition = gene_expression.merge(
    clinical_data[['Condition']], on='SampID', how='left'
)

print(f"Shape of merged data: {expression_condition.shape}")

Shape of merged data: (1109, 6142)


In [5]:
# Turn the "Condition" column into integer class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(expression_condition['Condition'])

# Feature matrix: every merged column except the label itself.
X = expression_condition.drop('Condition', axis=1)

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
def chi_squared_feature_selection(X, y, k=100):
    """Pick ``k`` columns of ``X`` with ``SelectKBest`` using the ``chi2`` score.

    Args:
        X: DataFrame of features; its column labels are what gets returned.
        y: Target labels passed to the selector's ``fit``.
        k: Number of columns to keep (default 100).

    Returns:
        list: Labels of the selected columns, in the order they appear in ``X``.
    """
    # NOTE(review): chi2 is normally meant for non-negative features - confirm
    # the expression values satisfy that.
    selector = SelectKBest(chi2, k=k)
    selector.fit(X, y)
    keep_mask = selector.get_support()
    chosen = X.columns[keep_mask]
    return chosen.tolist()

def mutual_information_feature_selection(X, y, k=100, random_state=42):
    """Pick ``k`` columns of ``X`` with ``SelectKBest`` using mutual information.

    ``mutual_info_classif`` is a randomized estimator. The seed is bound to the
    score function so that repeated calls on the same data return the same
    feature list, matching the fixed seeds used by the other selectors here.

    Args:
        X: DataFrame of features; its column labels are what gets returned.
        y: Target labels passed to the selector's ``fit``.
        k: Number of columns to keep (default 100).
        random_state: Seed forwarded to ``mutual_info_classif`` (default 42).

    Returns:
        list: Labels of the selected columns, in the order they appear in ``X``.
    """
    from functools import partial

    # SelectKBest calls score_func(X, y); partial pins the seed for that call.
    seeded_score = partial(mutual_info_classif, random_state=random_state)
    mi_selector = SelectKBest(seeded_score, k=k)
    mi_selector.fit(X, y)
    return X.columns[mi_selector.get_support()].tolist()

def random_forest_feature_importance(X, y, k=100):
    """Rank columns of ``X`` by a random forest's ``feature_importances_``.

    A ``RandomForestClassifier`` seeded with 42 is fit on ``(X, y)`` and the
    columns are ordered from most to least important.

    Args:
        X: DataFrame of features; its column labels are what gets returned.
        y: Target labels used to fit the forest.
        k: Maximum number of column labels to return (default 100).

    Returns:
        list: Up to ``k`` column labels, highest importance first.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X, y)
    # argsort sorts ascending, so reverse it before taking the first k positions.
    descending_positions = np.argsort(forest.feature_importances_)[::-1]
    top_positions = descending_positions[:k]
    return X.columns[top_positions].tolist()

def permutation_feature_importance(X, y, k=100):
    """Rank columns of ``X`` by mean permutation importance under a random forest.

    A ``RandomForestClassifier`` seeded with 42 is fit on ``(X, y)``, and
    ``permutation_importance`` is then run on that same ``(X, y)`` with 10
    repeats and seed 42.

    Args:
        X: DataFrame of features; its column labels are what gets returned.
        y: Target labels used for both fitting and the permutation scoring.
        k: Maximum number of column labels to return (default 100).

    Returns:
        list: Up to ``k`` column labels, highest mean importance first.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X, y)
    perm = permutation_importance(forest, X, y, n_repeats=10, random_state=42)
    # Ascending argsort reversed gives positions from most to least important.
    descending_positions = perm.importances_mean.argsort()[::-1]
    return X.columns[descending_positions[:k]].tolist()

def rfpimp_feature_selection(X, y_train, X_test, y_test, n_features=100):
    """Rank features with rfpimp's ``importances`` and return the top ones.

    A ``RandomForestClassifier`` seeded with 42 is fit on ``(X, y_train)``, and
    rfpimp's ``importances`` is evaluated on ``(X_test, y_test)``.

    The forest is fit on the ``X`` argument. Earlier code read a notebook-level
    ``X_train`` variable instead, so the function silently ignored its first
    parameter and failed on a fresh kernel without that global.

    Args:
        X: Training feature DataFrame used to fit the forest.
        y_train: Training labels aligned with ``X``.
        X_test: Feature DataFrame on which importances are measured.
        y_test: Labels aligned with ``X_test``.
        n_features: Maximum number of feature names to return (default 100).

    Returns:
        list: Index labels of the rows with the largest 'Importance' values.

    Raises:
        ValueError: If ``importances`` does not return a DataFrame that has an
            'Importance' column.
    """
    # NOTE(review): if the (X_test, y_test) passed here is also used to score the
    # final model, the selection step has seen the evaluation data - confirm at
    # the call site.
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X, y_train)
    # n_samples=-1 presumably asks rfpimp to use every row - verify against the
    # rfpimp documentation.
    imp = importances(rf, X_test, y_test, n_samples=-1)
    # Ensure imp is a DataFrame with 'Feature' as index and 'Importance' as a column
    if not isinstance(imp, pd.DataFrame) or 'Importance' not in imp.columns:
        raise ValueError("imp should be a DataFrame with 'Importance' column")
    return imp.nlargest(n_features, 'Importance').index.tolist()

In [7]:
def train_and_evaluate(X_train, X_test, y_train, y_test, features):
    """Score a random forest restricted to ``features``.

    The classifier (seed 42) is cross-validated with 5 folds on the training
    columns, then fit on the whole training set and used to predict the test set.

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame.
        y_train: Training labels.
        y_test: Test labels.
        features: Column labels to keep from both DataFrames.

    Returns:
        dict: ``cv_score_mean`` and ``cv_score_std`` from the 5-fold scores,
        plus ``test_accuracy``, ``test_precision``, ``test_recall`` and
        ``test_f1`` (the last three with ``average='weighted'``).
    """
    # NOTE(review): `features` is chosen before this call, so the CV folds here
    # do not re-run feature selection - the CV score may be optimistic.
    train_subset = X_train[features]
    model = RandomForestClassifier(random_state=42)

    # cross_val_score is called before the final fit on the full training set.
    fold_scores = cross_val_score(model, train_subset, y_train, cv=5)

    model.fit(train_subset, y_train)
    predictions = model.predict(X_test[features])

    summary = {
        'cv_score_mean': np.mean(fold_scores),
        'cv_score_std': np.std(fold_scores),
        'test_accuracy': accuracy_score(y_test, predictions),
    }
    # The three class-weighted metrics share one call shape.
    for key, scorer in (
        ('test_precision', precision_score),
        ('test_recall', recall_score),
        ('test_f1', f1_score),
    ):
        summary[key] = scorer(y_test, predictions, average='weighted')
    return summary

In [17]:
# Main execution: run each selector, then score a forest on the chosen features.
feature_selection_methods = [
    ('Chi-Squared', chi_squared_feature_selection),
    ('Mutual Information', mutual_information_feature_selection),
    ('Random Forest', random_forest_feature_importance),
    ('Permutation', permutation_feature_importance),
    ('RFPIMP', rfpimp_feature_selection),
]

results = {}

for name, method in feature_selection_methods:
    if name != 'RFPIMP':
        # Every other selector takes only the training data.
        selected_features = method(X_train, y_train)
    else:
        # NOTE(review): RFPIMP picks its features using X_test / y_test, and the
        # same split is scored by train_and_evaluate below, so its test metrics
        # are computed on data the selection step has already seen.
        selected_features = method(X_train, y_train, X_test, y_test)
    results[name] = train_and_evaluate(
        X_train, X_test, y_train, y_test, selected_features
    )

# Print one block per method, every metric at four decimals.
for method, metrics in results.items():
    print(f"\nResults for {method}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


Results for Chi-Squared:
cv_score_mean: 0.9402
cv_score_std: 0.0181
test_accuracy: 0.9550
test_precision: 0.9573
test_recall: 0.9550
test_f1: 0.9548

Results for Mutual Information:
cv_score_mean: 0.9504
cv_score_std: 0.0136
test_accuracy: 0.9685
test_precision: 0.9688
test_recall: 0.9685
test_f1: 0.9684

Results for Random Forest:
cv_score_mean: 0.9515
cv_score_std: 0.0162
test_accuracy: 0.9685
test_precision: 0.9694
test_recall: 0.9685
test_f1: 0.9684

Results for Permutation:
cv_score_mean: 0.9414
cv_score_std: 0.0117
test_accuracy: 0.9505
test_precision: 0.9513
test_recall: 0.9505
test_f1: 0.9504

Results for RFPIMP:
cv_score_mean: 0.9459
cv_score_std: 0.0146
test_accuracy: 0.9369
test_precision: 0.9381
test_recall: 0.9369
test_f1: 0.9368


In [10]:
# Recompute two of the feature lists on the training split so they can be compared.
random_forest_features = random_forest_feature_importance(X=X_train, y=y_train)
mutual_information_features = mutual_information_feature_selection(X=X_train, y=y_train)

In [11]:
# Count of feature names that appear in both lists.
len(set(random_forest_features).intersection(mutual_information_features))

50