In [189]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot
import seaborn
from sklearn.feature_selection import RFE
import lime.lime_tabular

In [190]:
# Define folder paths
liver_cancer_folder = os.path.join('.', 'liver_cancer')
platforms_folder = os.path.join('.', 'platforms')

# Dictionary to store datasets for different platforms
platform_datasets = {}

# Dictionary to store liver cancer datasets
liver_cancer_datasets = {}


In [196]:
def process_datasets():
    """
    Process datasets from liver cancer folder and platforms folder.
    
    - Reads CSV files from liver_cancer_folder, converts patient status to binary, and normalizes gene expression data.
    - Reads Excel files from platforms_folder and stores them in platform_datasets dictionary.
    
    Returns:
    - liver_cancer_datasets: Dictionary containing processed liver cancer datasets.
    - platform_datasets: Dictionary containing datasets from different platforms.
    """
    for filename in os.listdir(liver_cancer_folder):
        if filename.endswith('.csv'):
            dataset_path = os.path.join(liver_cancer_folder, filename)
            df = pd.read_csv(dataset_path)

            # Convert patient status to binary
            df['type'] = df['type'].apply(lambda x: 1 if x == 'HCC' else 0)

            # Normalize gene expression data
            gene_cols = df.columns.drop(['type', 'samples'])  # exclude 'samples' column
            scaler = MinMaxScaler()
            df[gene_cols] = scaler.fit_transform(df[gene_cols])
            
            liver_cancer_datasets[filename] = df
    
    for filename in os.listdir(platforms_folder):
        if filename.endswith('.xlsx'):
            dataset_path = os.path.join(platforms_folder, filename)
            df = pd.read_excel(dataset_path)
            platform_datasets[filename] = df

    return liver_cancer_datasets, platform_datasets

liver_cancer_datasets, platform_datasets = process_datasets()


In [197]:
# Create lists to store datasets
liver_cancer_platforms = {
    # GPL571	[HG-U133A_2] Affymetrix Human Genome U133A 2.0 Array
    # GPL3921	[HT_HG-U133A] Affymetrix HT Human Genome U133A Array
    'Liver_GSE14520_U133A.csv': 'GPL571-17391.xlsx',
    # GPL570	[HG-U133_Plus_2] Affymetrix Human Genome U133 Plus 2.0 Array 
    'Liver_GSE62232.csv': 'GPL570-55999.xlsx'}

''' breast_cancer_platforms = {
    # GPL570	[HG-U133_Plus_2] Affymetrix Human Genome U133 Plus 2.0 Array
    'Breast_GSE42568.csv': 'GPL570-55999.xlsx',
    'Breast_GSE26910.csv': 'GPL570-55999.xlsx',
    'Breast_GSE45827.csv': 'GPL570-55999.xlsx',
    # GPL13607	Agilent-028004 SurePrint G3 Human GE 8x60K Microarray (Feature Number version)
    'Breast_GSE59246.csv': 'GPL13607-20416.xlsx',
    'Breast_GSE70947.csv': 'GPL13607-20416.xlsx',
}'''

" breast_cancer_platforms = {\n    # GPL570\t[HG-U133_Plus_2] Affymetrix Human Genome U133 Plus 2.0 Array\n    'Breast_GSE42568.csv': 'GPL570-55999.xlsx',\n    'Breast_GSE26910.csv': 'GPL570-55999.xlsx',\n    'Breast_GSE45827.csv': 'GPL570-55999.xlsx',\n    # GPL13607\tAgilent-028004 SurePrint G3 Human GE 8x60K Microarray (Feature Number version)\n    'Breast_GSE59246.csv': 'GPL13607-20416.xlsx',\n    'Breast_GSE70947.csv': 'GPL13607-20416.xlsx',\n}"

In [198]:
def align_datasets_with_platforms():
    for csv_file, xlsx_file in liver_cancer_platforms.items():
        print(f'Processing {csv_file} with {xlsx_file}')  # Add a print statement to see which files are being processed

        # Get CSV dataset
        csv_df = liver_cancer_datasets[csv_file]

        # Get corresponding platform Excel dataset
        xlsx_df = platform_datasets[xlsx_file]

        # Create a mapping from 'ID' to 'GCC' in the platform dataset
        id_to_gcc = dict(zip(xlsx_df['ID'], xlsx_df['GB_ACC']))

        print(f'Finding mapped genes in {csv_file}')
        # Find common features between dataset and platform's IDs
        common_features = set(csv_df.columns).intersection(set(xlsx_df['ID']))

        print(f'Dropped unmapped genes in {csv_file}')
        # Drop columns from dataset that are not in platform's IDs
        columns_to_drop = [col for col in csv_df.columns if col not in common_features]
        csv_df.drop(columns=columns_to_drop, inplace=True)

        print(f'Replacing ID\'s with GB_ACC\'s in {csv_file}')
        # Replace features with 'GCC' value from platform dataset
        for feature in common_features:
            csv_df[feature] = csv_df[feature].map(id_to_gcc)

align_datasets_with_platforms()

Processing Liver_GSE14520_U133A.csv with GPL571-17391.xlsx
Finding mapped genes in Liver_GSE14520_U133A.csv
Dropped unmapped genes in Liver_GSE14520_U133A.csv
Replacing ID's with GB_ACC's in Liver_GSE14520_U133A.csv
Processing Liver_GSE62232.csv with GPL570-55999.xlsx
Finding mapped genes in Liver_GSE62232.csv
Dropped unmapped genes in Liver_GSE62232.csv
Replacing ID's with GB_ACC's in Liver_GSE62232.csv


In [172]:
# Define cancer types
cancer_types = ['liver_cancer'] # , 'breast_cancer']
cancer_folders = [liver_cancer_folder] # , breast_cancer_folder]

# Create a dictionary to hold the classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=5000),
    "Random Forest": RandomForestClassifier(),
}

# Loop through each cancer type
for i in range(len(cancer_types)):

    print(f"Processing {cancer_types[i]} datasets")

    # Create list to store datasets
    datasets = []

    # Process datasets for current cancer type
    for filename in os.listdir(cancer_folders[i]):
        if filename.endswith('.csv'):
            dataset_path = os.path.join(cancer_folders[i], filename)
            df = pd.read_csv(dataset_path)
            datasets.append(df)

    # Store important features for each classifier
    important_features = {
        "Logistic Regression": [],
        "Random Forest": [],
    }

    # Loop through each dataset
    for df in datasets:

        # Remove the first column as it's not a feature
        df = df.drop(df.columns[0], axis=1)

        # Label encoding for the target classes
        le = LabelEncoder()
        df['type'] = le.fit_transform(df['type'])

        # Splitting into features (X) and target (y)
        X = df.drop('type', axis=1)
        y = df['type']

        # Apply PCA for dimensionality reduction
        n_components = min(X.shape[0], X.shape[1])
        pca = PCA(n_components=n_components) # Adjust based on data
        X_pca = pca.fit_transform(X)

        # Splitting the dataset into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=0)

        # Initialize the explainer
        explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=[f'PC{i}' for i in range(X_train.shape[1])], class_names=le.classes_, discretize_continuous=True)

        # Loop through classifiers
        for name, clf in classifiers.items():
            clf.fit(X_train, y_train)

            # Calculate LIME importances for a sample of instances
            n_samples = min(200, X_train.shape[0])  # Adjust based on your data
            sample_instances = X_train[np.random.choice(X_train.shape[0], n_samples, replace=False)]

            for instance in sample_instances:
                exp = explainer.explain_instance(instance, clf.predict_proba, num_features=X_train.shape[1])
                importances.append(dict(exp.as_list()))

            importance_df = pd.DataFrame(importances)

            # Get top 10 important features
            top_features = importance_df.mean().sort_values(ascending=False).head(10).index
            important_features[name].append(top_features)

    # ...


    # Find and print common important features for each classifier
    for name, features in important_features.items():
        common_features = set(features[0]).intersection(*features)
        print(f"Common important features for {name} in {cancer_types[i]} datasets:")
        print(common_features)

Processing liver_cancer datasets
Common important features for Logistic Regression in liver_cancer datasets:
{'PC88 <= -0.21', 'PC5 > 13.95', 'PC0 <= -41.67', 'PC0 <= -27.53', 'PC1 > 11.97', '-41.67 < PC0 <= -14.72', 'PC86 <= -1.31', 'PC2 > 14.04'}
Common important features for Random Forest in liver_cancer datasets:
{'PC88 <= -0.21', 'PC5 > 13.95', 'PC0 <= -41.67', 'PC0 <= -27.53', 'PC1 > 11.97', '-41.67 < PC0 <= -14.72', 'PC87 > 0.50', 'PC2 > 14.04', 'PC86 <= -1.31'}


In [200]:
# Additional storage for metrics and predictions
metrics = {
    "Logistic Regression": {},
    "Random Forest": {},
}

# Loop through classifiers
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)

    # Predictions
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    # Accuracy
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    # Confusion matrices
    train_conf_matrix = confusion_matrix(y_train, y_train_pred)
    test_conf_matrix = confusion_matrix(y_test, y_test_pred)

    # Store metrics and predictions
    metrics[name]['train_accuracy'] = train_acc
    metrics[name]['test_accuracy'] = test_acc
    metrics[name]['train_confusion_matrix'] = train_conf_matrix
    metrics[name]['test_confusion_matrix'] = test_conf_matrix
    metrics[name]['train_predictions'] = y_train_pred
    metrics[name]['test_predictions'] = y_test_pred

    # Feature importances for Random Forest
    if name == "Random Forest":
        metrics[name]['feature_importances'] = clf.feature_importances_

# Print out metrics and predictions
for name, metric in metrics.items():
    print(f"Classifier: {name}")
    print(f"Train Accuracy: {metric['train_accuracy']}")
    print(f"Test Accuracy: {metric['test_accuracy']}")
    print(f"Train Confusion Matrix: \n{metric['train_confusion_matrix']}")
    print(f"Test Confusion Matrix: \n{metric['test_confusion_matrix']}")
    if name == "Random Forest":
        print(f"Feature Importances: {metric['feature_importances']}")
    print("\n")

Classifier: Logistic Regression
Train Accuracy: 0.9047619047619048
Test Accuracy: 0.8571428571428571
Train Confusion Matrix: 
[[55  2]
 [ 4  2]]
Test Confusion Matrix: 
[[22  2]
 [ 2  2]]


Classifier: Random Forest
Train Accuracy: 1.0
Test Accuracy: 0.9642857142857143
Train Confusion Matrix: 
[[57  0]
 [ 0  6]]
Test Confusion Matrix: 
[[23  1]
 [ 0  4]]
Feature Importances: [0.41893757 0.58106243]




In [175]:
# Create a dictionary to hold the classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=5000),
    "Random Forest": RandomForestClassifier(),
}

# Number of features to select
num_features_to_select = 50  # Adjust based on your preference

# Loop through each cancer type
for i in range(len(cancer_types)):

    print(f"Processing {cancer_types[i]} datasets")

    # Create list to store datasets
    datasets = []

    # Process datasets for current cancer type
    for filename in os.listdir(cancer_folders[i]):
        if filename.endswith('.csv'):
            dataset_path = os.path.join(cancer_folders[i], filename)
            df = pd.read_csv(dataset_path)
            datasets.append(df)

    # Store important features for each classifier
    important_features = {
        "Logistic Regression": [],
        "Random Forest": [],
    }

    # Loop through each dataset
    for df in datasets:

        # Remove the first column as it's not a feature
        df = df.drop(df.columns[0], axis=1)

        # Label encoding for the target classes
        le = LabelEncoder()
        df['type'] = le.fit_transform(df['type'])

        # Splitting into features (X) and target (y)
        X = df.drop('type', axis=1)
        y = df['type']

        # Splitting the dataset into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

        # Loop through classifiers
        for name, clf in classifiers.items():
            # Fit the classifier
            clf.fit(X_train, y_train)

            # Apply RFE for feature selection
            selector = RFE(clf, n_features_to_select=num_features_to_select, step=1)
            selector = selector.fit(X_train, y_train)

            # Get the most important features
            important_features_mask = selector.support_
            important_features[name] = [feature for feature, selected in zip(X_train.columns, important_features_mask) if selected]

    # Find and print common important features for each classifier
    for name, features in important_features.items():
        print(f"Important features for {name} in {cancer_types[i]} datasets:")
        print(features)

Processing liver_cancer datasets


KeyboardInterrupt: 