In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import optuna
import optuna.visualization as ov
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv(r'C:\Users\aksha\Downloads\data.csv(1)\data.csv')
labels = pd.read_csv(r'C:\Users\aksha\Downloads\labels.csv')

In [4]:
data['labels'] = labels['Class']
print(data['labels'].value_counts())
tsne_data = data.drop(columns=['Unnamed: 0', 'labels'], axis=0)

BRCA    300
KIRC    146
LUAD    141
PRAD    136
COAD     78
Name: labels, dtype: int64


In [6]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=3)
tsne_res = tsne.fit_transform(tsne_data)
tsne_res = pd.DataFrame(tsne_res)
tsne_res['labels'] = data['labels']

**DASHBOARD 1**

In [7]:
def get_feature_importance(model, data):
    X = data.drop(columns=['Unnamed: 0', 'labels'], axis=1)
    y = data['labels']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create an empty DataFrame to store feature importances for each class
    all_feature_importances = pd.DataFrame(columns=['Feature', 'Importance', 'Target'])

    # Loop through each class and extract top 5 features
    for cancer_type in y.unique():
        # Fit the model for the current class
        model.fit(X_train[y_train == cancer_type], y_train[y_train == cancer_type])

        # Get feature importances
        feature_importances = model.feature_importances_

        # Create a DataFrame with feature names and their importances
        feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances, 'Target': cancer_type})

        # Sort the DataFrame by importance in descending order
        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

        # Extract top 5 features for the current class
        top_features_for_cancer_type = feature_importance_df.head(10)

        # Append the results to the overall DataFrame
        all_feature_importances = pd.concat([all_feature_importances, top_features_for_cancer_type])

    # Print or inspect the top features for each class
    #print(all_feature_importances)
    top_features_by_cancer_type = all_feature_importances.groupby('Target')['Feature'].apply(lambda x: ', '.join(x.head(10))).reset_index()
    #print(all_feature_importances)
    # Print or use the resulting DataFrame
    outputs = [
        top_features_by_cancer_type.loc[0, 'Target'] + ' : ' + top_features_by_cancer_type.loc[0, 'Feature'],
        top_features_by_cancer_type.loc[1, 'Target'] + ' : ' + top_features_by_cancer_type.loc[1, 'Feature'],
        top_features_by_cancer_type.loc[2, 'Target'] + ' : ' + top_features_by_cancer_type.loc[2, 'Feature'],
        top_features_by_cancer_type.loc[3, 'Target'] + ' : ' + top_features_by_cancer_type.loc[3, 'Feature'],
        top_features_by_cancer_type.loc[4, 'Target'] + ' : ' + top_features_by_cancer_type.loc[4, 'Feature']
    ]
    return outputs

In [9]:
import dash
from dash import html, dcc, callback, Input, Output
import dash_bootstrap_components as dbc
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from umap.umap_ import UMAP
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestClassifier

# Assuming 'data' is your dataset

scaler = Normalizer()
reduction_data = data.drop(columns=['Unnamed: 0', 'labels'], axis=1)
columns = reduction_data.columns
transformed = scaler.fit_transform(reduction_data)
reduction_data = pd.DataFrame(transformed, columns=columns)

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP, dbc.icons.BOOTSTRAP])

# Dashboard Layout
app.layout = dbc.Container([
    dbc.Row([
        dbc.Col(html.H1("Dashboard 1"), width={'size': 6, 'offset': 3})
    ]),

    dbc.Row([
        dbc.Col(
            dcc.Graph(id='target_histogram'),
            width={'size': 6, 'offset': 0}  # Adjust the width of the histogram column
        ),
        dbc.Col([
            dcc.Dropdown(
                options=[
                    {'label': 'BRCA', 'value': 'BRCA'},
                    {'label': 'COAD', 'value': 'COAD'},
                    {'label': 'LUAD', 'value': 'LUAD'},
                    {'label': 'PRAD', 'value': 'PRAD'},
                    {'label': 'KIRC', 'value': 'KIRC'}
                ],
                value='BRCA',
                id='label_type'
            ),
            html.Div(id='imp_feat', style={'width': '100%', 'overflowWrap': 'break-word'})
        ], width={'size': 6, 'offset': 0})  # Adjust the width of the dropdown + imp_feat column
    ]),
    dbc.Row([
        dbc.Col(dcc.Dropdown(
            options=[
                {'label': 'PCA', 'value': 'PCA'},
                {'label': 'LDA', 'value': 'LDA'},
                {'label': 't-SNE', 'value': 'TSNE'},
                {'label': 'UMAP', 'value': 'UMAP'}
            ],
            value='PCA',
            id='method'
        ))
    ]),
    dbc.Row([
        dbc.Col(dcc.Graph(id='feature_graph'))
    ])
])
fig = None

@app.callback(
    Output('feature_graph', 'figure'),
    Input('method', 'value')
)
def update_visualization(method):

    if method == 'PCA':
        pca = PCA(n_components=2)
        pca_res = pca.fit_transform(reduction_data)
        pca_res = pd.DataFrame(pca_res, columns=['Component 1', 'Component 2'])
        pca_res['labels'] = data['labels']
        
        fig = px.scatter(pca_res, x='Component 1', y='Component 2', color='labels', 
                         title='PCA Visualization')
        
        return fig
    
    elif method == 'LDA':
        lda = LDA(n_components=2)
        lda_res = lda.fit_transform(reduction_data, data['labels'])
        lda_res = pd.DataFrame(lda_res, columns=['Component 1', 'Component 2'])
        lda_res['labels'] = data['labels']
        
        fig = px.scatter(lda_res, x='Component 1', y='Component 2', color='labels', 
                         title='LDA Visualization')
        
        return fig
    
    elif method == 'TSNE':
        tsne = TSNE(n_components=2)
        tsne_res = tsne.fit_transform(reduction_data)
        tsne_res = pd.DataFrame(tsne_res, columns=['Component 1', 'Component 2'])
        tsne_res['labels'] = data['labels']
        
        fig = px.scatter(tsne_res, x='Component 1', y='Component 2', color='labels', 
                         title='t-SNE Visualization')
        
        return fig

    elif method == 'UMAP':
        umap = UMAP(n_components=2)
        umap_res = umap.fit_transform(reduction_data)
        umap_res = pd.DataFrame(umap_res, columns=['Component 1', 'Component 2'])
        umap_res['labels'] = data['labels']
        
        fig = px.scatter(umap_res, x='Component 1', y='Component 2', color='labels', 
                         title='UMAP Visualization')
        return fig
    

@app.callback(
    Output('imp_feat', 'children'),
    Input('label_type', 'value')
)
def update_important_features(label_type):
    rf = RandomForestClassifier()  # Initialize the model
    top_features_by_cancer_type = get_feature_importance(rf, data)  # Get feature importance for all labels
    
    # Find the top features for the selected label type
    selected_label_features = [entry for entry in top_features_by_cancer_type if label_type in entry]
    
    # If the selected label type is found in the results
    if selected_label_features:
        selected_label_features = selected_label_features[0]  # Take the first occurrence
        label, features = selected_label_features.split(' : ')
        features_list = features.split(', ')  # Split the features by comma
        
        # Create a numbered list of features
        numbered_features = [f'{i+1}. {feature}' for i, feature in enumerate(features_list)]
        formatted_features = '\n'.join(numbered_features)  # Join the list with line breaks
        
        return html.Div([
            html.Strong(f'Genes important for identifying {label}:'),
            dcc.Markdown(formatted_features)
        ])
    else:
        return html.Div('Label type not found or no feature importance available for this type.')

@app.callback(
    Output('target_histogram', 'figure'),
    Input('method', 'value')
)
def update_target_histogram(method):
    # Assuming 'target' is your target variable
    fig = px.histogram(data, x='labels', title='Histogram of Target Variable')
    fig.update_layout(
        autosize=False,
        width=500,  # Adjust the width as needed
        height=400  # Adjust the height as needed
    )
    return fig

if __name__ == '__main__':
    app.run_server(port=6551,debug=True,jupyter_mode="external")

Dash app running on http://127.0.0.1:6551/


In [12]:
#######################################################################Dashboard 1 ENDS here #################################################################################################################

### Dashboard 2

In [10]:
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

import plotly.graph_objects as go
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

In [11]:
# we fit a given model to data using this function
def fit_model(model):
    model.fit(X_train, y_train)
    return model


# this function will generate classification reports for a given model
def get_classification_report(model):
    model = fit_model(model)
    preds = model.predict(X_test)
    
    # save the classification report as a dict, this gives flexibility to read each indivisual metrics
    cf = classification_report(y_test, preds, output_dict=True, zero_division=0.0)

    # extract f1 scores
    f1_scores = [cf[str(label)]['f1-score'] for label in np.unique(y_test)]

    # Create a Plotly bar chart
    fig = go.Figure(go.Bar(
        x=np.unique(y_test),
        y=f1_scores,
        text=f1_scores,
        textposition='auto',
        marker=dict(color='blue'),
    ))

    # Update layout
    fig.update_layout(
        xaxis=dict(title='Class'),
        yaxis=dict(title='Precision (F1 Score)'),
        title='Precision (F1 Score) for Each Class',
    )

    return fig


# we use one versus many method for generating ROC for multiclass classsification
def get_roc(model):
    # Binarize the labels for each class
    y_train_bin = label_binarize(y_train, classes=np.unique(y_train))
    y_test_bin = label_binarize(y_test, classes=np.unique(y_test))

    # Train your model (replace with your own model)
    model = OneVsRestClassifier(model)  # Replace with your model
    model.fit(X_train, y_train_bin)

    # Get predicted probabilities on the test set
    y_scores = model.predict_proba(X_test)

    # Compute ROC curve and AUC for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(len(np.unique(y_train))):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_scores[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Create a Plotly figure
    fig = go.Figure()

    # Plot the ROC curve for each class
    for i in range(len(np.unique(y_train))):
        fig.add_trace(go.Scatter(x=fpr[i], y=tpr[i], mode='lines',
                                 name=f'Class {i} (AUC = {roc_auc[i]:.2f})'))

    # Plot the diagonal line
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Diagonal',
                             line=dict(dash='dash', color='navy')))

    # Update layout
    fig.update_layout(
        xaxis=dict(title='False Positive Rate'),
        yaxis=dict(title='True Positive Rate'),
        title='Receiver Operating Characteristic (ROC) Curve for Multi-class (One-vs-All)',
        legend=dict(x=0, y=1, traceorder='normal'))

    return fig

In [12]:
def get_cf_roc(model_name):
    cf = None
    roc_fig = None
    # we define classification models based on model name accepted from the dropdown
    
    if model_name == 'Logistic Regreession':
        model = LogisticRegression()
        model = fit_model(model)
        
        # get ROC figure
        roc_fig = get_roc(model)
        
        # get classification report figure
        cf = get_classification_report(model)
        
    if model_name == 'Random Forest Classifier':
        model = RandomForestClassifier()
        model = fit_model(model)
        
        # get ROC figure
        roc_fig = get_roc(model)
        
        # get classification report figure
        cf = get_classification_report(model)
        
    if model_name == 'XGB Classifier':
        model = XGBClassifier(num_class=5)
        model = fit_model(model)
        
        # get ROC figure
        roc_fig = get_roc(model)
        
        # get classification report figure
        cf = get_classification_report(model)
        
    if model_name == 'Bagging Classifier':
        model = BaggingClassifier()
        model = fit_model(model)
        
        # get ROC figure
        roc_fig = get_roc(model)
        
        # get classification report figure
        cf = get_classification_report(model)
        
    if model_name == 'ADA Boost Classfier':
        model = AdaBoostClassifier()
        model = fit_model(model)
        
        # get ROC figure
        roc_fig = get_roc(model)
        
        # get classification report figure
        cf = get_classification_report(model)
        
    if model_name == 'Decision Tree Classifier':
        model = DecisionTreeClassifier()
        model = fit_model(model)
        
        # get ROC figure
        roc_fig = get_roc(model)
        
        # get classification report figure
        cf = get_classification_report(model)
        
    return cf, roc_fig

In [48]:
data.drop('Unnamed: 0', axis = 1, inplace= True)
labels.drop('Unnamed: 0', axis = 1, inplace= True)
X = data
y = labels 
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)
models = {
    'LogisticRegression': {
        'C': (1e-5, 1e5),
    },
    'AdaBoostClassifier': {
        'n_estimators': (50, 200),
        'learning_rate': (0.01, 1.0),
    },
    'BaggingClassifier': {
        'n_estimators': (10, 100),
    },
    'RandomForestClassifier': {  # Corrected class name
        'n_estimators': (10, 200),
        'max_depth': (2, 32),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10),
    },
    'DecisionTreeClassifier': {
        'max_depth': (2, 32),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10),
    },
    'XGBClassifier': {
        'learning_rate': (0.01, 1.0),
        'n_estimators': (50, 200),
        'max_depth': (2, 32),
        'min_child_weight': (1e-5, 1e5),
        'subsample': (0.1, 1.0),
        'colsample_bytree': (0.1, 1.0),
    },
}


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [57]:
from dash import Dash

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
pca = PCA(n_components=2)

scaler = Normalizer()
reduction_data = data.drop(columns=['labels'], axis=1)
columns = reduction_data.columns
transformed = scaler.fit_transform(reduction_data)
reduction_data = pd.DataFrame(transformed, columns=columns)

pca_res = pca.fit_transform(reduction_data)
pca_res = pd.DataFrame(pca_res)
pca_res['labels'] = data['labels']

X = pca_res.drop(columns=['labels'], axis=1)
y = data['labels']
y = enc.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

app = Dash(__name__)
app.layout = html.Div([
    html.Div([
        dcc.Dropdown(
            options=['PCA', 'UMAP', 'TSNE', 'LDA'],
            value='PCA',
            id='method'
        ),
        dcc.Dropdown(
            options=['Logistic Regression', 'Random Forest Classifier', 'XGB Classifier', 'Bagging Classifier', 'ADA Boost Classfier', 'Decision Tree Classifier'],
            value='Random Forest Classifier',
            id='classification_model'
        ),
        html.Div([
            dcc.Graph(id='classification_report'),
            dcc.Graph(id='roc_curve')
        ], style={'display': 'flex'}),
    ]),
    html.Div([
        dcc.Dropdown(
            id='model-dropdown',
            options=[{'label': model_name, 'value': model_name} for model_name in models.keys()],
            value='RandomForestClassifier',  # Initial model
            style={'width': '70%'}
        ),
        dcc.Graph(id='parallel-coordinate-plot'),
    ])
])

@app.callback(
    [Output('classification_report', 'figure'), Output('roc_curve', 'figure')], 
    [Input('method', 'value'), Input('classification_model', 'value')]
)

def get_model_details(method, model_name):
    if method == 'PCA':
        pca = PCA()
        pca.fit(reduction_data)
        cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

        # Determine the number of components to retain
        n_components = np.argmax(cumulative_variance >= 0.995) + 1
        #print('pca : ', n_components)
        pca = PCA(n_components=n_components)
        pca_res = pca.fit_transform(reduction_data)
        pca_res = pd.DataFrame(pca_res)
        pca_res['labels'] = data['labels']

        X = pca_res.drop(columns=['labels'], axis=1)
        y = pca_res['labels']
        y = enc.fit_transform(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    if method == 'UMAP':
        pca = PCA()
        pca.fit(reduction_data)
        cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
        # Determine the number of components to retain
        n_components = np.argmax(cumulative_variance >= 0.995) + 1
        #print('umap : ', n_components)
        umap = UMAP(n_components=n_components)
        umap_res = umap.fit_transform(reduction_data)
        umap_res = pd.DataFrame(umap_res)
        umap_res['labels'] = data['labels']

        X = umap_res.drop(columns=['labels'], axis=1)
        y = umap_res['labels']
        y = enc.fit_transform(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    if method == 'TSNE':
        pca = PCA()
        pca.fit(reduction_data)
        cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
        # Determine the number of components to retain
        n_components = np.argmax(cumulative_variance >= 0.995) + 1
        tsne = TSNE(n_components=2)
        tsne_res = tsne.fit_transform(reduction_data)
        tsne_res = pd.DataFrame(tsne_res)
        tsne_res['labels'] = data['labels']

        X = tsne_res.drop(columns=['labels'], axis=1)
        y = tsne_res['labels']
        y = enc.fit_transform(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    if method == 'LDA':
        lda = LDA(n_components=2)
        lda_res = lda.fit_transform(reduction_data, data['labels'])
        lda_res = pd.DataFrame(lda_res)
        lda_res['labels'] = data['labels']

        X = lda_res.drop(columns=['labels'], axis=1)
        y = lda_res['labels']
        y = enc.fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    cf, roc = get_cf_roc(model_name)
    return cf, roc

study_results = {}

def update_parallel_coordinate_plot(selected_model):
    print(f"Selected Model: {selected_model}")
    print(f"Available Models: {models.keys()}")

for model_name, hyperparameters in models.items():
    def objective(trial):
        params = {}
        for param_name, param_range in hyperparameters.items():
            if param_name in ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']:
                params[param_name] = trial.suggest_int(param_name, *param_range)
            else:
                params[param_name] = trial.suggest_float(param_name, *param_range)

        model = globals()[model_name](**params)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        f1 = f1_score(y_test, predictions, average='weighted')
        return f1

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)
    study_results[model_name] = study

@app.callback(
    Output('parallel-coordinate-plot', 'figure'),
    [Input('model-dropdown', 'value')]
)

def update_parallel_coordinate_plot(selected_model):
    print(f"Selected Model: {selected_model}")
    study = study_results[selected_model]
    fig = ov.plot_parallel_coordinate(study)
    fig.update_layout(title=f'Optimization Results for {selected_model}', autosize=False)
    return fig

app.run(port=9989, debug=True, jupyter_mode='external', allow_duplicate=True)

[I 2023-12-17 14:16:26,911] A new study created in memory with name: no-name-fc452996-6bbe-4ce9-833e-50113744d39f

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

[I 2023-12-17 14:16:27,205] Trial 0 finished with value: 0.9516142184662145 and parameters: {'C': 23821.352369711327}. Best is trial 0 with value: 0.9516142184662145.

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/line

Dash app running on http://127.0.0.1:9989/


Selected Model: RandomForestClassifier
Selected Model: RandomForestClassifier
Selected Model: RandomForestClassifier
Selected Model: XGBClassifier
Selected Model: RandomForestClassifier


In [16]:
################################################################################################################
# dashboard 2 conde ends here