In [None]:
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.inspection import plot_partial_dependence
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support

In [None]:
def import_csv(file):
    """Load a CSV file into a DataFrame and print a short summary of it.

    The summary consists of the first rows, the (rows, columns) shape and,
    per column, the percentage of missing values.

    Parameters
    ----------
    file : path or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed data, unmodified.
    """
    frame = pd.read_csv(file)
    # Share of NaNs per column, rounded to 4 decimals before scaling to percent.
    nan_percentages = frame.isna().mean().round(4) * 100
    for item in (frame.head(), frame.shape, "", "Percentage of nans:", nan_percentages):
        print(item)
    return frame

In [None]:
def pca_elbow_plot(x):
    """Fit a full PCA on standardised data and plot cumulative explained variance.

    The input is standardised with a freshly fitted ``StandardScaler`` and a
    PCA with all components is fitted on the result. The plot shows, for
    k = 1..n components, the cumulative share of variance explained (in %).

    Parameters
    ----------
    x : 2-D array-like
        Feature matrix passed to ``StandardScaler.fit_transform``.

    Returns
    -------
    sklearn.decomposition.PCA
        The PCA fitted on the standardised data. Callers must apply the same
        scaling before calling ``transform`` on it.
    """
    x_scaled = StandardScaler().fit_transform(x)
    pca = PCA()
    pca.fit(x_scaled)

    # explained_variance_ratio_ holds fractions; scale so the "%" label is true.
    cumulative_pct = np.cumsum(pca.explained_variance_ratio_) * 100
    # Entry i of the cumulative sum covers i + 1 components, so the x axis
    # must start at 1 rather than at the default 0-based index.
    component_counts = np.arange(1, len(cumulative_pct) + 1)

    plt.figure()
    plt.plot(component_counts, cumulative_pct)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Variance (%)')
    plt.title('Explained Variance')
    plt.show()
    return pca

In [None]:
def pca_biplot(score, coeff, y, labels=None):
    """Draw a PCA biplot of the first two principal components.

    Samples are scattered at their PC1/PC2 scores, each axis divided by its
    own range, and coloured by ``y``. Every row of ``coeff`` is drawn as a red
    arrow from the origin, with a green text label placed slightly beyond the
    arrow tip.

    Parameters
    ----------
    score : 2-D array
        Projected samples; columns 0 and 1 are used.
    coeff : 2-D array
        One row per original variable; columns 0 and 1 are its loadings.
    y : array-like
        Values passed as the scatter colour argument.
    labels : sequence, optional
        Text for each arrow. Defaults to "Var1", "Var2", ...
    """
    pc1 = score[:, 0]
    pc2 = score[:, 1]
    # Reciprocal of each axis' span, applied as a multiplicative scale.
    pc1_scale = 1.0 / (pc1.max() - pc1.min())
    pc2_scale = 1.0 / (pc2.max() - pc2.min())
    plt.scatter(pc1 * pc1_scale, pc2 * pc2_scale, c=y)

    for idx in range(coeff.shape[0]):
        load_x, load_y = coeff[idx, 0], coeff[idx, 1]
        plt.arrow(0, 0, load_x, load_y, color='r', alpha=0.5)
        caption = "Var" + str(idx + 1) if labels is None else labels[idx]
        # Push the caption 15% past the arrow tip so it does not overlap it.
        plt.text(load_x * 1.15, load_y * 1.15, caption, color='g', ha='center', va='center')

    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()
    plt.show()

In [None]:
def check_class_imba(y):
    """Show a histogram of the target values to eyeball class balance."""
    plt.hist(x=y)
    plt.show()

In [None]:
def rf_feature_ranking(x, y, labels=None, imbalanced=False):
    """Rank features by the importances of a grid-searched random forest.

    A small grid over tree depth and feature subsampling is evaluated with
    5-fold cross-validation; the refitted best forest provides
    ``feature_importances_``, which are sorted in descending order.

    Parameters
    ----------
    x : 2-D array-like
        Feature matrix (samples x features).
    y : array-like
        Target labels.
    labels : sequence, optional
        One name per feature (column of ``x``). Defaults to the column indices.
    imbalanced : bool, default False
        If True, use ``BalancedRandomForestClassifier`` (sampling with
        replacement) instead of ``RandomForestClassifier``.

    Returns
    -------
    ranked_features : list
        Feature labels, most important first.
    ordered_indices : list
        Column indices of ``x`` in the same order.
    best_rf : estimator
        The refitted best forest from the grid search.
    """
    parameters = {
        'n_estimators': [10],
        'max_depth': [1, 7],
        'max_features': [0.8, None]
    }
    if imbalanced:
        rf = BalancedRandomForestClassifier(criterion='entropy', replacement=True)
    else:
        rf = RandomForestClassifier(criterion='entropy')
    rf_cv = GridSearchCV(rf, parameters, cv=5, iid=False, return_train_score=True, refit=True)
    rf_cv.fit(x, y)
    best_rf = rf_cv.best_estimator_

    importances = best_rf.feature_importances_
    # One index per FEATURE. Using the number of rows here only worked through
    # zip() truncation and silently dropped features whenever the data had
    # fewer samples than columns.
    order = np.arange(len(importances))
    if labels is None:
        labels = order

    # Sort once; ties on importance fall back to label, then index.
    ranking = sorted(zip(importances, labels, order), reverse=True)
    ranked_features = [label for _, label, _ in ranking]
    ordered_indices = [index for _, _, index in ranking]
    return ranked_features, ordered_indices, best_rf

In [None]:
def pdp(est, x, feature, feature_names, no, target):
    """Plot partial dependence for the first ``no`` entries of ``feature``.

    Parameters
    ----------
    est : fitted estimator
        Model handed to ``plot_partial_dependence``.
    x : 2-D array-like
        Data the partial dependence is computed on.
    feature : sequence
        Features to plot, e.g. column indices ordered by importance; only the
        first ``no`` are used.
    feature_names : sequence
        Names forwarded to scikit-learn. Per its documentation,
        ``feature_names[i]`` must name column ``i`` of ``x``.
    no : int
        How many leading entries of ``feature`` to plot.
    target : label
        Class forwarded as ``target`` to ``plot_partial_dependence``.
    """
    # feature_names is passed by keyword: later scikit-learn releases make
    # every argument after `features` keyword-only.
    plot_partial_dependence(est, x, feature[:no], feature_names=feature_names, target=target)
    fig = plt.gcf()
    fig.suptitle('Partial dependence')
    # Leave head-room so the suptitle does not overlap the subplots.
    plt.subplots_adjust(top=0.9)
    plt.show()

In [None]:
def pca_lr(x, y, imbalanced=False):
    """Grid-search a scale -> PCA -> logistic-regression pipeline.

    With ``imbalanced=True`` the logistic regression is wrapped in a
    ``BalancedBaggingClassifier`` (sampling with replacement) and the grid
    addresses the wrapped estimator's ``C``.

    Returns
    -------
    (GridSearchCV, pandas.DataFrame)
        The fitted search and its ``cv_results_`` as a frame sorted by
        ``mean_test_score``, best first.
    """
    base_lr = LogisticRegression(solver='lbfgs', multi_class='auto')
    steps = [('scale', StandardScaler()), ('pca', PCA())]

    if imbalanced:
        grid = {
            'bagged_lr__n_estimators': [10],
            'pca__n_components': [0.8],
            'bagged_lr__base_estimator__C': [2 ** -7, 2 ** 7]
        }
        final_step = ('bagged_lr', BalancedBaggingClassifier(base_estimator=base_lr, replacement=True))
    else:
        grid = {
            'pca__n_components': [0.8],
            'lr__C': [2 ** -7, 2 ** 7]
        }
        final_step = ('lr', base_lr)

    pipeline = Pipeline(steps + [final_step])
    search = GridSearchCV(pipeline, grid, cv=5, iid=True, return_train_score=True)
    search.fit(x, y)

    results_table = pd.DataFrame(search.cv_results_)
    results_table = results_table.sort_values(by='mean_test_score', ascending=False)
    return search, results_table

In [None]:
def pca_rf(x, y, imbalanced=False):
    """Grid-search a scale -> PCA -> random-forest pipeline.

    With ``imbalanced=True`` the forest is a ``BalancedRandomForestClassifier``
    (sampling with replacement); otherwise a plain ``RandomForestClassifier``.
    Both use the entropy criterion.

    Returns
    -------
    (GridSearchCV, pandas.DataFrame)
        The fitted (refitted on the best setting) search and its
        ``cv_results_`` as a frame sorted by ``mean_test_score``, best first.
    """
    if imbalanced:
        forest = BalancedRandomForestClassifier(criterion='entropy', replacement=True)
    else:
        forest = RandomForestClassifier(criterion='entropy')

    pipeline = Pipeline([
        ('scale', StandardScaler()),
        ('pca', PCA()),
        ('rf', forest),
    ])
    grid = {
        'pca__n_components': [None, 0.8],
        'rf__n_estimators': [10],
        'rf__max_depth': [1, 7],
        'rf__max_features': [None]
    }

    search = GridSearchCV(pipeline, grid, cv=5, iid=False, return_train_score=True, refit=True)
    search.fit(x, y)

    results_table = pd.DataFrame(search.cv_results_)
    results_table = results_table.sort_values(by='mean_test_score', ascending=False)
    return search, results_table

In [None]:
def pca_rbf_svm(x, y, imbalanced=False):
    """Grid-search a scale -> PCA -> SVC pipeline.

    With ``imbalanced=True`` the SVC is wrapped in a
    ``BalancedBaggingClassifier`` (sampling with replacement) and the grid
    addresses the wrapped estimator's ``C`` and ``gamma``.

    Returns
    -------
    (GridSearchCV, pandas.DataFrame)
        The fitted search and its ``cv_results_`` as a frame sorted by
        ``mean_test_score``, best first.
    """
    base_svm = SVC()
    steps = [('scale', StandardScaler()), ('pca', PCA())]

    if imbalanced:
        # NOTE(review): this branch searches a single gamma value while the
        # plain branch searches two — confirm the asymmetry is intended.
        grid = {
            'bagged_svm__n_estimators': [10],
            'pca__n_components': [0.8],
            'bagged_svm__base_estimator__C': [2 ** -7, 2 ** 7],
            'bagged_svm__base_estimator__gamma': [2 ** 7]
        }
        final_step = ('bagged_svm', BalancedBaggingClassifier(base_estimator=base_svm, replacement=True))
    else:
        grid = {
            'pca__n_components': [0.8],
            'svm__C': [2 ** -7, 2 ** 7],
            'svm__gamma': [2 ** -7, 2 ** 7]
        }
        final_step = ('svm', base_svm)

    pipeline = Pipeline(steps + [final_step])
    search = GridSearchCV(pipeline, grid, cv=5, iid=True, return_train_score=True)
    search.fit(x, y)

    results_table = pd.DataFrame(search.cv_results_)
    results_table = results_table.sort_values(by='mean_test_score', ascending=False)
    return search, results_table

In [None]:
def cal_test_results(_x_train, _x_test, _y_train, _y_test, imbalanced=False):
    """Train the three PCA pipelines and score them on the held-out set.

    The LR, RBF-SVM and RF grid searches are fitted on the training data (in
    that order), then each best model predicts the test set and its weighted
    precision, recall and F1 are collected.

    Returns
    -------
    (list of pandas.DataFrame, pandas.DataFrame)
        The three cross-validation result tables (LR, RBF-SVM, RF) and a
        3x3 frame of test metrics indexed by model name.
    """
    # Fit every search before predicting, in the same order as the rows below.
    fitted = [
        trainer(_x_train, _y_train, imbalanced=imbalanced)
        for trainer in (pca_lr, pca_rbf_svm, pca_rf)
    ]

    test_results = np.zeros((3, 3))
    for row, (search, _) in enumerate(fitted):
        predictions = search.predict(_x_test)
        # The 4th returned element (support) is dropped.
        test_results[row, :] = precision_recall_fscore_support(_y_test, predictions, average='weighted')[:3]

    results_df = pd.DataFrame(
        test_results,
        ['LR', 'RBF-SVM', 'RF'],
        ['Precision', 'Recall', 'F1'],
    )
    return [cv_table for _, cv_table in fitted], results_df

In [None]:
# Seed NumPy's global RNG before any stochastic step, then load the data.
# import_csv also prints a preview, the shape and the per-column NaN share.
np.random.seed(42)
csv_file = 'cancer_df.csv'
dataset = import_csv(csv_file)

In [None]:
# Replace zeros with NaN in the rows labelled 1-4 (a list passed to .loc
# selects on the row axis), overwriting `dataset` in place.
# NOTE(review): if the intent was to clean four *columns*, this needs column
# indexing instead — confirm against the dataset.
dataset.loc[[1, 2, 3, 4]] = dataset.loc[[1, 2, 3, 4]].replace(0, np.nan)

In [None]:
# Mark every negative entry as missing (in-place mutation of `dataset`).
dataset[dataset < 0] = np.nan

In [None]:
# Split the frame into a feature matrix and a target vector: the last column
# is used as the target, every other column as a feature.
dataset_values = dataset.values
x_all = dataset_values[:, :-1]
y_all = dataset_values[:, -1]
# Feature names, in the same order as the columns of x_all.
features = list(dataset)[:-1]

In [None]:
# Fit PCA on the standardised features and show the explained-variance curve.
# NOTE(review): the cleaning cells above can introduce NaNs and nothing imputes
# or drops them before this point — confirm the PCA step tolerates that.
pca_model = pca_elbow_plot(x_all)

In [None]:
# pca_model was fitted on standardised data, so apply the same standardisation
# (a scaler fitted on the same x_all) before projecting.
x_scaled = StandardScaler().fit_transform(x_all)
x_all_pca = pca_model.transform(x_scaled)

In [None]:
# Biplot of the first two components; components_ is transposed so each row
# holds one original feature's loadings on PC1 and PC2.
pca_biplot(x_all_pca[:, 0:2], np.transpose(pca_model.components_[0:2, :]), y_all, labels=features)

In [None]:
# Plot the target distribution. The flag below is set by hand and is passed
# to the model-fitting call later in the notebook.
check_class_imba(y_all)
IMBALANCED = True

In [None]:
# Rank features with a random forest. Use the notebook-wide IMBALANCED flag
# rather than a hard-coded True so this step stays in sync with the models
# trained later.
feature_ranked, indices_ranked, ranked_model = rf_feature_ranking(x_all, y_all, features, imbalanced=IMBALANCED)

In [None]:
# Display the feature names ordered from most to least important.
feature_ranked

In [None]:
# Partial dependence of the four most important features.
# indices_ranked holds ORIGINAL column indices, and scikit-learn labels each
# plot with feature_names[column_index]; the name list must therefore be in
# column order (`features`), not in importance order (`feature_ranked`).
pdp(ranked_model, x_all, indices_ranked, features, 4, target=1)

In [None]:
# Hold out 20% of the samples for testing. No random_state is passed here.
# NOTE(review): reproducibility presumably relies on the np.random.seed call
# earlier in the notebook, and the split is not stratified — confirm both are
# intended for imbalanced data.
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2)

In [None]:
# Fit the LR, RBF-SVM and RF pipelines on the training split and collect
# their cross-validation tables plus weighted test metrics.
cv_result_list, test_results_df = cal_test_results(x_train, x_test, y_train, y_test, IMBALANCED)

In [None]:
# Test-set precision / recall / F1 for each model.
print(test_results_df)