In [1]:
import pyreadr
import math
import numpy as np
import pandas as pd
from pymatreader import read_mat
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib
import matplotlib.pyplot as plt
import time

In [2]:
def recursionStart(spectra):
    """Extract ``[position, height difference]`` feature pairs from one spectrum.

    Interior local maxima and minima are located first. A plateau is attributed
    to its left-most sample and only counts when the next differing sample
    confirms the extremum; the first and the last sample are never candidates.
    The highest maximum is paired with the lowest minimum, and
    ``recursionStep`` then pairs the remaining maxima to the left and to the
    right of it.

    Parameters
    ----------
    spectra : sequence of numbers
        Intensities of one spectrum (DataFrame row, list, array, ...).
        Samples are addressed by position.

    Returns
    -------
    list of [int, number]
        One ``[index_of_maximum, maximum - paired_minimum]`` entry per paired
        maximum. Empty when the spectrum has no interior maximum or no
        interior minimum (empty, constant or monotonic input), which
        previously raised ``IndexError``.
    """
    # Materialise once: plain list indexing is positional and far cheaper than
    # repeated element look-ups on a pandas Series inside the scan loops.
    values = list(spectra)
    n = len(values)

    featurePairs = []
    maxima = []
    minima = []

    for i in range(1, n - 1):
        current = values[i]
        if current > values[i - 1] and current >= values[i + 1]:
            # Possible plateau: accept only if the next differing sample is lower.
            for j in range(i + 1, n):
                if current > values[j]:
                    maxima.append([i, current])
                    break
                elif current < values[j]:
                    break
        if current < values[i - 1] and current <= values[i + 1]:
            # Mirror image for minima: the next differing sample must be higher.
            for j in range(i + 1, n):
                if current < values[j]:
                    minima.append([i, current])
                    break
                elif current > values[j]:
                    break

    # Nothing to pair: no peak, or no baseline to measure the peak against.
    if not maxima or not minima:
        return featurePairs

    maxima.sort(key=lambda element: element[1], reverse=True)  # highest first
    minima.sort(key=lambda element: element[1])                # lowest first

    globalMaxima = maxima.pop(0)
    featurePairs.append([globalMaxima[0], globalMaxima[1] - minima[0][1]])

    # Walk outwards from both ends of the spectrum towards the global maximum.
    recursionStep(0, globalMaxima[0], maxima, minima, featurePairs)
    recursionStep(n - 1, globalMaxima[0], maxima, minima, featurePairs)

    return featurePairs

def recursionStep(start, end, maxima, minima, featurePairs):
    """Pair the maxima lying strictly between ``start`` and ``end`` with minima.

    The first in-range entry of ``maxima`` is paired with the first entry of
    ``minima`` that lies strictly between that maximum and ``end``; the pair is
    appended to ``featurePairs`` as ``[max_position, max_value - min_value]``.
    The function then recurses on the three sub-intervals that remain.
    ``recursionStart`` passes lists sorted so that "first" means the highest
    maximum and the lowest minimum.

    Parameters
    ----------
    start, end : int
        Interval bounds (exclusive). ``end < start`` means the interval is
        traversed right-to-left.
    maxima, minima : list of [int, number]
        ``[position, value]`` entries. Neither list is modified.
    featurePairs : list
        Output list, extended in place.

    Raises
    ------
    IndexError
        If no minimum lies between the selected maximum and ``end``.
    """
    # Multiplying positions by -1 lets one set of "<" comparisons serve both
    # traversal directions.
    factor = -1 if end < start else 1
    lower = start * factor
    upper = end * factor

    currentMaxima = [peak for peak in maxima if lower < peak[0] * factor < upper]
    if not currentMaxima:
        return
    localMaxima = currentMaxima.pop(0)

    # The lists are only read by the callee (it filters into fresh lists), so
    # they can be shared between the recursive calls without copying.
    recursionStep(start, localMaxima[0], currentMaxima, minima, featurePairs)

    peakPosition = localMaxima[0] * factor
    currentMinima = [valley for valley in minima
                     if peakPosition < valley[0] * factor < upper]
    localMinima = currentMinima.pop(0)
    featurePairs.append([localMaxima[0], localMaxima[1] - localMinima[1]])

    # Remaining maxima on either side of the chosen minimum.
    recursionStep(localMinima[0], localMaxima[0], currentMaxima, currentMinima, featurePairs)
    recursionStep(localMinima[0], end, currentMaxima, currentMinima, featurePairs)

In [3]:
def getPersistenceTransformation(data_X, listOfK):
    """Build sparse peak representations of every spectrum for several levels k.

    For each row of ``data_X`` the feature pairs returned by ``recursionStart``
    are ranked by their second component (largest first). For every ``k`` in
    ``listOfK`` the top ``round(k * number_of_pairs)`` pairs of a spectrum are
    written into a zero vector at their peak positions.

    Parameters
    ----------
    data_X : pandas.DataFrame
        One spectrum per row.
    listOfK : iterable of float
        Fractions of the ranked pairs to keep.

    Returns
    -------
    list
        ``result[j][i]`` is the transformed spectrum (a list with one entry per
        column of ``data_X``) of row ``i`` for ``listOfK[j]``.
    """
    # Row length is the same for every spectrum; read it once instead of
    # materialising a row Series for every (k, spectrum) combination.
    n_spectra, n_points = data_X.shape

    rankedPairs = []
    for row in range(n_spectra):
        pairs = recursionStart(data_X.iloc[row])
        pairs.sort(key=lambda element: element[1], reverse=True)
        rankedPairs.append(pairs)

    spectrasForEachK = []
    for k in listOfK:
        transformationForSpectra = []
        for pairs in rankedPairs:
            transformation = [0] * n_points
            # Note: round() rounds halves to the nearest even integer.
            keep = round(k * len(pairs))
            for x, fx in pairs[:keep]:
                transformation[x] = fx
            transformationForSpectra.append(transformation)
        spectrasForEachK.append(transformationForSpectra)
    return spectrasForEachK

In [4]:
def cvAccuracy(tmas, X, resp, classifier, ntrees = 150):
    """Leave-one-TMA-out cross-validation with balanced accuracy.

    Each of the eight TMAs is held out once; the model is fitted on the
    remaining samples and scored on the held-out TMA.

    Parameters
    ----------
    tmas : pandas.Series
        TMA number (1-8) of every sample, aligned row by row with ``X``/``resp``.
    X : pandas.DataFrame
        Feature matrix, one sample per row.
    resp : pandas.DataFrame
        Single-column response.
    classifier : {'logit', 'rf'}
        ``'logit'``: unpenalised logistic regression, class 1 predicted when
        its probability exceeds 0.5. ``'rf'``: random forest.
    ntrees : int, default 150
        Number of trees (random forest only).

    Returns
    -------
    (list of float, list of [str, float])
        Balanced accuracy per fold, and the same values labelled with the TMA
        name.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names (previously this
        silently produced empty results).
    """
    if classifier not in ('logit', 'rf'):
        raise ValueError("classifier must be 'logit' or 'rf', got %r" % (classifier,))

    tmas_names = ['TMA_1:', 'TMA_2:', 'TMA_3:', 'TMA_4:', 'TMA_5:', 'TMA_6:', 'TMA_7:', 'TMA_8:']

    tmas_res_test = []
    tmas_vals_test = []

    for tma in range(1, len(tmas_names) + 1):
        # Positional boolean mask: does not rely on the index labels of `tmas`
        # coinciding with row positions of X / resp.
        held_out = (tmas == tma).to_numpy()

        X_train, X_test = X.iloc[~held_out, :], X.iloc[held_out, :]
        y_train, y_test = resp.iloc[~held_out, :], resp.iloc[held_out, :]

        # Hyper-parameters are fixed (not tuned) to limit overfitting.
        if classifier == 'logit':
            np.random.seed(1234)
            model = LogisticRegression(penalty = None, solver = 'newton-cg',
                                       fit_intercept = True, random_state = 1234)
            model.fit(X_train, y_train.values.ravel())
            # Column 1 holds the probability of the second class.
            y_pred = np.where(model.predict_proba(X_test)[:, 1] > 0.5, 1, 0)
        else:
            # NOTE(review): max_features is hard-coded to 41 — confirm that this
            # is the intended value for every feature matrix passed in.
            model = RandomForestClassifier(n_estimators = ntrees, random_state = 1234,
                                           criterion = 'gini', n_jobs = 8, max_features = 41)
            model.fit(X_train, y_train.values.ravel())
            y_pred = model.predict(X_test)

        accuracy = balanced_accuracy_score(y_test, y_pred)
        tmas_vals_test.append(accuracy)
        tmas_res_test.append([tmas_names[tma - 1], accuracy])

    return tmas_vals_test, tmas_res_test

def Average(lst):
    """Return the arithmetic mean of the values in ``lst``."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [5]:
# Load the MALDI data set from the MATLAB file.
dataMaldi = read_mat('data/L1-8_tic_ad_sq.mat')

# Feature matrix used as model input: one spectrum per row
# (presumably TIC-normalised intensities per m/z channel — TODO confirm).
X_values = pd.DataFrame(dataMaldi['data_tic'])
# Class labels (the cancer type); source of the response y below.
classes = pd.DataFrame(dataMaldi['classes'])

# m/z axis; used further down to label the feature-importance plots.
mz_values = pd.DataFrame(dataMaldi['mzVector'])

# TMA number of every spectrum; defines the cross-validation folds.
TMAs   = pd.Series(dataMaldi['tmas'])

# Number of features (columns of X_values).
p = X_values.shape[1]

# Binary response: 0 where the class label equals 1, otherwise 1.
y = pd.DataFrame(np.where(classes == 1, 0, 1))

In [6]:
# Grid of peak-extraction levels k that we experiment with:
# 0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.4 and 0.5.
listOfk = [0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5]

st = time.time()
ListOfXk = getPersistenceTransformation(data_X=X_values, listOfK=listOfk)
et = time.time()
print('Time for the processing: ', et-st)

# One transformed data set per level, in the order of listOfk.
X_k0, X_k1, X_k2, X_k3, X_k4, X_k5, X_k6, X_k7 = ListOfXk

Time for the processing:  248.56335759162903


In [7]:
# Leave-one-TMA-out random forest (1000 trees) on the k = 0.3 features.
rf_k5_1000, rf_k5_2_1000 = cvAccuracy(
    tmas=TMAs,
    X=pd.DataFrame(X_k5),
    resp=y,
    classifier='rf',
    ntrees=1000,
)
# Summary statistics of the per-TMA balanced accuracies.
pd.DataFrame(rf_k5_1000).describe()

Unnamed: 0,0
count,8.0
mean,0.878371
std,0.067213
min,0.774464
25%,0.850903
50%,0.892345
75%,0.925307
max,0.958738


In [None]:
# Leave-one-TMA-out random forest (1000 trees) for every extraction level k and
# for the raw spectra. All levels are run, including k = 0.01 and k = 0.05,
# because the random-forest summary table further down reads
# rf_k0_1000 ... rf_k7_1000 and would otherwise fail on a fresh kernel.
rf_cv_1000 = []
for k, X_k in zip(listOfk, ListOfXk):
    st_k = time.time()
    rf_cv_1000.append(cvAccuracy(tmas=TMAs, resp=y, X=pd.DataFrame(X_k),
                                 classifier='rf', ntrees=1000))
    et_k = time.time()
    print('Time for processing ', k, ': ', et_k - st_k)

# Keep the individual names that the following cells refer to.
((rf_k0_1000, rf_k0_2_1000), (rf_k1_1000, rf_k1_2_1000),
 (rf_k2_1000, rf_k2_2_1000), (rf_k3_1000, rf_k3_2_1000),
 (rf_k4_1000, rf_k4_2_1000), (rf_k5_1000, rf_k5_2_1000),
 (rf_k6_1000, rf_k6_2_1000), (rf_k7_1000, rf_k7_2_1000)) = rf_cv_1000

st8 = time.time()
rf_raw_1000, rf_raw_2_1000 = cvAccuracy(tmas=TMAs, resp=y, X=X_values, classifier='rf', ntrees=1000)
et8 = time.time()
print('Time for processing the raw data: ', et8 - st8)

In [None]:
# Same model with 2000 trees. The results get their own names so that the
# 1000-tree results in rf_k5_1000 / rf_k5_2_1000 (read by the "1000" summary
# table) are not overwritten.
st5 = time.time()
rf_k5_2000, rf_k5_2_2000 = cvAccuracy(tmas=TMAs, resp=y, X=pd.DataFrame(X_k5), classifier='rf', ntrees=2000)
et5 = time.time()
print('Time for processing ', listOfk[5], ': ', et5 - st5)

In [None]:
# Timed leave-one-TMA-out random forest (1000 trees) on the k = 0.3 features.
st5 = time.time()
rf_k5_1000_top, rf_k5_2_1000_top = cvAccuracy(
    tmas=TMAs,
    X=pd.DataFrame(X_k5),
    resp=y,
    classifier='rf',
    ntrees=1000,
)
et5 = time.time()
print('Time for processing ', listOfk[5], ': ', et5 - st5)

In [None]:
# Random forest with 2000 trees on the k = 0.4 features (X_k6).
# NOTE(review): the result names say "1000" although ntrees = 2000; they are
# kept unchanged here in case other code refers to them.
st5 = time.time()
rf_k6_1000_top, rf_k6_2_1000_top = cvAccuracy(tmas=TMAs, resp=y, X=pd.DataFrame(X_k6),
                                              classifier='rf', ntrees=2000)
et5 = time.time()
# Report the level that was actually processed: X_k6 belongs to listOfk[6].
print('Time for processing ', listOfk[6], ': ', et5 - st5)

In [None]:
# Replace the zero entries of the k = 0.3 features by the mean of their column
# (the mean is taken over the original column, zeros included).
X_k5_frame = pd.DataFrame(X_k5)
df = X_k5_frame.mask(X_k5_frame == 0).fillna(X_k5_frame.mean())

In [None]:
# Timed leave-one-TMA-out random forest (1000 trees) on the mean-imputed features.
st5 = time.time()
df_top, df_2_1000_top = cvAccuracy(
    tmas=TMAs,
    X=df,
    resp=y,
    classifier='rf',
    ntrees=1000,
)
et5 = time.time()
print('Time for processing ', listOfk[5], ': ', et5 - st5)

In [None]:
# Summary statistics of the per-TMA balanced accuracies for the mean-imputed features.
pd.DataFrame(df_top).describe()

In [None]:
# NOTE(review): res_k0_1 ... res_k7_1 and q1 are not defined in any cell shown in
# this notebook, so this cell raises NameError on a fresh kernel. Judging by the
# commented-out output path they presumably hold the logistic-regression CV
# accuracies (one per k, q1 for the raw data) — TODO confirm and (re)create them.
# One describe() column per result set, placed side by side.
results_as_tables = pd.concat([pd.DataFrame(res_k0_1).describe(), pd.DataFrame(res_k1_1).describe(),
                              pd.DataFrame(res_k2_1).describe(), pd.DataFrame(res_k3_1).describe(),
                              pd.DataFrame(res_k4_1).describe(), pd.DataFrame(res_k5_1).describe(),
                              pd.DataFrame(res_k6_1).describe(), pd.DataFrame(res_k7_1).describe(),
                               pd.DataFrame(q1).describe()],
                              axis=1)
# Round to three decimals for the report.
results_to_latex = round(results_as_tables, 3)

# Label the columns with the extraction level they belong to.
results_to_latex = results_to_latex.set_axis(['k = 0.01', 'k = 0.05', ' k = 0.1', 'k = 0.2', 
                           'k = 0.25', 'k = 0.3', 'k = 0.4', 'k = 0.5', 'raw'], axis=1)

#pd.DataFrame(results_to_latex).style.to_latex('results/logit_results.tex')

In [None]:
# Display the rounded summary table built in the previous cell.
results_to_latex

In [None]:
# Per-TMA balanced accuracies of the 1000-tree forests, in column order.
rf_scores_1000 = [
    rf_k0_1000, rf_k1_1000, rf_k2_1000, rf_k3_1000, rf_k4_1000,
    rf_k5_1000, rf_k6_1000, rf_k7_1000, rf_raw_1000,
]

# One describe() column per result set, placed side by side.
results_as_tables_rf_1000 = pd.concat(
    [pd.DataFrame(scores).describe() for scores in rf_scores_1000],
    axis=1,
)

results_rf_to_latex_1000 = round(results_as_tables_rf_1000, 3)

rf_column_labels = ['k = 0.01', 'k = 0.05', ' k = 0.1', 'k = 0.2',
                    'k = 0.25', 'k = 0.3', 'k = 0.4', 'k = 0.5', 'raw']
results_rf_to_latex_1000 = results_rf_to_latex_1000.set_axis(rf_column_labels, axis=1)

#pd.DataFrame(results_rf_to_latex_1000).style.to_latex('results/results_rf_to_latex_1000.tex')

In [None]:
# Display the rounded random-forest summary table built in the previous cell.
results_rf_to_latex_1000

In [None]:
# Number of candidate features per split: square root of the number of
# FEATURES (columns). The previous shape[0] used the number of samples.
mtry = round(math.sqrt(X_values.shape[1]))

feature_importance_best_model = RandomForestClassifier(n_estimators= 1000, random_state= 1234, criterion = 'gini',
                                        max_features=mtry)

# Fit on all samples of the k = 0.3 features to obtain feature importances.
st = time.time()
feature_importance_best_model.fit(pd.DataFrame(X_k5), y.values.ravel())
et = time.time()
print('Time for fitting the k = 0.3 model: ', et - st)

In [None]:
# Number of candidate features per split: square root of the number of
# FEATURES (columns). The previous shape[0] used the number of samples.
mtry = round(math.sqrt(X_values.shape[1]))

feature_importance_best_model_k_40 = RandomForestClassifier(n_estimators= 1000, random_state= 1234, criterion = 'gini',
                                        max_features=mtry)

# Fit on all samples of the k = 0.4 features to obtain feature importances.
st = time.time()
feature_importance_best_model_k_40.fit(pd.DataFrame(X_k6), y.values.ravel())
et = time.time()
print('Time for fitting the k = 0.4 model: ', et - st)

In [None]:
# Number of candidate features per split: square root of the number of
# FEATURES (columns). The previous shape[0] used the number of samples.
mtry = round(math.sqrt(X_values.shape[1]))

feature_importance_raw = RandomForestClassifier(n_estimators= 1000, random_state= 1234, criterion = 'gini',
                                        max_features=mtry)

# Fit on all samples of the raw spectra to obtain feature importances.
st = time.time()
feature_importance_raw.fit(pd.DataFrame(X_values), y.values.ravel())
et = time.time()

print('Time for processing the raw data: ', et - st)

In [None]:
# Mean decrease in impurity per feature, with the spread across the trees.
importances = feature_importance_best_model.feature_importances_
forest_importances = pd.Series(importances)
per_tree_importances = [tree.feature_importances_
                        for tree in feature_importance_best_model.estimators_]
std = np.std(per_tree_importances, axis=0)

# Label every 300th bar with its rounded m/z value.
tick_positions = np.arange(len(mz_values))[::300].round()
tick_labels = mz_values.to_numpy()[:, 0][::300].round().astype(int)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
plt.xticks(tick_positions, labels=tick_labels)

ax.set_title("Feature importances k = 30%")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
# m/z values with their importance in the k = 0.3 model, most important first.
best_mod_impt = (
    pd.DataFrame(dataMaldi['mzVector'])
    .assign(imp=importances)
    .sort_values(by='imp', ascending=False)
)

# The 30 most important m/z values.
best_mod_impt.head(30)

In [None]:
# Mean decrease in impurity per feature for the raw-data forest.
importances_raw = feature_importance_raw.feature_importances_
forest_importances_raw = pd.Series(importances_raw)
per_tree_importances_raw = [tree.feature_importances_
                            for tree in feature_importance_raw.estimators_]
std = np.std(per_tree_importances_raw, axis=0)

# Label every 300th bar with its rounded m/z value.
tick_positions = np.arange(len(mz_values))[::300].round()
tick_labels = mz_values.to_numpy()[:, 0][::300].round().astype(int)

fig, ax = plt.subplots()
forest_importances_raw.plot.bar(yerr=std, ax=ax)
plt.xticks(tick_positions, labels=tick_labels)

ax.set_title("Feature importances Raw Data")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Hold out TMA 6 as the test set; every other TMA is used for training.
train_rows = TMAs[TMAs != 6].index
test_rows = TMAs[TMAs == 6].index
features_k5 = pd.DataFrame(X_k5)

X_train, X_test = features_k5.iloc[train_rows, :], features_k5.iloc[test_rows, :]
y_train, y_test = y.iloc[train_rows, :], y.iloc[test_rows, :]



In [None]:
## Fine-tune the random forest for the best model (randomized search)
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in every candidate forest
n_estimators = 2000
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in a tree (None = no depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6]
# Method of selecting samples for training each tree; not searched over
# (bootstrapping is the estimator's default)
bootstrap = [True]
# Split-quality criteria to try
criterion = ["gini", "entropy", "log_loss"]

# Search space sampled by the randomized search
random_grid = {
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion':      criterion
               }

# n_estimators was defined above but never handed to the estimator, so the
# search silently ran with the default forest size; pass it explicitly.
rfc = RandomForestClassifier(n_estimators=n_estimators, random_state=1234)

rf_random = RandomizedSearchCV(estimator = rfc,
                               param_distributions = random_grid,
                               n_iter = 100, cv = 5, verbose=2,
                               random_state=1234, n_jobs = -1, scoring = "balanced_accuracy")

In [None]:
# Run the randomized hyper-parameter search on the training split.
rf_random.fit(X_train, y_train.values.ravel())


In [None]:

# Hyper-parameter combination selected by the search.
rf_random.best_params_


In [None]:
# Estimator configured with the selected hyper-parameters.
rf_random.best_estimator_

In [None]:
# Raw-data baseline with 2000 trees.
# NOTE(review): the results are stored in rf_raw_1000 / rf_raw_2_1000 although
# ntrees = 2000, replacing the 1000-tree results computed earlier; the elapsed
# time (et8 - st8) is measured but not printed in this cell.
st8 = time.time()
rf_raw_1000, rf_raw_2_1000 = cvAccuracy(tmas =TMAs, resp=y, X = X_values, classifier='rf', ntrees = 2000)
et8 = time.time()

In [None]:
# Mean balanced accuracy over the TMA folds for the raw-data model.
Average(rf_raw_1000)

In [None]:
# Leave-one-TMA-out random forest with 2000 trees on the k = 0.3 features.
# Both results carry "2000" in their name so that the 1000-tree table in
# rf_k5_2_1000 is not overwritten.
st8 = time.time()
rf_k5_2000, rf_k5_2_2000 = cvAccuracy(tmas=TMAs, resp=y, X=pd.DataFrame(X_k5), classifier='rf', ntrees=2000)
et8 = time.time()

# This run uses X_k5, not the raw data, so label the timing accordingly.
print('Time for processing ', listOfk[5], ': ', et8 - st8)

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def cvAccuracyTun(tmas, X, resp, trees):
    """Leave-one-TMA-out evaluation of a random forest tuned inside every fold.

    For each of the eight TMAs the samples of that TMA are held out, a
    randomized hyper-parameter search (50 candidates, 5-fold CV, scored by F1)
    is run on the remaining samples, and the selected model is scored on the
    held-out TMA with balanced accuracy. The selected parameters of every fold
    are printed.

    Parameters
    ----------
    tmas : pandas.Series
        TMA number (1-8) of every sample, aligned row by row with ``X``/``resp``.
    X : pandas.DataFrame
        Feature matrix, one sample per row.
    resp : pandas.DataFrame
        Single-column response.
    trees : int
        Number of trees of every candidate forest. (This argument used to be
        ignored, so the forests were built with the estimator's default size.)

    Returns
    -------
    (list of float, list of [str, float])
        Balanced accuracy per fold, and the same values labelled with the TMA
        name.
    """
    tmas_names = ['TMA_1:', 'TMA_2:', 'TMA_3:', 'TMA_4:', 'TMA_5:', 'TMA_6:', 'TMA_7:', 'TMA_8:']

    tmas_res_test = []
    tmas_vals_test = []

    # Taken from the data that is actually passed in, not from a notebook global.
    n_features = X.shape[1]

    # Maximum number of levels in a tree (None = no depth limit)
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)

    # The search space is identical for every fold, so it is built once.
    # min_samples_split / min_samples_leaf used to be defined but left out of
    # the grid, which instead listed 'max_depth' twice.
    random_grid = {
        # Number of features to consider at every split
        'max_features': ['log2', 'sqrt', round(n_features/10),
                         round(n_features/12), round(n_features/15)],
        'max_depth': max_depth,
        # Minimum number of samples required to split a node
        'min_samples_split': [2, 5, 10, 15, 20],
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': [1, 2, 4, 6],
    }

    for tma in range(1, len(tmas_names) + 1):
        # Positional boolean mask: does not rely on the index labels of `tmas`
        # coinciding with row positions of X / resp.
        held_out = (tmas == tma).to_numpy()

        X_train, X_test = X.iloc[~held_out, :], X.iloc[held_out, :]
        y_train, y_test = resp.iloc[~held_out, :], resp.iloc[held_out, :]

        rfc = RandomForestClassifier(n_estimators=trees, random_state=1234)

        rf_random = RandomizedSearchCV(estimator = rfc,
                               param_distributions = random_grid,
                               n_iter = 50,
                               cv = 5, verbose = 3,
                               random_state=1234,
                               n_jobs = -3,
                               scoring = "f1")

        rf_random.fit(X_train, y_train.values.ravel())

        print(rf_random.best_params_)

        # predict() uses the model refitted with the selected parameters.
        y_pred_rf = rf_random.predict(X_test)

        accuracy = balanced_accuracy_score(y_test, y_pred_rf)
        tmas_vals_test.append(accuracy)
        tmas_res_test.append([tmas_names[tma - 1], accuracy])

    return tmas_vals_test, tmas_res_test

In [None]:
# Per-fold tuned random forest on the k = 0.3 features; tune_data is the tuple
# (per-TMA balanced accuracies, [TMA label, accuracy] pairs).
tune_data = cvAccuracyTun(tmas =TMAs, resp=y, X = pd.DataFrame(X_k5), trees = 1000)

In [None]:
# Summary statistics of the per-TMA balanced accuracies of the tuned models.
pd.DataFrame(tune_data[0]).describe()

In [None]:
# Mean balanced accuracy over the TMA folds of the tuned models.
Average(tune_data[0])

In [None]:
# Balanced accuracy of every individual TMA fold.
tune_data[0]