In [17]:
import math
import pyreadr
import numpy as np
import pandas as pd
import seaborn as sns
import json

import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

In [20]:
### Load MALDI data and helper functions
from tqdm import tqdm
import time

from pymatreader import read_mat

# Read the MATLAB export; the individual fields are pulled out of the result by key below.
dataMaldi = read_mat('data/L1-8_tic_ad_sq.mat')

# Wrap each field in a pandas container for the modelling code further down.
# NOTE(review): 'data_tic' is used as the model input matrix and 'classes' as the
# label source (cancer type, per the original notes) — confirm against the data set docs.
X_values = pd.DataFrame(dataMaldi['data_tic'])
classes = pd.DataFrame(dataMaldi['classes'])
mz_values = pd.DataFrame(dataMaldi['mzVector'])
TMAs = pd.Series(dataMaldi['tmas'])

# Number of rows in the input matrix.
p = X_values.shape[0]

# Binary response: entries equal to 1 in `classes` become 0, everything else becomes 1.
is_class_one = classes == 1
y = pd.DataFrame(np.where(is_class_one, 0, 1))

In [21]:
def recursionStart(spectra):
    """Compute ``[position, persistence]`` pairs for the peaks of one spectrum.

    Interior local maxima and minima are collected first (plateaus are resolved
    by scanning to the right until the value changes). The highest maximum is
    paired with the lowest interior minimum, and the remaining peaks are then
    handled by ``recursionStep`` on the left and on the right of that maximum.

    Parameters
    ----------
    spectra : sequence of numbers
        Intensities, indexable by integer position ``0 .. len(spectra) - 1``.

    Returns
    -------
    list of [int, number]
        One ``[position, persistence]`` entry per processed maximum. Empty when
        the spectrum has no interior local maximum (e.g. fewer than three
        points, or a monotone signal).
    """
    featurePairs = []
    maxima = []
    minima = []
    n = len(spectra)

    # First and last samples are never candidates: both neighbours are needed.
    for i in range(1, n - 1):
        if spectra[i] > spectra[i-1] and spectra[i] >= spectra[i+1]:
            # Walk across a possible plateau; it is a maximum only if the
            # first differing value to the right is lower.
            for j in range(i+1, n):
                if spectra[i] > spectra[j]:
                    maxima.append([i, spectra[i]])
                    break
                elif spectra[i] < spectra[j]:
                    break
        if spectra[i] < spectra[i-1] and spectra[i] <= spectra[i+1]:
            # Mirror image of the plateau scan above, for minima.
            for j in range(i+1, n):
                if spectra[i] < spectra[j]:
                    minima.append([i, spectra[i]])
                    break
                elif spectra[i] > spectra[j]:
                    break

    # Without an interior maximum there is nothing to pair; previously this
    # case failed with IndexError on the pop() below.
    if not maxima:
        return featurePairs

    maxima.sort(key = lambda element: element[1], reverse = True)
    minima.sort(key = lambda element: element[1])

    globalMaxima = maxima.pop(0)
    # A single-peak spectrum has no interior minimum; fall back to the lowest
    # sample overall instead of failing on minima[0].
    baseline = minima[0][1] if minima else min(spectra)
    featurePairs.append([globalMaxima[0], globalMaxima[1] - baseline])

    # Left part is walked left-to-right, right part right-to-left; both end at
    # the global maximum.
    recursionStep(0, globalMaxima[0], maxima.copy(), minima.copy(), featurePairs)
    recursionStep(n - 1, globalMaxima[0], maxima.copy(), minima.copy(), featurePairs)

    return featurePairs

In [22]:
def recursionStep(start, end, maxima, minima, featurePairs):
    """Pair the maxima strictly between ``start`` and ``end`` with minima.

    The highest maximum inside the open interval is paired with the lowest
    minimum lying between that maximum and ``end``; the difference of their
    values is appended to ``featurePairs`` as ``[position, persistence]``.
    The sub-intervals on either side are then processed recursively.

    Parameters
    ----------
    start, end : int
        Interval bounds (exclusive). ``end < start`` means the interval is
        traversed right-to-left.
    maxima : list of [int, number]
        Candidate maxima, sorted by value in descending order.
    minima : list of [int, number]
        Candidate minima, sorted by value in ascending order.
    featurePairs : list
        Output list, extended in place.

    Raises
    ------
    IndexError
        If a maximum is found but no minimum lies between it and ``end``.
    """
    # Multiplying positions by -1 mirrors the axis, so the same
    # "lower < position < upper" test serves both traversal directions.
    factor = -1 if end < start else 1
    lower = start * factor
    upper = end * factor

    currentMaxima = [m for m in maxima if lower < m[0] * factor < upper]
    if not currentMaxima:
        return

    # Input order is by descending value, so the first hit is the highest peak.
    localMaxima = currentMaxima.pop(0)
    recursionStep(start, localMaxima[0], currentMaxima.copy(), minima.copy(), featurePairs)

    # Both bounds must be mirrored. The upper bound used to be the raw `end`,
    # which for right-to-left intervals admitted minima beyond `end` and paired
    # peaks with valleys outside the interval.
    peak = localMaxima[0] * factor
    currentMinima = [m for m in minima if peak < m[0] * factor < upper]
    localMinima = currentMinima.pop(0)
    featurePairs.append([localMaxima[0], localMaxima[1] - localMinima[1]])

    recursionStep(localMinima[0], localMaxima[0], currentMaxima.copy(), currentMinima.copy(), featurePairs)
    recursionStep(localMinima[0], end, currentMaxima.copy(), currentMinima.copy(), featurePairs)

In [23]:
def getPersistenceTransformation(data_X, k):
    """Replace every spectrum by a sparse vector of its most persistent peaks.

    For each row, ``recursionStart`` yields ``[position, persistence]`` pairs.
    The pairs are ranked by persistence, the leading ``round(k * n_pairs)`` are
    kept, and their persistence values are written at their positions into an
    otherwise all-zero vector of the same length as the row.

    Parameters
    ----------
    data_X : pandas.DataFrame or sequence of sequences
        One spectrum per row.
    k : float
        Fraction of the feature pairs to keep for each spectrum.

    Returns
    -------
    list of list
        One transformed vector per input row, in input order.
    """
    # Indexing a DataFrame with an integer selects a column, so iterate over
    # the underlying row arrays instead.
    rows = data_X.to_numpy() if hasattr(data_X, "to_numpy") else data_X

    transformed = []
    for row in rows:
        featurePairs = recursionStart(row)
        # recursionStart returns pairs in discovery order; rank them so the
        # truncation keeps the most persistent peaks.
        ranked = sorted(featurePairs, key=lambda pair: pair[1], reverse=True)
        kept = ranked[0 : round(k * len(ranked))]

        transformation = [0] * len(row)
        for x, fx in kept:
            transformation[x] = fx
        transformed.append(transformation)
    return transformed

In [25]:
from sklearn import preprocessing

# Short aliases for the objects created in the loading cell.
X = X_values
resp = y
tmas = TMAs

# Entries of `tmas` equal to 3 form the test set; all remaining entries are used for training.
test_idx = tmas[tmas == 3].index
train_idx = tmas[tmas != 3].index

X_train = X.iloc[train_idx, :]
y_train = resp.iloc[train_idx, :]

X_test = X.iloc[test_idx, :]
y_test = resp.iloc[test_idx, :]

# The scaler is fitted on the training rows only.
scaler = preprocessing.StandardScaler().fit(X_train)

In [26]:
# Candidate values for C: 0.0001 up to (but excluding) 0.3 in steps of 0.001.
grid = {'C': list(np.arange(0.0001, 0.3, 0.001))}

# L2-penalised logistic regression, seeded for reproducibility.
estimator = LogisticRegression(penalty = 'l2', solver='newton-cg', random_state=12345)

# Apply the scaler fitted on the training rows to those same rows.
X_scaled = scaler.transform(X_train)

In [27]:
# 5-fold grid search over every value in `grid`, scored by accuracy.
# n_jobs=-1 runs the independent (parameter, fold) fits on all available cores;
# the selected parameters and scores are the same as for a serial run, but the
# search finishes sooner (the serial version of this cell had to be interrupted).
gs = GridSearchCV(estimator, param_grid=grid, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_scaled, y_train.values.ravel())
print(gs.best_score_)

KeyboardInterrupt: 

In [None]:
# Row 22 of the input matrix, transposed so it lines up with the m/z axis.
spectrum_22 = np.transpose(X_values.loc[[22]])
plt.plot(mz_values, spectrum_22)
plt.ylabel('Intensity')
plt.xlabel('m/z values')

In [None]:
# Fractions of feature pairs passed as `k` to getPersistenceTransformation.
k1, k2, k3 = 0.01, 0.05, 0.1
k4, k5, k6 = 0.2, 0.25, 0.5

In [None]:
# Persistence-transform the spectra once for every retention fraction.
# X_k3, X_k5 and X_k6 are consumed by the evaluation cells further down, so they
# must be computed here as well; otherwise a fresh run stops with a NameError.
X_k1 = getPersistenceTransformation(data_X = X_values, k = k1)
X_k2 = getPersistenceTransformation(data_X = X_values, k = k2)
X_k3 = getPersistenceTransformation(data_X = X_values, k = k3)
X_k4 = getPersistenceTransformation(data_X = X_values, k = k4)
X_k5 = getPersistenceTransformation(data_X = X_values, k = k5)
X_k6 = getPersistenceTransformation(data_X = X_values, k = k6)

In [None]:
# Row 22 of the raw data and of three transformed variants, each kept as a one-row frame.
row_label = [22]
s1 = X_values.loc[row_label]
s2 = pd.DataFrame(X_k1).loc[row_label]
s3 = pd.DataFrame(X_k2).loc[row_label]
s4 = pd.DataFrame(X_k4).loc[row_label]

In [None]:
## First part of the experiments: 8-fold cross-validation in which each TMA is held out once as the test set.
## The results should correspond to Figures (3) and (4) in the paper by Leuschner et al.
from sklearn.ensemble import RandomForestClassifier
def cvAccuracy(tmas, X, resp, classifier):
    """Evaluate a classifier by holding out one TMA at a time.

    For each TMA number 1..8 the rows whose ``tmas`` entry equals that number
    form the test set and all other rows the training set. A fresh model is
    fitted on every split and its accuracy on the held-out rows is recorded.

    Parameters
    ----------
    tmas : pandas.Series
        TMA number per row; its index labels are used as row positions in
        ``X`` and ``resp``.
    X : pandas.DataFrame
        Input matrix, one row per sample.
    resp : pandas.DataFrame
        Response with one column, one row per sample.
    classifier : str
        ``'log'`` for unpenalised logistic regression, ``'rf'`` for a random
        forest with 500 trees, ``'lasso'`` for a placeholder that only prints
        a message. Any other value produces no results.

    Returns
    -------
    tuple (list of float, list of [str, float])
        The per-TMA accuracies, and the same accuracies paired with the TMA
        label (``'L1:'`` ... ``'L8:'``).
    """
    tmas_names = ['L1:', 'L2:', 'L3:', 'L4:', 'L5:', 'L6:', 'L7:', 'L8:']

    tmas_res_test = []
    tmas_vals_test = []

    for tma in range(1, len(tmas_names) + 1):
        train_idx = tmas[tmas != tma].index
        test_idx = tmas[tmas == tma].index

        X_train = X.iloc[train_idx, :]
        X_test = X.iloc[test_idx, :]
        y_train = resp.iloc[train_idx, :]
        y_test = resp.iloc[test_idx, :]

        if classifier == 'log':
            # NOTE(review): newer scikit-learn releases expect penalty=None
            # instead of the string 'none' — confirm against the installed version.
            logreg = LogisticRegression(penalty = 'none', solver = 'newton-cg')
            logreg.fit(X_train, y_train.values.ravel())
            y_pred = logreg.predict_proba(X_test)
            # Threshold the probability in the second column at 0.5.
            y_pred_1 = np.where(y_pred[:, 1] >= 0.5, 1, 0)
            accuracy = accuracy_score(y_test, y_pred_1)
        elif classifier == 'rf':
            rf = RandomForestClassifier(n_estimators= 500, random_state= 1234)
            rf.fit(X_train, y_train.values.ravel())
            y_pred_rf = rf.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred_rf)
        else:
            if classifier == 'lasso':
                print('I am lasso')
            # Nothing was fitted for this split, so there is nothing to record.
            continue

        # Plain accuracies go into one list and labelled pairs into the other.
        # The 'log' branch used to push both into tmas_vals_test, which broke
        # averaging downstream.
        tmas_vals_test.append(accuracy)
        tmas_res_test.append([tmas_names[tma-1], accuracy])

    return tmas_vals_test, tmas_res_test

def Average(lst):
    """Return the arithmetic mean of the values in ``lst``."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [None]:
# Show the parameter setting selected by the grid search; `gs` must have been fitted first.
gs.best_params_

In [None]:
# Random-forest evaluation on the raw spectra and on five persistence-transformed variants.
rf_raw_1, rf_raw_2 = cvAccuracy(tmas=TMAs, X=X_values, resp=y, classifier='rf')
rf_k1_1, rf_k1_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k1), resp=y, classifier='rf')
rf_k2_1, rf_k2_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k2), resp=y, classifier='rf')
rf_k3_1, rf_k3_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k3), resp=y, classifier='rf')
rf_k4_1, rf_k4_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k4), resp=y, classifier='rf')
rf_k5_1, rf_k5_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k5), resp=y, classifier='rf')

In [None]:
# Mean accuracy per feature set for the random forest, in the order raw, k1 .. k5.
rf_means = [Average(scores) for scores in (rf_raw_1, rf_k1_1, rf_k2_1, rf_k3_1, rf_k4_1, rf_k5_1)]
print(*rf_means)

In [None]:
# Logistic-regression evaluation on the raw spectra.
q1, q2 = cvAccuracy(tmas=TMAs, X=X_values, resp=y, classifier='log')

In [None]:
# Logistic-regression evaluation on the k1 transformation.
res_k1_1, res_k1_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k1), resp=y, classifier='log')

In [None]:
# Logistic-regression evaluation on the k2 transformation.
res_k2_1, res_k2_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k2), resp=y, classifier='log')

In [None]:
# Logistic-regression evaluation on the k3 transformation.
res_k3_1, res_k3_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k3), resp=y, classifier='log')

In [None]:
# Logistic-regression evaluation on the k4 transformation.
res_k4_1, res_k4_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k4), resp=y, classifier='log')

In [None]:
# Logistic-regression evaluation on the k5 transformation.
res_k5_1, res_k5_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k5), resp=y, classifier='log')

In [None]:
# Logistic-regression evaluation on the k6 transformation.
res_k6_1, res_k6_2 = cvAccuracy(tmas=TMAs, X=pd.DataFrame(X_k6), resp=y, classifier='log')

In [None]:
# Mean accuracy per feature set for logistic regression, in the order raw, k1 .. k6.
log_means = [
    Average(scores)
    for scores in (q1, res_k1_1, res_k2_1, res_k3_1, res_k4_1, res_k5_1, res_k6_1)
]
print(*log_means)

In [None]:
# Number of non-zero entries in each of the transformed sample rows.
nonzero_counts = [np.count_nonzero(sample) for sample in (s2, s3, s4)]
print(*nonzero_counts)

In [None]:
# One column of per-TMA accuracies for each feature set. The last column is the
# k6 run; the k5 scores used to be included twice and k6 was missing. `keys`
# labels the columns so the boxes in the plot can be told apart.
results = pd.concat(
    [
        pd.Series(q1),
        pd.Series(res_k1_1),
        pd.Series(res_k2_1),
        pd.Series(res_k3_1),
        pd.Series(res_k4_1),
        pd.Series(res_k5_1),
        pd.Series(res_k6_1),
    ],
    axis=1,
    keys=['raw', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6'],
)

results.boxplot()
#sns.boxplot(results.iloc[:, 0:1])