In [1]:
from pathlib import Path
from ast import literal_eval
from collections import Counter
import pickle
from typing import Tuple

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import tqdm

# Build merge of best results

In [51]:
# last taken adn_circulant_brute_force_14.log
file_name = 'adn_circulant_brute_force_fold_0.log'
data = pd.read_csv(Path('..') / 'results' / 'score2_end' / file_name, sep='|', header=0, names=['mean', 'std', 'scores_cv', 'features'])

In [52]:
data.sort_values('mean', inplace=True, ascending=False)
data.reset_index(drop=True, inplace=True)

In [53]:
nb_best_results = 50000
best_results_tmp = data.loc[: nb_best_results, :]

In [9]:
best_results_file_name = 'best_results_fold_0.log'
best_results = pd.read_csv(Path('..') / 'results' / 'score2_chir' / best_results_file_name, sep='|', header=0, names=['mean', 'std', 'scores_cv', 'features'])

In [11]:
best_results = pd.concat([best_results, best_results_tmp])
best_results.sort_values('mean', inplace=True, ascending=False)
best_results.reset_index(drop=True, inplace=True)

In [12]:
best_results.shape

(100002, 4)

In [54]:
best_results_file_name = 'best_results_fold_0.log'
best_results_tmp.to_csv(Path('..') / 'results' / 'score2_end' / best_results_file_name, sep='|', index=False)

In [10]:
best_results.head(10)

Unnamed: 0,mean,std,scores_cv,features
0,77.7,6.4,[0.77272727 0.86363636 0.77272727 0.80952381 0...,"['7:63000001-64000000_zscore', '9:88000001-890..."
1,76.9,3.7,[0.72727273 0.77272727 0.72727273 0.80952381 0...,"['7:34000001-35000000_zscore', '13:109000001-1..."
2,76.8,3.3,[0.77272727 0.77272727 0.81818182 0.76190476 0...,"['8:11000001-12000000_zscore', '12:68000001-69..."
3,76.8,9.1,[0.72727273 0.77272727 0.86363636 0.85714286 0...,"['3:155000001-156000000_zscore', '17:56000001-..."
4,76.7,6.4,[0.81818182 0.81818182 0.81818182 0.66666667 0...,"['17:68000001-69000000_zscore', '7:94000001-95..."
5,76.0,7.8,[0.72727273 0.68181818 0.77272727 0.9047619 0...,"['6:7000001-8000000_zscore', '3:35000001-36000..."
6,76.0,4.2,[0.68181818 0.77272727 0.77272727 0.80952381 0...,"['3:195000001-196000000_zscore', '4:38000001-3..."
7,76.0,6.0,[0.77272727 0.77272727 0.68181818 0.85714286 0...,"['22:49000001-50000000_zscore', '20:37000001-3..."
8,75.1,5.1,[0.68181818 0.72727273 0.72727273 0.80952381 0...,"['12:104000001-105000000_zscore', '1:217000001..."
9,75.1,7.3,[0.81818182 0.68181818 0.68181818 0.85714286 0...,"['4:128000001-129000000_zscore', '15:57000001-..."


# Load data

In [12]:
data = best_results

In [4]:
file_name = 'best_results_new.log'
data = pd.read_csv(Path('..') / 'results' / 'score2' / file_name, sep='|', header=0, names=['mean', 'std', 'scores_cv', 'features'])

In [5]:
data.sort_values('mean', inplace=True, ascending=False)
data.reset_index(drop=True, inplace=True)

In [26]:
n_cv=1000
nb_feat = 500

features_list = [literal_eval(x) for x in data.loc[:n_cv,  'features'].to_list()]
features_flat_list = [item for sublist in features_list for item in sublist]

c = Counter(features_flat_list)
[x[0] for x in c.most_common()[:nb_feat]]

['21:28000001-29000000_ratio',
 '9:20000001-21000000_ratio',
 '6:61000001-62000000_ratio',
 '5:37000001-38000000_ratio',
 '5:31000001-32000000_ratio',
 '16:77000001-78000000_ratio',
 '22:19000001-20000000_ratio',
 '5:28000001-29000000_ratio',
 '5:26000001-27000000_ratio',
 '21:31000001-32000000_ratio',
 '2:149000001-150000000_ratio',
 '2:232000001-233000000_ratio',
 '21:33000001-34000000_ratio',
 '5:11000001-12000000_ratio',
 '13:68000001-69000000_ratio',
 '5:30000001-31000000_ratio',
 '21:32000001-33000000_ratio',
 '14:104000001-105000000_ratio',
 '21:34000001-35000000_ratio',
 '21:27000001-28000000_ratio',
 '9:71000001-72000000_ratio',
 '13:112000001-113000000_ratio',
 '21:24000001-25000000_ratio',
 '22:21000001-22000000_ratio',
 '5:50000001-51000000_ratio',
 '8:1-1000000_ratio',
 '5:32000001-33000000_ratio',
 '20:55000001-56000000_ratio',
 '3:196000001-197000000_ratio',
 '17:65000001-66000000_ratio',
 '3:171000001-172000000_ratio',
 '3:186000001-187000000_ratio',
 '3:177000001-17800

# Train model - case Train / validation / Test - at diagnostic

## Load Data

In [2]:
data = pd.read_csv(Path('..') / 'results' / 'score2' / 'best_results_new.log', sep='|', header=0, names=['mean', 'std', 'scores_cv', 'features'])
data.sort_values('mean', inplace=True, ascending=False)
data.reset_index(drop=True, inplace=True)

X = pickle.load(open(Path('..') / 'features' / 'score2' / 'X_train.pkl', 'rb')).reset_index(drop=True)
y = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2' / 'y_train.pkl', 'rb')))
y = y.reset_index(drop=True)

X_test = pickle.load(open(Path('..') / 'features' / 'score2' / 'X_test.pkl', 'rb')).reset_index(drop=True)
y_test = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2' / 'y_test.pkl', 'rb')))
y_test = y_test.reset_index(drop=True)

In [3]:
y.value_counts()

0    66
1    53
dtype: int64

In [4]:
y_test.value_counts()

0    28
1    22
dtype: int64

## Find best features

In [5]:
def model_manual_train_test(X_train, y_train, X_test, y_test):
    lr = LogisticRegression(C=0.01, class_weight='balanced', random_state=5)
    ss = StandardScaler().fit(X_train)
    X_train_ss = ss.transform(X_train)
    lr.fit(X_train_ss, y_train)
    y_train_pred = lr.predict(X_train_ss)

    train_acc = accuracy_score(y_train_pred, y_train)
    train_scores = precision_recall_fscore_support(y_train, y_train_pred, labels=[0, 1])

    X_test_ss = ss.transform(X_test)
    y_test_pred = lr.predict(X_test_ss)

    test_acc = accuracy_score(y_test_pred, y_test)
    test_scores = precision_recall_fscore_support(y_test, y_test_pred, labels=[0, 1])
    return train_acc, train_scores, test_acc, test_scores

In [6]:
def compute_cv_on_different_numbers_of_features(X, y, data, nb_features, nb_experiments):
    res = {}
    skf = StratifiedKFold(n_splits=3, random_state=5, shuffle=True)

    for n_cv in tqdm.tqdm(nb_experiments):

        features_list = [literal_eval(x) for x in data.loc[:n_cv,  'features'].to_list()]
        features_flat_list = [item for sublist in features_list for item in sublist]

        c = Counter(features_flat_list)


        for nb_feat in nb_features:
            features_name = [x[0] for x in c.most_common()[:nb_feat]]
            features_name = [x.replace('zscore', 'ratio') for x in features_name]

            X_light = X.loc[:, features_name]

            scores_cv = []
            for i, (idx_train, idx_test) in enumerate(skf.split(X_light, y)):
                train_acc, train_scores, test_acc, test_scores = model_manual_train_test(X_light.loc[idx_train, :],
                                                                                             y.loc[idx_train, 'relapse'].tolist(),
                                                                                             X_light.loc[idx_test, :],
                                                                                             y.loc[idx_test, 'relapse'].tolist())
            scores_cv.append(test_acc)

            res[f'{nb_feat}-{n_cv}'] = np.mean(scores_cv)

    return res
    


In [7]:
y.rename(columns={0: 'relapse'}, inplace=True)
y_test.rename(columns={0: 'relapse'}, inplace=True)

nb_features = range(10, 100)
nb_experiments = range(50, 1000, 50)

res = compute_cv_on_different_numbers_of_features(X, y, data, nb_features, nb_experiments)

100%|██████████████████████████████████████████████████| 19/19 [00:27<00:00,  1.46s/it]


In [8]:
res_df = pd.DataFrame({'nb_feat': list(res.keys()), 'score': list(res.values())})
res_df.sort_values('score', ascending=False)[:50]

Unnamed: 0,nb_feat,score
1423,83-800,0.871795
420,70-250,0.871795
1497,67-850,0.871795
448,98-250,0.871795
447,97-250,0.871795
446,96-250,0.871795
445,95-250,0.871795
444,94-250,0.871795
441,91-250,0.871795
440,90-250,0.871795


## Compute best features

In [9]:
n_cv = 850
nb_feat = 67

features_list = [literal_eval(x) for x in data.loc[:n_cv,  'features'].to_list()]
features_flat_list = [item for sublist in features_list for item in sublist]
c = Counter(features_flat_list)
features_name = [x[0] for x in c.most_common()[:nb_feat]]
features_name = [x.replace('zscore', 'ratio') for x in features_name]

## Result on test set

In [10]:
train_acc, train_scores, test_acc, test_scores = model_manual_train_test(X[features_name], y['relapse'], X_test[features_name], y_test['relapse'])

In [11]:
test_acc

0.74

In [68]:
pd.DataFrame(train_scores, columns=[0, 1], index= ['precision', 'rappel', 'f1score', 'support'])

Unnamed: 0,0,1
precision,0.775,0.897436
rappel,0.939394,0.660377
f1score,0.849315,0.76087
support,66.0,53.0


In [69]:
pd.DataFrame(test_scores, columns=[0, 1], index= ['precision', 'rappel', 'f1score', 'support'])

Unnamed: 0,0,1
precision,0.69697,0.705882
rappel,0.821429,0.545455
f1score,0.754098,0.615385
support,28.0,22.0


### 500 features that appear the most

In [36]:
n_cv=1000
nb_feat = 500

features_list = [literal_eval(x) for x in data.loc[:n_cv,  'features'].to_list()]
features_flat_list = [item for sublist in features_list for item in sublist]

c = Counter(features_flat_list)
[x[0] for x in c.most_common()[:nb_feat]]

['21:28000001-29000000_ratio',
 '9:20000001-21000000_ratio',
 '6:61000001-62000000_ratio',
 '5:37000001-38000000_ratio',
 '5:31000001-32000000_ratio',
 '16:77000001-78000000_ratio',
 '22:19000001-20000000_ratio',
 '5:28000001-29000000_ratio',
 '5:26000001-27000000_ratio',
 '21:31000001-32000000_ratio',
 '2:149000001-150000000_ratio',
 '2:232000001-233000000_ratio',
 '21:33000001-34000000_ratio',
 '5:11000001-12000000_ratio',
 '13:68000001-69000000_ratio',
 '5:30000001-31000000_ratio',
 '21:32000001-33000000_ratio',
 '14:104000001-105000000_ratio',
 '21:34000001-35000000_ratio',
 '21:27000001-28000000_ratio',
 '9:71000001-72000000_ratio',
 '13:112000001-113000000_ratio',
 '21:24000001-25000000_ratio',
 '22:21000001-22000000_ratio',
 '5:50000001-51000000_ratio',
 '8:1-1000000_ratio',
 '5:32000001-33000000_ratio',
 '20:55000001-56000000_ratio',
 '3:196000001-197000000_ratio',
 '17:65000001-66000000_ratio',
 '3:171000001-172000000_ratio',
 '3:186000001-187000000_ratio',
 '3:177000001-17800

# Train model - case Train / validation / Test - at chirurgy
## Load data

In [12]:
data = pd.read_csv(Path('..') / 'results' / 'score2_chir' / 'best_results_fold_0.log', sep='|', header=0, names=['mean', 'std', 'scores_cv', 'features'])
data.sort_values('mean', inplace=True, ascending=False)
data.reset_index(drop=True, inplace=True)

X = pickle.load(open(Path('..') / 'features' / 'score2_time_chir' / 'X_train_0.pkl', 'rb'))
X.reset_index(drop=True, inplace=True)
y = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2_time_chir' / 'y_train_0.pkl', 'rb')))
y.reset_index(drop=True, inplace=True)

X_test = pickle.load(open(Path('..') / 'features' / 'score2_time_chir' / 'X_test_0.pkl', 'rb'))
y_test = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2_time_chir' / 'y_test_0.pkl', 'rb')))

## Find best features

In [14]:
nb_features = range(10, 100)
nb_experiments = range(50, 1000, 50)

res = compute_cv_on_different_numbers_of_features(X, y, data, nb_features, nb_experiments)

100%|██████████████████████████████████████████████████| 19/19 [00:27<00:00,  1.43s/it]


In [15]:
res_df = pd.DataFrame({'nb_feat': list(res.keys()), 'score': list(res.values())})
res_df.sort_values('score', ascending=False)[:50]

Unnamed: 0,nb_feat,score
43,53-50,0.65625
40,50-50,0.65625
42,52-50,0.65625
717,97-400,0.65625
965,75-550,0.625
1352,12-800,0.625
967,77-550,0.625
1662,52-950,0.625
1661,51-950,0.625
1660,50-950,0.625


## Compute best features

In [21]:
n_cv = 50
nb_feat = 50

features_list = [literal_eval(x) for x in data.loc[:n_cv,  'features'].to_list()]
features_flat_list = [item for sublist in features_list for item in sublist]
c = Counter(features_flat_list)
features_name = [x[0] for x in c.most_common()[:nb_feat]]
features_name = [x.replace('zscore', 'ratio') for x in features_name]

## Result on test set

In [22]:
train_acc, train_scores, test_acc, test_scores = model_manual_train_test(X[features_name], y['relapse'], X_test[features_name], y_test['relapse'])

In [23]:
test_acc

0.48717948717948717

In [24]:
pd.DataFrame(train_scores, columns=[0, 1], index= ['precision', 'rappel', 'f1score', 'support'])

Unnamed: 0,0,1
precision,0.803571,0.804878
rappel,0.849057,0.75
f1score,0.825688,0.776471
support,53.0,44.0


In [25]:
pd.DataFrame(test_scores, columns=[0, 1], index= ['precision', 'rappel', 'f1score', 'support'])

Unnamed: 0,0,1
precision,0.545455,0.411765
rappel,0.545455,0.411765
f1score,0.545455,0.411765
support,22.0,17.0


# Train model - case Train / validation / Test - at end of treatment
## Load data

In [26]:
data = pd.read_csv(Path('..') / 'results' / 'score2_end' / 'best_results_fold_0.log', sep='|', header=0, names=['mean', 'std', 'scores_cv', 'features'])
data.sort_values('mean', inplace=True, ascending=False)
data.reset_index(drop=True, inplace=True)

X = pickle.load(open(Path('..') / 'features' / 'score2_time_end' / 'X_train_0.pkl', 'rb'))
X.reset_index(drop=True, inplace=True)
y = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2_time_end' / 'y_train_0.pkl', 'rb')))
y.reset_index(drop=True, inplace=True)

X_test = pickle.load(open(Path('..') / 'features' / 'score2_time_end' / 'X_test_0.pkl', 'rb'))
y_test = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2_time_end' / 'y_test_0.pkl', 'rb')))

## Find best features

In [28]:
nb_features = range(10, 100)
nb_experiments = range(50, 1000, 50)

res = compute_cv_on_different_numbers_of_features(X, y, data, nb_features, nb_experiments)

100%|██████████████████████████████████████████████████| 19/19 [00:25<00:00,  1.36s/it]


In [29]:
res_df = pd.DataFrame({'nb_feat': list(res.keys()), 'score': list(res.values())})
res_df.sort_values('score', ascending=False)[:50]

Unnamed: 0,nb_feat,score
89,99-50,0.709677
288,28-200,0.709677
993,13-600,0.709677
281,21-200,0.709677
992,12-600,0.709677
90,10-100,0.709677
1172,12-700,0.709677
902,12-550,0.709677
903,13-550,0.709677
904,14-550,0.709677


## Compute best features

In [30]:
n_cv = 10
nb_feat = 100

features_list = [literal_eval(x) for x in data.loc[:n_cv,  'features'].to_list()]
features_flat_list = [item for sublist in features_list for item in sublist]
c = Counter(features_flat_list)
features_name = [x[0] for x in c.most_common()[:nb_feat]]
features_name = [x.replace('zscore', 'ratio') for x in features_name]

## Result on test set

In [31]:
train_acc, train_scores, test_acc, test_scores = model_manual_train_test(X[features_name], y['relapse'], X_test[features_name], y_test['relapse'])

In [32]:
test_acc

0.6190476190476191

In [33]:
pd.DataFrame(train_scores, columns=[0, 1], index= ['precision', 'rappel', 'f1score', 'support'])

Unnamed: 0,0,1
precision,0.865385,0.804878
rappel,0.849057,0.825
f1score,0.857143,0.814815
support,53.0,40.0


In [34]:
pd.DataFrame(test_scores, columns=[0, 1], index= ['precision', 'rappel', 'f1score', 'support'])

Unnamed: 0,0,1
precision,0.695652,0.526316
rappel,0.64,0.588235
f1score,0.666667,0.555556
support,25.0,17.0
