In [10]:
from pathlib import Path
from ast import literal_eval
from collections import Counter
import pickle
from typing import Tuple

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import tqdm

# Build merge of best results

In [3]:
# last taken adn_circulant_brute_force_14.log
file_name = 'adn_circulant_brute_force_1.log'
data = pd.read_csv(Path('..') / 'results' / 'score2' / file_name, sep='|', header=0, names=['mean', 'std', 'scores_cv', 'features'])

In [4]:
data.sort_values('mean', inplace=True, ascending=False)
data.reset_index(drop=True, inplace=True)

In [5]:
nb_best_results = 50000
best_results_tmp = data.loc[: nb_best_results, :]

In [6]:
best_results_file_name = 'best_results_new.log'
best_results = pd.read_csv(Path('..') / 'results' / 'score2' / best_results_file_name, sep='|', header=0, names=['mean', 'std', 'scores_cv', 'features'])

In [7]:
best_results.shape

(50001, 4)

In [8]:
best_results = pd.concat([best_results, best_results_tmp])
best_results.sort_values('mean', inplace=True, ascending=False)
best_results.reset_index(drop=True, inplace=True)

In [9]:
best_results.shape

(100002, 4)

In [11]:
best_results_file_name = 'best_results_new.log'
best_results.to_csv(Path('..') / 'results' / 'score2' / best_results_file_name, sep='|', index=False)

# Load data

In [12]:
data = best_results

In [3]:
file_name = 'best_results.log'
data = pd.read_csv(Path('..') / 'results' / 'score2' / file_name, sep='|', header=0, names=['mean', 'std', 'scores_cv', 'features'])

In [4]:
data.sort_values('mean', inplace=True, ascending=False)
data.reset_index(drop=True, inplace=True)

In [5]:
data.head(100)

Unnamed: 0,mean,std,scores_cv,features
0,74.0,4.6,[0.70833333 0.75 0.70833333 0.70833333 0...,"['10:101000001-102000000_ratio', '10:11000001-..."
1,73.2,4.8,[0.70833333 0.79166667 0.66666667 0.70833333 0...,"['3:134000001-135000000_ratio', '15:78000001-7..."
2,73.1,4.2,[0.70833333 0.79166667 0.75 0.66666667 0...,"['5:31000001-32000000_ratio', '3:22000001-2300..."
3,73.1,3.6,[0.70833333 0.79166667 0.70833333 0.75 0...,"['18:60000001-61000000_ratio', '9:95000001-960..."
4,72.4,6.0,[0.70833333 0.75 0.66666667 0.66666667 0...,"['1:178000001-179000000_ratio', '10:121000001-..."
...,...,...,...,...
95,71.5,6.9,[0.70833333 0.75 0.625 0.66666667 0...,"['1:35000001-36000000_ratio', '9:8000001-90000..."
96,71.5,6.9,[0.70833333 0.75 0.66666667 0.625 0...,"['9:98000001-99000000_ratio', '4:4000001-50000..."
97,71.5,5.9,[0.66666667 0.79166667 0.66666667 0.66666667 0...,"['12:28000001-29000000_ratio', '13:71000001-72..."
98,71.5,6.4,[0.66666667 0.75 0.66666667 0.66666667 0...,"['11:118000001-119000000_ratio', '7:101000001-..."


# Train model

## Load Data

In [150]:
X = pickle.load(open(Path('..') / 'features' / 'score2' / 'X_train.pkl', 'rb')).reset_index(drop=True)
y = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2' / 'y_train.pkl', 'rb')))

X_test = pickle.load(open(Path('..') / 'features' / 'score2' / 'X_test.pkl', 'rb')).reset_index(drop=True)
y_test = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2' / 'y_test.pkl', 'rb')))

In [151]:
y[0].value_counts()

0    66
1    53
Name: 0, dtype: int64

In [152]:
y_test[0].value_counts()

0    28
1    22
Name: 0, dtype: int64

## Find best features

In [159]:
def model_manual_train_test(X_train, y_train, X_test, y_test):
    lr = LogisticRegression(C=0.01, class_weight='balanced', random_state=5)
    ss = StandardScaler().fit(X_train)
    X_train_ss = ss.transform(X_train)
    lr.fit(X_train_ss, y_train)
    y_train_pred = lr.predict(X_train_ss)

    train_acc = accuracy_score(y_train_pred, y_train)
    train_scores = precision_recall_fscore_support(y_train, y_train_pred, labels=[0, 1])

    X_test_ss = ss.transform(X_test)
    y_test_pred = lr.predict(X_test_ss)

    test_acc = accuracy_score(y_test_pred, y_test)
    test_scores = precision_recall_fscore_support(y_test, y_test_pred, labels=[0, 1])
    return train_acc, train_scores, test_acc, test_scores

In [138]:
nb_features = range(10, 100)
res = {}
skf = StratifiedKFold(n_splits=3, random_state=5, shuffle=True)

for n_cv in tqdm.tqdm(range(50, 1000, 50)):

    features_list = [literal_eval(x) for x in data.loc[:n_cv,  'features'].to_list()]
    features_flat_list = [item for sublist in features_list for item in sublist]

    c = Counter(features_flat_list)


    for nb_feat in nb_features:
        features_name = [x[0] for x in c.most_common()[:nb_feat]]
        features_name = [x.replace('zscore', 'ratio') for x in features_name]

        X_light = X.loc[:, features_name]
        scores_cv = []
        for i, (idx_train, idx_test) in enumerate(skf.split(X_light, y)):
            train_acc, train_scores, test_acc, test_scores = model_manual_train_test(X_light.loc[idx_train, :],
                                                                                         y.loc[idx_train, 0].tolist(),
                                                                                         X_light.loc[idx_test, :],
                                                                                         y.loc[idx_test, 0].tolist())
        scores_cv.append(test_acc)

        res[f'{nb_feat}-{n_cv}'] = np.mean(scores_cv)


100%|███████████████████████████████████████████| 19/19 [00:28<00:00,  1.52s/it]


In [139]:
res_df = pd.DataFrame({'nb_feat': list(res.keys()), 'score': list(res.values())})
res_df.sort_values('score', ascending=False)[:50]

Unnamed: 0,nb_feat,score
1423,83-800,0.871795
420,70-250,0.871795
1497,67-850,0.871795
448,98-250,0.871795
447,97-250,0.871795
446,96-250,0.871795
445,95-250,0.871795
444,94-250,0.871795
441,91-250,0.871795
440,90-250,0.871795


### Compute best features

In [143]:
n_cv = 850
nb_feat = 67

features_list = [literal_eval(x) for x in data.loc[:n_cv,  'features'].to_list()]
features_flat_list = [item for sublist in features_list for item in sublist]
c = Counter(features_flat_list)
features_name = [x[0] for x in c.most_common()[:nb_feat]]
features_name = [x.replace('zscore', 'ratio') for x in features_name]

In [171]:
features_name

['21:28000001-29000000_ratio',
 '9:20000001-21000000_ratio',
 '6:61000001-62000000_ratio',
 '16:77000001-78000000_ratio',
 '5:31000001-32000000_ratio',
 '5:37000001-38000000_ratio',
 '22:19000001-20000000_ratio',
 '5:28000001-29000000_ratio',
 '5:26000001-27000000_ratio',
 '2:232000001-233000000_ratio',
 '9:71000001-72000000_ratio',
 '14:104000001-105000000_ratio',
 '21:33000001-34000000_ratio',
 '21:32000001-33000000_ratio',
 '13:68000001-69000000_ratio',
 '21:31000001-32000000_ratio',
 '5:11000001-12000000_ratio',
 '2:149000001-150000000_ratio',
 '22:21000001-22000000_ratio',
 '20:55000001-56000000_ratio',
 '5:32000001-33000000_ratio',
 '8:1-1000000_ratio',
 '3:171000001-172000000_ratio',
 '21:34000001-35000000_ratio',
 '21:24000001-25000000_ratio',
 '5:30000001-31000000_ratio',
 '14:85000001-86000000_ratio',
 '21:27000001-28000000_ratio',
 '17:65000001-66000000_ratio',
 '5:50000001-51000000_ratio',
 '5:52000001-53000000_ratio',
 '3:116000001-117000000_ratio',
 '3:175000001-176000000

### Result on test set

In [160]:
train_acc, train_scores, test_acc, test_scores = model_manual_train_test(X[features_name], y[0], X_test[features_name], y_test[0])

In [162]:
test_acc

0.74

In [163]:
pd.DataFrame(train_scores, columns=[0, 1], index= ['precision', 'rappel', 'f1score', 'support'])

Unnamed: 0,0,1
precision,0.824324,0.888889
rappel,0.924242,0.754717
f1score,0.871429,0.816327
support,66.0,53.0


In [164]:
pd.DataFrame(test_scores, columns=[0, 1], index= ['precision', 'rappel', 'f1score', 'support'])

Unnamed: 0,0,1
precision,0.727273,0.764706
rappel,0.857143,0.590909
f1score,0.786885,0.666667
support,28.0,22.0


### Save dataset

In [165]:
X = pickle.load(open(Path('..') / 'features' / 'score2' / 'X_train.pkl', 'rb'))
y = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2' / 'y_train.pkl', 'rb')))

X_test = pickle.load(open(Path('..') / 'features' / 'score2' / 'X_test.pkl', 'rb'))
y_test = pd.DataFrame(pickle.load(open(Path('..') / 'features' / 'score2' / 'y_test.pkl', 'rb')))

In [167]:
X_old = pickle.load(open('X_old.pkl', 'rb'))

In [169]:
X_old

Unnamed: 0,patient_id,train_test,21:28000001-29000000_ratio,9:20000001-21000000_ratio,6:61000001-62000000_ratio,16:77000001-78000000_ratio,5:31000001-32000000_ratio,21:31000001-32000000_ratio,5:37000001-38000000_ratio,2:232000001-233000000_ratio,...,17:36000001-37000000_ratio,12:17000001-18000000_ratio,21:19000001-20000000_ratio,2:129000001-130000000_ratio,4:53000001-54000000_ratio,13:97000001-98000000_ratio,5:12000001-13000000_ratio,3:171000001-172000000_ratio,18:50000001-51000000_ratio,14:60000001-61000000_ratio
0,OS2006_273,test,0.062834,0.060903,-0.075174,-0.009873,0.005921,0.011209,0.060570,-0.034057,...,0.106297,0.052639,-0.022752,-0.010561,-0.013309,-0.035282,-0.011241,-0.001038,-0.004380,-0.053608
1,OS2006_37,train,-0.000728,-0.112753,-0.142701,-0.126674,-0.062127,-0.003244,-0.063724,-0.086081,...,-0.144730,0.042280,0.018058,-0.078092,0.013936,0.102973,-0.023663,0.009103,-0.017173,0.224828
2,OS2006_320,train,0.356692,-0.045632,0.031774,-0.058918,-0.113911,0.344343,0.360446,-0.037755,...,-0.151119,-0.021575,0.367760,-0.041620,-0.039667,-0.029173,0.486258,-0.057283,0.144363,0.046741
3,OS2006_504,train,-0.081113,0.019737,-0.025034,-0.010209,-0.006571,0.001794,0.007375,-0.146886,...,-0.083220,-0.131057,-0.035838,0.023544,0.225689,-0.041951,0.031141,0.038984,0.070752,0.048628
4,OS2006_454,test,0.026692,0.013272,-0.049709,0.041606,-0.011879,-0.016427,-0.018377,-0.011646,...,-0.030301,0.044123,0.059432,-0.018786,0.034758,0.005759,0.014072,0.001059,0.032991,0.021047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,OS2006_74,train,-0.024652,-0.021245,-0.087889,-0.072722,-0.017987,0.009356,0.007126,0.001013,...,-0.062483,-0.021865,0.021613,0.014264,-0.025489,-0.032146,-0.016796,-0.026740,0.072187,0.051119
165,OS2006_291,train,-0.063568,0.002128,0.072458,-0.058269,0.023463,-0.056003,0.045950,-0.015302,...,0.056713,-0.006161,0.074799,-0.067544,-0.030608,-0.020034,-0.028108,-0.098673,0.027852,0.089052
166,OS2006_425,test,-0.069869,0.008500,0.081977,-0.112373,-0.070604,-0.002005,-0.061020,0.026645,...,0.032717,0.095965,0.018591,0.032270,-0.035155,-0.181807,-0.088126,0.045910,-0.028561,-0.003869
167,OS2006_196,train,0.003242,-0.165564,-0.001599,-0.044437,0.013476,-0.017908,0.033219,-0.218379,...,0.063494,0.027925,0.025055,-0.011760,-0.015748,-0.281740,0.079859,0.255839,-0.057425,0.030006


In [170]:
X_test

Unnamed: 0,1:1000001-2000000_ratio,1:3000001-4000000_ratio,1:4000001-5000000_ratio,1:5000001-6000000_ratio,1:6000001-7000000_ratio,1:7000001-8000000_ratio,1:8000001-9000000_ratio,1:9000001-10000000_ratio,1:10000001-11000000_ratio,1:11000001-12000000_ratio,...,22:44000001-45000000_ratio,22:45000001-46000000_ratio,22:46000001-47000000_ratio,22:47000001-48000000_ratio,22:48000001-49000000_ratio,22:49000001-50000000_ratio,22:50000001-51000000_ratio,18:45000001-46000000_ratio,2:225000001-226000000_ratio,8:76000001-77000000_ratio
111,-0.063351,-0.009193,0.008477,0.041692,-0.053666,-0.018023,-0.0465,0.012257,0.030538,0.010412,...,0.02582,-0.019515,8.6e-05,-0.009316,0.014158,0.014934,-0.010914,-0.022585,0.0,0.0
163,0.131603,0.130085,0.007131,0.002434,0.02144,0.050174,-0.017633,0.02002,0.015982,-0.000774,...,0.066519,0.045119,0.068204,0.056102,0.065687,0.10683,0.110137,-0.040208,-0.019207,0.025165
25,-0.027564,0.064778,0.060186,0.075565,-0.004068,0.061323,-0.067167,0.011481,0.001195,0.018262,...,0.090773,0.002657,0.060288,0.077074,0.080583,0.070383,-0.03174,-0.046511,0.0,0.0
59,0.112409,0.060521,0.007909,0.007661,0.005149,0.002325,-0.004929,-0.019146,0.035117,-0.00317,...,0.035458,0.052725,0.028707,0.012895,0.053724,0.031396,0.059362,-0.011888,0.008041,0.0
115,0.174252,0.094431,0.073468,0.051015,0.026419,0.002695,-0.086992,0.007385,-0.021503,0.032595,...,0.088026,-0.035081,0.062266,0.051957,0.09433,0.077596,0.043069,-0.266428,0.011807,0.21341
141,0.26772,0.140463,0.099061,0.047311,0.099064,0.098481,0.005051,-0.024307,0.037992,-0.000344,...,0.089094,0.045175,0.037254,0.076158,0.081601,0.040639,-0.009698,-0.059191,0.0,0.0
159,0.207028,0.217296,0.176932,0.195505,0.194636,0.181525,0.18858,0.229822,0.176277,0.162045,...,0.060241,0.076543,0.101818,0.075813,0.081378,0.076952,0.102235,0.063992,0.0,0.0
41,0.048723,0.068137,0.039216,0.034657,0.01492,0.035969,-0.028629,-0.002038,0.008732,0.030265,...,0.021774,0.020004,0.01682,0.026001,0.033595,0.005639,0.040011,0.001787,0.0,0.0
20,0.093758,0.034183,0.01093,0.012587,0.035509,0.005814,-0.012653,-0.020909,0.025779,-0.023373,...,0.032469,0.03219,0.071523,0.091848,0.083739,0.1016,0.07338,-0.01166,-0.012532,0.009105
54,-0.11585,-0.117428,-0.035972,-0.030489,-0.08478,-0.085309,-0.079568,-0.099771,-0.101971,-0.063099,...,0.019853,-0.023302,-0.006291,0.062507,0.050017,0.024279,-0.017162,0.009471,0.0,0.0
