In [1]:
from pathlib import Path
from ast import literal_eval
from collections import Counter
import pickle

import pandas as pd
import numpy as np

# Load data

In [15]:
# Read the pipe-delimited brute-force results log. The file's first row is
# consumed as a header (header=0) and replaced by the explicit column names.
file_name = 'adn_circulant_brute_force_11.log'
results_path = Path('..', 'results', file_name)
column_names = ['mean', 'std', 'scores_cv', 'features']
data = pd.read_csv(results_path, sep='|', header=0, names=column_names)

In [16]:
# Rank rows from highest to lowest 'mean' and renumber them 0..N-1.
# - Rebinding `data` (instead of inplace=True) keeps the step a single
#   chained expression.
# - kind='mergesort' is a stable sort: rows with equal 'mean' keep their
#   relative order, so re-running this cell on already-sorted data cannot
#   reshuffle ties (the default quicksort gives no such guarantee).
data = (
    data
    .sort_values('mean', ascending=False, kind='mergesort')
    .reset_index(drop=True)
)

In [17]:
# Preview the first 100 rows of `data` (the best-ranked rows once the frame
# has been sorted by 'mean' in descending order).
data.head(100)

Unnamed: 0,mean,std,scores_cv,features
0,72.1,3.9,[0.76666667 0.66666667 0.72413793 0.75862069 0...,"['2:186000001-187000000_ratio', '11:131000001-..."
1,72.1,1.6,[0.73333333 0.73333333 0.68965517 0.72413793 0...,"['13:19000001-20000000_ratio', '2:10000001-110..."
2,72.1,4.2,[0.73333333 0.76666667 0.68965517 0.75862069 0...,"['16:11000001-12000000_ratio', '2:174000001-17..."
3,72.1,5.4,[0.66666667 0.76666667 0.65517241 0.79310345 0...,"['9:34000001-35000000_ratio', '16:63000001-640..."
4,72.1,2.4,[0.73333333 0.7 0.68965517 0.75862069 0...,"['12:62000001-63000000_ratio', '18:29000001-30..."
...,...,...,...,...
95,70.7,3.1,[0.73333333 0.73333333 0.68965517 0.72413793 0...,"['3:22000001-23000000_ratio', '15:82000001-830..."
96,70.7,4.7,[0.76666667 0.66666667 0.68965517 0.75862069 0...,"['10:62000001-63000000_ratio', '8:9000001-1000..."
97,70.7,2.8,[0.7 0.73333333 0.72413793 0.72413793 0...,"['14:88000001-89000000_ratio', '5:41000001-420..."
98,70.7,5.2,[0.7 0.73333333 0.65517241 0.79310345 0...,"['12:102000001-103000000_ratio', '15:88000001-..."


# Find common features

In [18]:
# Number of top-ranked rows (by 'mean') whose feature lists are pooled.
n_cv = 5000

# Positional slicing (.iloc) is end-exclusive, so this takes exactly n_cv rows.
# The previous label-based `data.loc[:n_cv, 'features']` is end-inclusive and
# selected n_cv + 1 rows on a 0..N-1 index.
top_features = data['features'].iloc[:n_cv]

# Each 'features' cell is the string repr of a Python list; literal_eval
# parses it back into a list without executing arbitrary code.
features_list = [literal_eval(x) for x in top_features]

# Flatten the per-row lists into one list of feature names (with repeats).
features_flat_list = [item for sublist in features_list for item in sublist]


In [19]:
# Tally how many times each feature name occurs in the flattened list.
c = Counter()
c.update(features_flat_list)

In [20]:
# Show the 40 most frequent features as (name, count) pairs, most frequent first.
c.most_common(40)

[('5:37000001-38000000_ratio', 1217),
 ('8:2000001-3000000_ratio', 980),
 ('2:235000001-236000000_ratio', 948),
 ('9:20000001-21000000_ratio', 868),
 ('6:80000001-81000000_ratio', 864),
 ('12:62000001-63000000_ratio', 820),
 ('5:29000001-30000000_ratio', 818),
 ('2:237000001-238000000_ratio', 768),
 ('2:240000001-241000000_ratio', 733),
 ('5:40000001-41000000_ratio', 725),
 ('8:1-1000000_ratio', 723),
 ('12:34000001-35000000_ratio', 722),
 ('5:23000001-24000000_ratio', 722),
 ('2:228000001-229000000_ratio', 721),
 ('6:1-1000000_ratio', 718),
 ('5:36000001-37000000_ratio', 717),
 ('6:53000001-54000000_ratio', 714),
 ('8:143000001-144000000_ratio', 699),
 ('1:183000001-184000000_ratio', 687),
 ('8:11000001-12000000_ratio', 675),
 ('5:32000001-33000000_ratio', 673),
 ('14:60000001-61000000_ratio', 671),
 ('14:45000001-46000000_ratio', 667),
 ('2:236000001-237000000_ratio', 661),
 ('2:232000001-233000000_ratio', 660),
 ('18:30000001-31000000_ratio', 659),
 ('4:157000001-158000000_ratio', 6

In [21]:
# Keep only the names of the 200 most frequent features, most frequent first.
features_name = [name for name, _count in c.most_common(200)]

# Train model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [23]:
# Load the pickled feature matrix X and target y from ../features.
# `with` guarantees each file handle is closed, even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
features_dir = Path('..') / 'features'

with open(features_dir / 'X.pkl', 'rb') as x_file:
    X = pickle.load(x_file)

with open(features_dir / 'y.pkl', 'rb') as y_file:
    y = pickle.load(y_file)

In [24]:
# Restrict X to the selected feature columns, in the order given by features_name.
# assumes X is a pandas DataFrame whose columns include every selected name — TODO confirm
X_light = X[features_name]

In [25]:
# Class-weighted logistic regression evaluated with 5-fold cross-validation
# on the reduced feature matrix.
# NOTE(review): features_name was chosen from the CV-score rankings in the
# results log; if that search was run on this same X/y, the scores below are
# optimistically biased by selection leakage — TODO confirm.
lr = LogisticRegression(C=2, class_weight='balanced')
cv_score = cross_val_score(estimator=lr, X=X_light, y=y, cv=5)

In [26]:
# Display the five per-fold scores from the logistic-regression cross-validation.
cv_score

array([0.83333333, 0.8       , 0.86206897, 0.86206897, 0.82758621])

In [27]:
# Average of the per-fold scores currently held in cv_score.
cv_score.mean()

0.8370114942528735

In [196]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [199]:
# Random-forest baseline on the same reduced feature matrix, 5-fold CV.
# random_state pins the forest's internal randomness so the fold scores are
# reproducible across kernel restarts (the forest was previously unseeded).
rf = RandomForestClassifier(n_estimators=1000, random_state=0)

# Legacy alias: this estimator was originally bound to the misleading name
# `svm`; kept so any other cell that still refers to `svm` keeps working.
svm = rf

# NOTE: this overwrites the cv_score produced by the logistic-regression run.
cv_score = cross_val_score(rf, X_light, y, cv=5)

In [200]:
# Display the five per-fold scores from the random-forest cross-validation.
cv_score

array([0.76666667, 0.7       , 0.82758621, 0.86206897, 0.72413793])

In [194]:
# Average of the per-fold scores currently held in cv_score.
# NOTE(review): the saved output of this cell (0.7823) is not the mean of the
# fold scores displayed just above (~0.7761), and this cell's execution count
# (194) precedes the CV cell's (199) — the stored output looks stale; re-run
# the notebook top to bottom before quoting this number.
cv_score.mean()

0.7822988505747126