In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, precision_score, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv("evgeni_set.csv")
df['age'] = df['age']/110 #нормализация

#Меня не устраивает что  в train_test_split может в тестовую выборку попасть 0 инфарктов или все 100
#поэтому разделю вручную и соеденю после деления обратно
df_heartattack_x = df[df['minf']==1].iloc[:, 1:]
df_heartattack_y = df[df['minf']==1].iloc[:, 0]
df_no_heartattack_x = df[df['minf']==0].iloc[:, 1:]
df_no_heartattack_y = df[df['minf']==0].iloc[:, 0]

df_heartattack_x_train, df_heartattack_x_test, df_heartattack_y_train, df_heartattack_y_test = train_test_split(df_heartattack_x, df_heartattack_y, test_size=0.2)
df_no_heartattack_x_train, df_no_heartattack_x_test, df_no_heartattack_y_train, df_no_heartattack_y_test = train_test_split(df_no_heartattack_x, df_no_heartattack_y, test_size=0.2)

x_train = pd.concat((df_heartattack_x_train, df_no_heartattack_x_train), axis=0)
y_train = pd.concat((df_heartattack_y_train, df_no_heartattack_y_train), axis=0)
x_test = pd.concat((df_heartattack_x_test, df_no_heartattack_x_test), axis=0)
y_test = pd.concat((df_heartattack_y_test, df_no_heartattack_y_test), axis=0)

In [3]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=2, algorithm='ball_tree')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)
print("recall: "+str(recall_score(y_test, y_pred)))
print("precision: "+str(precision_score(y_test, y_pred)))
print("accuracy_score: "+str(accuracy_score(y_test, y_pred)))
print("Доля обектов отнесеных к классу 1 от тестовой выборки: "+str(round((sum(matrix[:, 1])/x_test.shape[0])*100))+'%')

[[759  30]
 [  0  21]]
recall: 1.0
precision: 0.4117647058823529
accuracy_score: 0.9629629629629629
Доля обектов отнесеных к классу 1 от тестовой выборки: 6%


In [4]:
y_train_pred = model.predict(x_train)
confusion_matrix(y_train, y_train_pred)
x_train_for_pipeline = x_train[y_train_pred==1]
y_train_for_pipeline = y_train[y_train_pred==1]
x_test_for_pipeline = x_test[y_pred==1]
y_test_for_pipeline = y_test[y_pred==1]

In [5]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=10, max_features=5,n_estimators=150)
model.fit(x_train_for_pipeline, y_train_for_pipeline)
y_pred_from_pipeline = model.predict(x_test_for_pipeline)
print(confusion_matrix(y_test_for_pipeline, y_pred_from_pipeline))
print('prec: ', precision_score(y_test_for_pipeline, y_pred_from_pipeline))
print('rec: ', recall_score(y_test_for_pipeline, y_pred_from_pipeline))
print('f1: ', f1_score(y_test_for_pipeline, y_pred_from_pipeline))

[[27  3]
 [ 9 12]]
prec:  0.8
rec:  0.5714285714285714
f1:  0.6666666666666666


In [13]:
iter_n_estimators = list(range(100, 900, 50))
iter_max_features = list(range(5, 9))
iter_max_depth = list(range(5, 14))

#все возможные комбинации параметров
a = [iter_n_estimators, iter_max_features, iter_max_depth]
iters = [list(x) for x in np.array(np.meshgrid(*a)).T.reshape(-1,len(a))]#все возможные комбинации параметров

In [20]:
%%time
results = {'iter_n_estimators':[], 'iter_max_features':[], 'iter_max_depth':[],
            'train_recall_score':[], 'train_f1_score':[], 'train_precision_score':[],
            'test_recall_score':[], 'test_f1_score':[], 'test_precision_score':[]}
for i in iters:
    results['iter_n_estimators'].append(i[0])
    results['iter_max_features'].append(i[1])
    results['iter_max_depth'].append(i[2])
    model = RandomForestClassifier(max_depth=i[2], max_features=i[1], n_estimators=i[0])
    model.fit(x_train_for_pipeline, y_train_for_pipeline)
    train_y_pred_from_pipeline = model.predict(x_train_for_pipeline)
    y_pred_from_pipeline = model.predict(x_test_for_pipeline)
    results['train_recall_score'].append(recall_score(y_train_for_pipeline, train_y_pred_from_pipeline))
    results['train_f1_score'].append(f1_score(y_train_for_pipeline, train_y_pred_from_pipeline))
    results['train_precision_score'].append(precision_score(y_train_for_pipeline, train_y_pred_from_pipeline))
    results['test_recall_score'].append(recall_score(y_test_for_pipeline, y_pred_from_pipeline))
    results['test_f1_score'].append(f1_score(y_test_for_pipeline, y_pred_from_pipeline))
    results['test_precision_score'].append(precision_score(y_test_for_pipeline, y_pred_from_pipeline))

CPU times: total: 8min 52s
Wall time: 8min 53s


In [21]:
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,iter_n_estimators,iter_max_features,iter_max_depth,train_recall_score,train_f1_score,train_precision_score,test_recall_score,test_f1_score,test_precision_score
0,100,5,5,0.64557,0.750000,0.894737,0.631579,0.750000,0.923077
1,100,6,5,0.64557,0.750000,0.894737,0.631579,0.750000,0.923077
2,100,7,5,0.64557,0.750000,0.894737,0.631579,0.750000,0.923077
3,100,8,5,0.56962,0.725806,1.000000,0.578947,0.733333,1.000000
4,150,5,5,0.64557,0.750000,0.894737,0.631579,0.750000,0.923077
...,...,...,...,...,...,...,...,...,...
571,800,8,13,0.56962,0.725806,1.000000,0.578947,0.733333,1.000000
572,850,5,13,0.64557,0.750000,0.894737,0.631579,0.750000,0.923077
573,850,6,13,0.56962,0.725806,1.000000,0.578947,0.733333,1.000000
574,850,7,13,0.64557,0.750000,0.894737,0.631579,0.750000,0.923077


In [8]:
model = RandomForestClassifier(max_depth=5, max_features=5,n_estimators=150)
model.fit(x_train_for_pipeline, y_train_for_pipeline)
prediction = model.predict_proba(x_test_for_pipeline)
prediction_train = model.predict_proba(x_train_for_pipeline)
treshlods = {'threshold':[], 'test_recall_score':[], 'test_f1_score':[], 'test_precision_score':[]
           , 'train_recall_score':[], 'train_f1_score':[], 'train_precision_score':[]}
iters = prediction[:, 1].copy()
iters.sort()

for i in iters:
    predict_mine = []
    for i2 in prediction:
        if i2[1]>i:
            predict_mine.append(1)
        else:
            predict_mine.append(0)

    train_predict_mine = []
    for i2 in prediction_train:
        if i2[1]>i:
            train_predict_mine.append(1)
        else:
            train_predict_mine.append(0)
            
    treshlods['threshold'].append(i)
    treshlods['test_recall_score'].append(recall_score(y_test_for_pipeline, predict_mine))
    treshlods['test_f1_score'].append(f1_score(y_test_for_pipeline, predict_mine))
    treshlods['test_precision_score'].append(precision_score(y_test_for_pipeline, predict_mine))
    treshlods['train_recall_score'].append(recall_score(y_train_for_pipeline, train_predict_mine))
    treshlods['train_f1_score'].append(f1_score(y_train_for_pipeline, train_predict_mine))
    treshlods['train_precision_score'].append(precision_score(y_train_for_pipeline, train_predict_mine))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
df_treshlods = pd.DataFrame(treshlods)
df_treshlods

Unnamed: 0,threshold,test_recall_score,test_f1_score,test_precision_score,train_recall_score,train_f1_score,train_precision_score
0,0.158782,0.857143,0.5625,0.418605,0.952381,0.588235,0.425532
1,0.158782,0.857143,0.5625,0.418605,0.952381,0.588235,0.425532
2,0.158782,0.857143,0.5625,0.418605,0.952381,0.588235,0.425532
3,0.158782,0.857143,0.5625,0.418605,0.952381,0.588235,0.425532
4,0.158782,0.857143,0.5625,0.418605,0.952381,0.588235,0.425532
5,0.158782,0.857143,0.5625,0.418605,0.952381,0.588235,0.425532
6,0.158782,0.857143,0.5625,0.418605,0.952381,0.588235,0.425532
7,0.158782,0.857143,0.5625,0.418605,0.952381,0.588235,0.425532
8,0.158927,0.809524,0.566667,0.435897,0.880952,0.632479,0.493333
9,0.158927,0.809524,0.566667,0.435897,0.880952,0.632479,0.493333


In [10]:
df.to_csv('pipeline-KNC-RFC_results.csv')