# 1. Загрузка данных

## 1.1. Распакуем архив

In [None]:
#!unzip data_input4.zip

## 1.2. Произведем предобработку данных

In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

In [None]:
# Map every class sub-folder of the dataset root to the list of files it holds.
root_folder = "data_input4"
all_files = {}
# next(os.walk(...))[1] is the list of immediate sub-directories.  Folders and
# files are sorted because the OS returns them in arbitrary order, and the
# resulting sample order feeds the seeded train/test split further down.
for fld in sorted(next(os.walk(root_folder))[1]):
    path = os.path.join(root_folder, fld)
    all_files[fld] = sorted(
        f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))
    )

In [None]:
# Read every spectrum file and build two parallel lists: the intensity column
# of each spectrum and its one-hot label ([1, 0] = healthy, [0, 1] = sick).
measurements = []
classifications = []

count_healhy = 0
count_sick = 0

for sample in list(all_files):
    # Decide the label once per folder.  A folder that matches neither keyword
    # is skipped entirely so that `measurements` and `classifications` can
    # never get out of step (previously the spectrum was appended without a
    # label, and a folder matching both keywords got two labels).
    if 'healthy' in str(sample):
        is_healthy = True
    elif 'sick' in str(sample):
        is_healthy = False
    else:
        continue

    for txt in all_files[sample]:
        if "DS_Store" in str(txt):
            continue  # macOS Finder metadata, not a spectrum
        spectrum = pd.read_csv(os.path.join(root_folder, sample, txt), sep='\t', skiprows=[0],
                               header=None, names=['Wave', 'Intensity'])
        intensity = spectrum[['Intensity']].to_numpy()
        # Spectra with exactly 994 points are dropped.
        # NOTE(review): any other length is kept — confirm that all remaining
        # files share one length, otherwise stacking them later fails.
        if len(intensity) == 994:
            continue
        measurements.append(intensity)
        if is_healthy:
            classifications.append([1, 0])
            count_healhy += 1
        else:
            classifications.append([0, 1])
            count_sick += 1

print(f'Здоровые ткани: {count_healhy}')
print(f'Больные ткани: {count_sick}')
print(f'Все ткани: {count_healhy + count_sick}')

Загрузим дополнительные данные

In [None]:
# Append the spectra stored back-to-back in one extra file, all labelled sick.
# NOTE: re-running this cell appends the same spectra again; re-run the
# loading cell above first.
add_data_path = "image_tumor_patient20022019_633nm_obj20_power100_1sec_сenter2900.txt"

add_data = pd.read_csv(add_data_path, sep='\t', skiprows=[0], header=None)

n_extra_spectra = 468    # number of spectra taken from the file
spectrum_length = 1015   # points per spectrum (same as the per-file spectra)

extra_intensity = add_data[3].to_numpy()   # column 3 is used as the intensity
if len(extra_intensity) < n_extra_spectra * spectrum_length:
    raise ValueError(
        f'{add_data_path}: expected at least {n_extra_spectra * spectrum_length} rows, '
        f'got {len(extra_intensity)}'
    )

for i in range(n_extra_spectra):
    # Consecutive, non-overlapping chunks.  The previous `.loc[i : i + 1014]`
    # advanced by a single row per iteration and so produced 468 overlapping,
    # almost identical windows instead of 468 distinct spectra.
    start = i * spectrum_length
    measurements.append(extra_intensity[start:start + spectrum_length].reshape(-1, 1))
    classifications.append([0, 1])
    count_sick += 1

print(f'Здоровые ткани: {count_healhy}')
print(f'Больные ткани: {count_sick}')
print(f'Все ткани: {count_healhy + count_sick}')

# 2. Анализ данных

Подготовим данные

In [None]:
# Turn the per-spectrum arrays and their one-hot labels into NumPy arrays.
X = np.array(measurements)
y = np.array(classifications)

In [None]:
# Inspect the array shape before it is flattened in the next cell.
X.shape

In [None]:
# Flatten each spectrum to one row: (n_samples, n_points, 1) -> (n_samples, n_points).
# The sizes are taken from the data instead of being hard-coded, so the cell
# keeps working when the number of loaded spectra changes.
X = X.reshape(X.shape[0], -1)
X.shape

Сделаем так, чтобы столбцами стали наши длины волн (Raman Shifts)

In [None]:
# Load one healthy-tissue spectrum as a two-column table (shift, intensity).
raman_shifts = pd.read_csv(
    'data_input4/01_healthy/cortex_patient180219_633nm_obj50_power100_1sec_acc40_сenter2900_place7_20200813_1.txt',
    sep="\t",
    header=None,
).drop(columns=[2])
raman_shifts.columns = ["Raman Shifts", "Интенсивность"]
# The file is read without a header, so its first row is removed here.
raman_shifts = raman_shifts.drop(index=raman_shifts.index[:1])
# Shifts are parsed as numbers and cast to integers.
raman_shifts["Raman Shifts"] = pd.to_numeric(
    raman_shifts["Raman Shifts"], errors='coerce'
).astype(np.int64)

In [None]:
# Plot the spectrum loaded in the previous cell (file from the 01_healthy folder).
raman_shifts.plot(x = 'Raman Shifts', y = 'Интенсивность', figsize=(15, 5))

In [None]:
# Load one sick-tissue spectrum as a two-column table (shift, intensity).
raman_shifts = pd.read_csv(
    'data_input4/02_sick/image_tumor_patient20022019_633nm_obj20_power100_1sec_сenter2900_step4um_place1_20200929_1_18.txt',
    sep="\t",
    header=None,
).drop(columns=[2])
raman_shifts.columns = ["Raman Shifts", "Интенсивность"]
# The file is read without a header, so its first row is removed here.
raman_shifts = raman_shifts.drop(index=raman_shifts.index[:1])
# Shifts are parsed as numbers and cast to integers.
raman_shifts["Raman Shifts"] = pd.to_numeric(
    raman_shifts["Raman Shifts"], errors='coerce'
).astype(np.int64)

In [None]:
# Plot the spectrum loaded in the previous cell (file from the 02_sick folder).
raman_shifts.plot(x = 'Raman Shifts', y = 'Интенсивность', figsize=(15, 5), color = 'red')

In [None]:
# Count occurrences of each integer-cast shift value (a duplicate check).
raman_shifts["Raman Shifts"].value_counts()

In [None]:
# Shift axis as a NumPy array; used below as the column labels of the feature table.
RS = np.array(raman_shifts["Raman Shifts"])
RS

Все в единичном экземпляре, отлично

In [None]:
# Feature table: one row per spectrum, one column per Raman shift, with the
# columns put in reversed RS order.
df = pd.DataFrame(X, columns=RS)[RS[::-1]]
# Second one-hot component becomes the binary target: 1 = sick, 0 = healthy.
target = pd.DataFrame(y)
df['target'] = target[1]
df

In [None]:
# Per-class feature tables without the label column (0 = healthy, 1 = sick).
health = df.loc[df['target'] == 0].drop(columns=['target'])

sick = df.loc[df['target'] == 1].drop(columns=['target'])

In [None]:
def _spectrum_summary(spectra):
    """Per-shift mean (column 0), median and 10th/90th percentiles of `spectra`."""
    summary = pd.DataFrame(spectra.mean())
    summary['median'] = spectra.median()
    for name, q in (('percentile_10', .10), ('percentile_90', .90)):
        summary[name] = spectra.quantile(q)
    return summary


H = _spectrum_summary(health)

S = _spectrum_summary(sick)

In [None]:
# Mean spectrum of each class on one set of axes.
fig, ax = plt.subplots(figsize=(15, 7))

ax.set_ylabel("Интенсивность", fontsize=20)
ax.set_xlabel("Raman Shifts", fontsize=20)
ax.set_title('Сравнение усредненных графиков Рамановской спектроскопии для здоровых и больных тканей')

ax.plot(H.index, H[0], label='Здоровые')
ax.plot(S.index, S[0], label='Больные', color = 'red')

ax.legend(prop={'size': 20})

In [None]:
# Median spectrum of each class on one set of axes.
fig, ax = plt.subplots(figsize=(15, 7))

ax.set_ylabel("Интенсивность", fontsize=20)
ax.set_xlabel("Raman Shifts", fontsize=20)
ax.set_title('Сравнение медиан графиков Рамановской спектроскопии для здоровых и больных тканей')

ax.plot(H.index, H['median'], label='Здоровые')
ax.plot(S.index, S['median'], label='Больные', color = 'red')

ax.legend(prop={'size': 20})

In [None]:
# Healthy class: 90th percentile, mean, median and 10th percentile per shift.
fig, ax = plt.subplots(figsize=(15, 7))

ax.set_ylabel("Интенсивность", fontsize=20)
ax.set_xlabel("Raman Shifts", fontsize=20)
ax.set_title('Здоровые ткани')

for column, label, color in (
    ('percentile_90', '90% персцентиль', 'navy'),
    (0, 'Среднее', 'blue'),
    ('median', 'Медиана', 'royalblue'),
    ('percentile_10', '10% персцентиль', 'cornflowerblue'),
):
    ax.plot(H.index, H[column], label=label, color=color)

ax.legend(prop={'size': 25})

In [None]:
# Sick class: 90th percentile, mean, median and 10th percentile per shift.
fig, ax = plt.subplots(figsize=(15, 7))

ax.set_ylabel("Интенсивность", fontsize=20)
ax.set_xlabel("Raman Shifts", fontsize=20)
ax.set_title('Больные ткани')

for column, label, color in (
    ('percentile_90', '90% персцентиль', 'brown'),
    (0, 'Среднее', 'red'),
    ('median', 'Медиана', 'orangered'),
    ('percentile_10', '10% персцентиль', 'lightcoral'),
):
    ax.plot(S.index, S[column], label=label, color=color)

ax.legend(prop={'size': 25})

In [None]:
# Healthy vs. sick on one plot: healthy 90th percentile and mean against the
# sick 10th-90th percentile band.
plt.figure(figsize=(15, 7))

plt.ylabel("Интенсивность", fontsize=20)
plt.xlabel("Raman Shifts", fontsize=20)

# The title used to read 'Здоровые ткани' (copied from the healthy-only plot)
# although curves of both classes are drawn.
plt.title('Сравнение здоровых и больных тканей')

plt.plot(H.index, H['percentile_90'], label='90% персцентиль здоровых', color = 'blue')
# Labelled explicitly as the healthy mean, since both classes share the axes.
plt.plot(H.index, H[0], label='Среднее здоровых')

plt.plot(S.index, S['percentile_90'], label='90% персцентиль больных', color = 'red')
plt.plot(S.index, S['percentile_10'], label='10% персцентиль больных', color = 'lightcoral')

plt.legend(prop={'size': 17})

In [None]:
# Class balance of the full table (0 = healthy, 1 = sick).
df['target'].value_counts()

# 3. Classic ML

In [None]:
# NOTE: from here on X and y are pandas objects (feature table / label column);
# they replace the NumPy arrays of the same names built during data preparation.
X = df.drop(['target'], axis=1)
y = df['target']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the spectra for testing; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report the shapes of the four split parts.
train_measurements_shape, train_labels_shape = X_train.shape, y_train.shape
test_measurements_shape, test_labels_shape = X_test.shape, y_test.shape

print(f'Total amount of train measurements: {train_measurements_shape}')
print(f'Total amount of train labels: {train_labels_shape}')
print(f'Total amount of test measurements: {test_measurements_shape}')
print(f'Total amount of test labels: {test_labels_shape}')

## 3.1 Logistic regression

In [None]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [None]:
# Logistic regression fitted directly on the training intensities (no scaling
# step in this cell).  max_iter is set very high, presumably so that lbfgs can
# converge on them — TODO confirm.
LogReg = LogisticRegression(solver='lbfgs', max_iter=100000)
LogReg.fit(X_train, y_train)

In [None]:
# Evaluate logistic regression on the held-out spectra.
y_pred_lr = LogReg.predict(X_test)

lr_f1 = f1_score(y_test, y_pred_lr)
lr_accuracy = accuracy_score(y_test, y_pred_lr)

print(f'Logistic Regression F1 Score {lr_f1}')
print(f'Logistic Regression Accuracy {lr_accuracy}')

In [None]:
# Class balance of the test part (0 = healthy, 1 = sick).
y_test.value_counts()

In [None]:
from sklearn.metrics import confusion_matrix 

# Confusion matrix as a table: row = true class, column = predicted class.
cm_lr = pd.DataFrame(confusion_matrix(y_test, y_pred_lr))
cm_lr

In [None]:
# Per-class recall read off the confusion matrix (rows = true class,
# columns = predicted class; 0 = healthy, 1 = sick).
# NOTE: `sensitivity` holds the healthy-class recall and `specificity` the
# sick-class recall, matching the printed labels below.
sensitivity = cm_lr.loc[0, 0] / cm_lr.loc[0].sum()
specificity = cm_lr.loc[1, 1] / cm_lr.loc[1].sum()

print('Доля верно обнаруженных здоровых : ', sensitivity )
print('Доля верно обнаруженных больных : ', specificity)

In [None]:
# Order matches the rows of the summary table built later: accuracy, F1,
# sick-class recall (`specificity`), healthy-class recall (`sensitivity`).
LR_metric = [accuracy_score(y_test, y_pred_lr), f1_score(y_test, y_pred_lr), specificity, sensitivity]
LR_metric

## 3.2 Support vector machine (SVM)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, fitted on the training part.
SVM_rbf = SVC(kernel='rbf')
SVM_rbf.fit(X_train, y_train)

In [None]:
# Evaluate the RBF-kernel SVM on the held-out spectra.
y_pred_svm_rbf = SVM_rbf.predict(X_test)

svm_rbf_f1 = f1_score(y_test, y_pred_svm_rbf)
svm_rbf_accuracy = accuracy_score(y_test, y_pred_svm_rbf)

print(f'Rbf SVM F1 Score {svm_rbf_f1}')
print(f'Rbf SVM Accuracy {svm_rbf_accuracy}')

Попробуем линейный SVM

In [None]:
# Support vector classifier with a linear kernel, fitted on the training part.
SVM_lnr = SVC(kernel='linear')
SVM_lnr.fit(X_train, y_train)

In [None]:
# Evaluate the linear-kernel SVM on the held-out spectra.
y_pred_svm_lnr = SVM_lnr.predict(X_test)

svm_lnr_f1 = f1_score(y_test, y_pred_svm_lnr)
svm_lnr_accuracy = accuracy_score(y_test, y_pred_svm_lnr)

print(f'Linear SVM F1 Score {svm_lnr_f1}')
print(f'Linear SVM Accuracy {svm_lnr_accuracy}')

In [None]:
# Confusion matrix of the linear SVM: row = true class, column = predicted class.
cm_svm_lnr = pd.DataFrame(confusion_matrix(y_test, y_pred_svm_lnr))
cm_svm_lnr

In [None]:
# Per-class recall read off the confusion matrix (rows = true class,
# columns = predicted class; 0 = healthy, 1 = sick).
# NOTE: `sensitivity` holds the healthy-class recall and `specificity` the
# sick-class recall, matching the printed labels below.
sensitivity = cm_svm_lnr.loc[0, 0] / cm_svm_lnr.loc[0].sum()
specificity = cm_svm_lnr.loc[1, 1] / cm_svm_lnr.loc[1].sum()

print('Доля верно обнаруженных здоровых : ', sensitivity )
print('Доля верно обнаруженных больных : ', specificity)

In [None]:
# Linear-SVM metrics in summary-table order: accuracy, F1, sick-class recall
# (`specificity`), healthy-class recall (`sensitivity`).
SVM_metric = [accuracy_score(y_test, y_pred_svm_lnr), f1_score(y_test, y_pred_svm_lnr), specificity, sensitivity]
SVM_metric

## 3.3 Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings (no random_state is passed, so results
# vary between runs).  NOTE: the name RF is re-bound and re-fitted by later cells.
RF = RandomForestClassifier()
RF.fit(X_train, y_train)

In [None]:
# Evaluate the random forest on the held-out spectra.
y_pred_rf = RF.predict(X_test)

rf_f1 = f1_score(y_test, y_pred_rf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print(f'Random forest F1 Score {rf_f1}')
print(f'Random forest Accuracy {rf_accuracy}')

In [None]:
# Confusion matrix of the random forest: row = true class, column = predicted class.
cm_rf = pd.DataFrame(confusion_matrix(y_test, y_pred_rf))
cm_rf

In [None]:
# Per-class recall read off the confusion matrix (rows = true class,
# columns = predicted class; 0 = healthy, 1 = sick).
# NOTE: `sensitivity` holds the healthy-class recall and `specificity` the
# sick-class recall, matching the printed labels below.
sensitivity = cm_rf.loc[0, 0] / cm_rf.loc[0].sum()
specificity = cm_rf.loc[1, 1] / cm_rf.loc[1].sum()

print('Доля верно обнаруженных здоровых : ', sensitivity )
print('Доля верно обнаруженных больных : ', specificity)

In [None]:
# Random-forest metrics in summary-table order: accuracy, F1, sick-class recall
# (`specificity`), healthy-class recall (`sensitivity`).
RF_metric = [accuracy_score(y_test, y_pred_rf), f1_score(y_test, y_pred_rf), specificity, sensitivity]
RF_metric

## 3.4 Catboost

In [None]:
from catboost import CatBoostClassifier

# Gradient-boosted trees with default settings; verbose=False silences the training log.
CatBoost = CatBoostClassifier(verbose=False)
CatBoost.fit(X_train, y_train)

In [None]:
# Evaluate CatBoost on the held-out spectra.
y_pred_cb = CatBoost.predict(X_test)

cb_f1 = f1_score(y_test, y_pred_cb)
cb_accuracy = accuracy_score(y_test, y_pred_cb)

print(f'CatBoost F1 Score {cb_f1}')
print(f'CatBoost Accuracy {cb_accuracy}')

In [None]:
# Confusion matrix of CatBoost: row = true class, column = predicted class.
cm_cb = pd.DataFrame(confusion_matrix(y_test, y_pred_cb))
cm_cb

In [None]:
# Per-class recall read off the confusion matrix (rows = true class,
# columns = predicted class; 0 = healthy, 1 = sick).
# NOTE: `sensitivity` holds the healthy-class recall and `specificity` the
# sick-class recall, matching the printed labels below.
sensitivity = cm_cb.loc[0, 0] / cm_cb.loc[0].sum()
specificity = cm_cb.loc[1, 1] / cm_cb.loc[1].sum()

print('Доля верно обнаруженных здоровых : ', sensitivity )
print('Доля верно обнаруженных больных : ', specificity)

In [None]:
# CatBoost metrics in summary-table order: accuracy, F1, sick-class recall
# (`specificity`), healthy-class recall (`sensitivity`).
CB_metric = [accuracy_score(y_test, y_pred_cb), f1_score(y_test, y_pred_cb), specificity, sensitivity]
CB_metric

## 3.5 Ансамбль моделей

Дадим CatBoost и RF вес 2, как лучшим моделям

In [None]:
# Weighted vote of the four models (LR and linear SVM weigh 1, RF and CatBoost
# weigh 2, so the sum ranges from 0 to 6); predict sick when the sum is >= 0.5.
ensemble = pd.DataFrame(y_test)

ensemble['sum'] = (y_pred_lr + y_pred_svm_lnr) * 1 + (y_pred_rf + y_pred_cb) * 2
ensemble['target_pred'] = (ensemble['sum'] >= 0.5).astype('int64')

ensemble

In [None]:
# Score the thresholded ensemble vote against the held-out labels.
f1 = f1_score(ensemble['target'], ensemble['target_pred'])
acc = accuracy_score(ensemble['target'], ensemble['target_pred'])

print(f'Ensemble F1 Score {f1}')
print(f'Ensemble Accuracy {acc}')

In [None]:
# Confusion matrix of the ensemble: row = true class, column = predicted class.
cm_ens = pd.DataFrame(confusion_matrix(ensemble['target'], ensemble['target_pred']))
cm_ens

In [None]:
# Per-class recall read off the confusion matrix (rows = true class,
# columns = predicted class; 0 = healthy, 1 = sick).
# NOTE: `sensitivity` holds the healthy-class recall and `specificity` the
# sick-class recall, matching the printed labels below.
sensitivity = cm_ens.loc[0, 0] / cm_ens.loc[0].sum()
specificity = cm_ens.loc[1, 1] / cm_ens.loc[1].sum()

print('Доля верно обнаруженных здоровых : ', sensitivity )
print('Доля верно обнаруженных больных : ', specificity)

In [None]:
# Ensemble (threshold 0.5) metrics in summary-table order: accuracy, F1,
# sick-class recall (`specificity`), healthy-class recall (`sensitivity`).
ENS_1_metric = [acc, f1, specificity, sensitivity]
ENS_1_metric

Выберем порог < 1.5

In [None]:
# Weighted vote of the four models (LR and linear SVM weigh 1, RF and CatBoost
# weigh 2, so the sum ranges from 0 to 6); predict sick when the sum is >= 1.5.
ensemble = pd.DataFrame(y_test)

ensemble['sum'] = (y_pred_lr + y_pred_svm_lnr) * 1 + (y_pred_rf + y_pred_cb) * 2
ensemble['target_pred'] = (ensemble['sum'] >= 1.5).astype('int64')

ensemble

In [None]:
# Score the thresholded ensemble vote against the held-out labels.
f1 = f1_score(ensemble['target'], ensemble['target_pred'])
acc = accuracy_score(ensemble['target'], ensemble['target_pred'])

print(f'Ensemble F1 Score {f1}')
print(f'Ensemble Accuracy {acc}')

In [None]:
# Confusion matrix of the ensemble: row = true class, column = predicted class.
cm_ens = pd.DataFrame(confusion_matrix(ensemble['target'], ensemble['target_pred']))
cm_ens

In [None]:
# Per-class recall read off the confusion matrix (rows = true class,
# columns = predicted class; 0 = healthy, 1 = sick).
# NOTE: `sensitivity` holds the healthy-class recall and `specificity` the
# sick-class recall, matching the printed labels below.
sensitivity = cm_ens.loc[0, 0] / cm_ens.loc[0].sum()
specificity = cm_ens.loc[1, 1] / cm_ens.loc[1].sum()

print('Доля верно обнаруженных здоровых : ', sensitivity )
print('Доля верно обнаруженных больных : ', specificity)

In [None]:
# Ensemble (threshold 1.5) metrics in summary-table order: accuracy, F1,
# sick-class recall (`specificity`), healthy-class recall (`sensitivity`).
ENS_2_metric = [acc, f1, specificity, sensitivity]
ENS_2_metric

Выберем порог < 2.5

In [None]:
# Weighted vote of the four models (LR and linear SVM weigh 1, RF and CatBoost
# weigh 2, so the sum ranges from 0 to 6); predict sick when the sum is >= 2.5.
ensemble = pd.DataFrame(y_test)

ensemble['sum'] = (y_pred_lr + y_pred_svm_lnr) * 1 + (y_pred_rf + y_pred_cb) * 2
ensemble['target_pred'] = (ensemble['sum'] >= 2.5).astype('int64')

ensemble

In [None]:
# Score the thresholded ensemble vote against the held-out labels.
f1 = f1_score(ensemble['target'], ensemble['target_pred'])
acc = accuracy_score(ensemble['target'], ensemble['target_pred'])

print(f'Ensemble F1 Score {f1}')
print(f'Ensemble Accuracy {acc}')

In [None]:
# Confusion matrix of the ensemble: row = true class, column = predicted class.
cm_ens = pd.DataFrame(confusion_matrix(ensemble['target'], ensemble['target_pred']))
cm_ens

In [None]:
# Per-class recall read off the confusion matrix (rows = true class,
# columns = predicted class; 0 = healthy, 1 = sick).
# NOTE: `sensitivity` holds the healthy-class recall and `specificity` the
# sick-class recall, matching the printed labels below.
sensitivity = cm_ens.loc[0, 0] / cm_ens.loc[0].sum()
specificity = cm_ens.loc[1, 1] / cm_ens.loc[1].sum()

print('Доля верно обнаруженных здоровых : ', sensitivity )
print('Доля верно обнаруженных больных : ', specificity)

In [None]:
# Ensemble (threshold 2.5) metrics in summary-table order: accuracy, F1,
# sick-class recall (`specificity`), healthy-class recall (`sensitivity`).
ENS_3_metric = [acc, f1, specificity, sensitivity]
ENS_3_metric

Выберем порог < 3.5

In [None]:
# Weighted vote of the four models (LR and linear SVM weigh 1, RF and CatBoost
# weigh 2, so the sum ranges from 0 to 6); predict sick when the sum is >= 3.5.
ensemble = pd.DataFrame(y_test)

ensemble['sum'] = (y_pred_lr + y_pred_svm_lnr) * 1 + (y_pred_rf + y_pred_cb) * 2
ensemble['target_pred'] = (ensemble['sum'] >= 3.5).astype('int64')

ensemble

In [None]:
# Score the thresholded ensemble vote against the held-out labels.
f1 = f1_score(ensemble['target'], ensemble['target_pred'])
acc = accuracy_score(ensemble['target'], ensemble['target_pred'])

print(f'Ensemble F1 Score {f1}')
print(f'Ensemble Accuracy {acc}')

In [None]:
# Confusion matrix of the ensemble: row = true class, column = predicted class.
cm_ens = pd.DataFrame(confusion_matrix(ensemble['target'], ensemble['target_pred']))
cm_ens

In [None]:
# Per-class recall read off the confusion matrix (rows = true class,
# columns = predicted class; 0 = healthy, 1 = sick).
# NOTE: `sensitivity` holds the healthy-class recall and `specificity` the
# sick-class recall, matching the printed labels below.
sensitivity = cm_ens.loc[0, 0] / cm_ens.loc[0].sum()
specificity = cm_ens.loc[1, 1] / cm_ens.loc[1].sum()

print('Доля верно обнаруженных здоровых : ', sensitivity )
print('Доля верно обнаруженных больных : ', specificity)

In [None]:
# Ensemble (threshold 3.5) metrics in summary-table order: accuracy, F1,
# sick-class recall (`specificity`), healthy-class recall (`sensitivity`).
ENS_4_metric = [acc, f1, specificity, sensitivity]
ENS_4_metric

Выберем порог < 4.5

In [None]:
# Weighted vote of the four models (LR and linear SVM weigh 1, RF and CatBoost
# weigh 2, so the sum ranges from 0 to 6); predict sick when the sum is >= 4.5.
ensemble = pd.DataFrame(y_test)

ensemble['sum'] = (y_pred_lr + y_pred_svm_lnr) * 1 + (y_pred_rf + y_pred_cb) * 2
ensemble['target_pred'] = (ensemble['sum'] >= 4.5).astype('int64')

ensemble

In [None]:
# Score the thresholded ensemble vote against the held-out labels.
f1 = f1_score(ensemble['target'], ensemble['target_pred'])
acc = accuracy_score(ensemble['target'], ensemble['target_pred'])

print(f'Ensemble F1 Score {f1}')
print(f'Ensemble Accuracy {acc}')

In [None]:
# Confusion matrix of the ensemble: row = true class, column = predicted class.
cm_ens = pd.DataFrame(confusion_matrix(ensemble['target'], ensemble['target_pred']))
cm_ens

In [None]:
# Per-class recall read off the confusion matrix (rows = true class,
# columns = predicted class; 0 = healthy, 1 = sick).
# NOTE: `sensitivity` holds the healthy-class recall and `specificity` the
# sick-class recall, matching the printed labels below.
sensitivity = cm_ens.loc[0, 0] / cm_ens.loc[0].sum()
specificity = cm_ens.loc[1, 1] / cm_ens.loc[1].sum()

print('Доля верно обнаруженных здоровых : ', sensitivity )
print('Доля верно обнаруженных больных : ', specificity)

In [None]:
# Ensemble (threshold 4.5) metrics in summary-table order: accuracy, F1,
# sick-class recall (`specificity`), healthy-class recall (`sensitivity`).
ENS_5_metric = [acc, f1, specificity, sensitivity]
ENS_5_metric

## 3.6 Результаты

In [None]:
# Summary table: one column per model, one row per metric.  Every *_metric
# list is ordered as accuracy, F1, sick-class recall, healthy-class recall.
res_all = pd.DataFrame()

res_all['LR'] = LR_metric
res_all['SVM'] = SVM_metric
res_all['RF_all'] = RF_metric
res_all['CB'] = CB_metric
res_all['ENS_1'] = ENS_1_metric
res_all['ENS_2'] = ENS_2_metric
res_all['ENS_3'] = ENS_3_metric
res_all['ENS_4'] = ENS_4_metric
res_all['ENS_5'] = ENS_5_metric

res_all.index = ['Точность', 'F1 мера', 'Доля верно обнаруженных больных', 'Доля верно обнаруженных здоровых']
# Round the table that was just built; the previous `res.round(3)` referred to
# a name that is never defined in this notebook.
res_all = res_all.round(3)
res_all

In [None]:
# Snapshot of the classic-ML results.  A copy is taken because later cells add
# columns to res_all in place, which would otherwise change this table too.
res_classic_ml = res_all.copy()

In [None]:
# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs('results', exist_ok=True)
res_classic_ml.to_csv('results/results_classic_ml.csv')

# 4. Кластеризация без учителя

## 4.1. TSNE

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project all spectra to 2-D with t-SNE.  The seed is fixed (same value as the
# train/test split) so the picture is reproducible between runs.
model = TSNE(learning_rate=100, random_state=42)

transformed = model.fit_transform(X)

# Show the result in two-dimensional coordinates, coloured by class label.
x_axis = transformed[:, 0]
y_axis = transformed[:, 1]

plt.scatter(x_axis, y_axis, c=y)
plt.show()

## 4.2. K-means

In [None]:
from sklearn.cluster import KMeans

# Metric curves keyed by str(k).  An explicit dtype replaces the bare
# pd.Series(), which yields an object-dtype series (with a warning on older pandas).
acc_sc = pd.Series(dtype=float)
f1_scr = pd.Series(dtype=float)
sensit = pd.Series(dtype=float)
specif = pd.Series(dtype=float)

# Exclusive upper bound for the number of clusters.  Previously stored in a
# variable called `iter`, which shadowed the Python builtin.
max_clusters = 50

for i in range (2, max_clusters):
  model = KMeans(n_clusters=i)
  model.fit(X)

  all_predictions = model.predict(X)
  A = pd.DataFrame(all_predictions)   # column 0: cluster id
  A[1] = y                            # column 1: true label (0 healthy, 1 sick)

  # Give every cluster the label of its majority class (1 on ties), which is
  # the same rule as before.  The per-cluster counts are now computed once per
  # k instead of running a groupby for every row.
  cluster_counts = A.groupby([0, 1]).size().unstack(fill_value=0).reindex(columns=[0, 1], fill_value=0)
  cluster_label = (cluster_counts[0] <= cluster_counts[1]).astype(int)
  A['pred'] = A[0].map(cluster_label)

  acc = accuracy_score(A[1], A['pred'])
  f1 = f1_score(A[1], A['pred'])

  # Row = true class, column = predicted class; `sensitivity` is the
  # healthy-class recall and `specificity` the sick-class recall.
  cm = pd.DataFrame(confusion_matrix(A[1], A['pred']))
  sensitivity = cm[0][0] / (cm[0][0] + cm[1][0])
  specificity = cm[1][1] / (cm[0][1] + cm[1][1])

  acc_sc[str(i)] = round(acc, 3)
  f1_scr[str(i)] = round(f1, 3)
  sensit[str(i)] = round(sensitivity, 3)
  specif[str(i)] = round(specificity, 3)

In [None]:
# Collect the four per-k metric series into one table (rows are str(k)).
Res = pd.DataFrame({
    'Точность': acc_sc,
    'F1 мера': f1_scr,
    'Доля верно обнаруженных здоровых': sensit,
    'Доля верно обнаруженных больных': specif,
})

In [None]:
# Plot every metric column of Res against the number of clusters.
fig, ax = plt.subplots(figsize=(20, 10))

ax.set_ylabel("Oценка", fontsize=20)
ax.set_xlabel("Количество кластеров в K-means", fontsize=20)
ax.set_title('Оценка метрик в зависимости от количества кластеров в K-means')

for metric_column in (
    'F1 мера',
    'Точность',
    'Доля верно обнаруженных здоровых',
    'Доля верно обнаруженных больных',
):
    ax.plot(Res.index, Res[metric_column], label=metric_column)

ax.legend(prop={'size': 20})

In [None]:
# Rank the cluster counts by F1, best first.
Res.sort_values(by='F1 мера', ascending=False)

In [None]:
from sklearn.cluster import KMeans

# Metric curves keyed by str(k).  An explicit dtype replaces the bare
# pd.Series(), which yields an object-dtype series (with a warning on older pandas).
acc_sc = pd.Series(dtype=float)
f1_scr = pd.Series(dtype=float)
sensit = pd.Series(dtype=float)
specif = pd.Series(dtype=float)

# Exclusive upper bound for the number of clusters.  Previously stored in a
# variable called `iter`, which shadowed the Python builtin.
max_clusters = 50
# K-means runs averaged per cluster count (no seed is passed, so runs differ).
n_repeats = 10

for i in range (15, max_clusters):

  acc = 0
  f1 = 0
  sensitivity = 0
  specificity = 0

  for j in range (n_repeats):
    model = KMeans(n_clusters=i)
    model.fit(X)

    all_predictions = model.predict(X)
    A = pd.DataFrame(all_predictions)   # column 0: cluster id
    A[1] = y                            # column 1: true label (0 healthy, 1 sick)

    # Give every cluster the label of its majority class (1 on ties), which is
    # the same rule as before.  The per-cluster counts are now computed once
    # per run instead of running a groupby for every row.
    cluster_counts = A.groupby([0, 1]).size().unstack(fill_value=0).reindex(columns=[0, 1], fill_value=0)
    cluster_label = (cluster_counts[0] <= cluster_counts[1]).astype(int)
    A['pred'] = A[0].map(cluster_label)

    acc += accuracy_score(A[1], A['pred'])
    f1 += f1_score(A[1], A['pred'])

    # Row = true class, column = predicted class; `sensitivity` accumulates the
    # healthy-class recall and `specificity` the sick-class recall.
    cm = pd.DataFrame(confusion_matrix(A[1], A['pred']))
    sensitivity += cm[0][0] / (cm[0][0] + cm[1][0])
    specificity += cm[1][1] / (cm[0][1] + cm[1][1])

  acc_sc[str(i)] = round(acc / n_repeats, 3)
  f1_scr[str(i)] = round(f1 / n_repeats, 3)
  sensit[str(i)] = round(sensitivity / n_repeats, 3)
  specif[str(i)] = round(specificity / n_repeats, 3)

In [None]:
# Collect the four averaged per-k metric series into one table (rows are str(k)).
Res = pd.DataFrame({
    'Точность': acc_sc,
    'F1 мера': f1_scr,
    'Доля верно обнаруженных здоровых': sensit,
    'Доля верно обнаруженных больных': specif,
})

In [None]:
# Plot every metric column of Res against the number of clusters.
fig, ax = plt.subplots(figsize=(20, 10))

ax.set_ylabel("Oценка", fontsize=20)
ax.set_xlabel("Количество кластеров в K-means", fontsize=20)
ax.set_title('Оценка метрик в зависимости от количества кластеров в K-means')

for metric_column in (
    'F1 мера',
    'Точность',
    'Доля верно обнаруженных здоровых',
    'Доля верно обнаруженных больных',
):
    ax.plot(Res.index, Res[metric_column], label=metric_column)

ax.legend(prop={'size': 20})

In [None]:
# The 20 cluster counts with the highest averaged F1.
Res.sort_values(by='F1 мера', ascending=False).head(20)

Выводы

In [None]:
# Keep the three cluster counts with the highest F1, then transpose so that
# metrics become rows and cluster counts become columns.
Best_3 = Res.sort_values(by='F1 мера', ascending=False).head(3)
Best_3 = Best_3.T
Best_3

In [None]:
# Rename the three best K-means configurations and append them to the summary
# table; the assignment aligns on the metric names in the row index.
kmeans_columns = ['K_means_1', 'K_means_2', 'K_means_3']
Best_3.columns = kmeans_columns
for kmeans_column in kmeans_columns:
    res_all[kmeans_column] = Best_3[kmeans_column]

res_all = res_all.round(3)
res_all

In [None]:
# Keep only the K-means columns by dropping every classic-ML and ensemble column.
res_K_means = res_all.drop(columns= ['LR', 'SVM','RF_all','CB', 'ENS_1', 'ENS_2', 'ENS_3', 'ENS_4', 'ENS_5'])

In [None]:
# Display the K-means-only results table.
res_K_means

In [None]:
# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs('results', exist_ok=True)
res_K_means.to_csv('results/result_K_means.csv')

# 5. Отбор признаков

## 5.1. Взаимная информация

Отберем 20 самых информативных Raman Shift

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between every Raman shift and the class label.  The
# estimator is randomised, so the seed is fixed to keep the ranking (and all
# feature subsets derived from it) reproducible.
# NOTE(review): the scores are computed on the full table, test rows included.
importances = mutual_info_classif(X, y, random_state=42)
# One row per feature, indexed by shift.  X.columns replaces the positional
# slice df.columns[0:len(df.columns)-1], which relied on `target` being last.
feature_importances = pd.Series(importances, X.columns)
feature_importances = pd.DataFrame(feature_importances)
feature_importances.columns = ['Importance']
feature_importances['Raman Shift'] = feature_importances.index
feature_importances.plot(x = 'Raman Shift', y = 'Importance', figsize=(20, 10))

In [None]:
# Rank the features from most to least informative and renumber the rows 0..n-1.
feature_importances = (
    feature_importances
    .sort_values(by=['Importance'], ascending=False)
    .reset_index(drop=True)
)
feature_importances

Напишем функцию для отбора n лучших признаков

In [None]:
def df_best_n_features(n):
  """Return the feature table restricted to the n top-ranked Raman shifts.

  Reads two notebook globals: `feature_importances` (already sorted by
  importance, with a 'Raman Shift' column) and `df` (full feature table with
  a 'target' column).

  Args:
    n: number of top-ranked features to keep.

  Returns:
    A new DataFrame holding the n selected columns in ranking order followed
    by the 'target' column; `df` itself is not modified.
  """
  best_n_ftr_list = list(feature_importances.head(n)['Raman Shift'])

  # Select the columns directly.  The previous version transposed the whole
  # table twice to do the same thing, and bound a local with the function's
  # own name.
  best_features = df.loc[:, best_n_ftr_list].copy()
  best_features['target'] = df['target']

  return best_features

Оставим лучшие 50 признаков

In [None]:
# Feature table restricted to the 50 top-ranked Raman shifts (plus `target`).
df_best_50_features = df_best_n_features(50)
df_best_50_features

In [None]:
# Split the 50-feature table with the same test fraction and seed as the full-feature split.
y_50 = df_best_50_features['target']
X_50 = df_best_50_features.drop(columns=['target'])

X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(
    X_50, y_50, test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# NOTE: RF is re-bound to a fresh, unseeded forest here, replacing the model
# fitted on all features; the cells below keep re-fitting this same object.
RF = RandomForestClassifier()
RF.fit(X_train_50, y_train_50)

y_pred_rf_50 = RF.predict(X_test_50)

print(f'Random forest F1 Score на 50 признаках {f1_score(y_test_50, y_pred_rf_50)}')
print(f'Random forest Accuracy на 50 признаках {accuracy_score(y_test_50, y_pred_rf_50)}')

Оставим лучшие 20 признаков

In [None]:
# Feature table restricted to the 20 top-ranked Raman shifts (plus `target`).
df_best_20_features = df_best_n_features(20)
df_best_20_features

In [None]:
# Split the 20-feature table with the same test fraction and seed as the full-feature split.
y_20 = df_best_20_features['target']
X_20 = df_best_20_features.drop(columns=['target'])

X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(
    X_20, y_20, test_size=0.2, random_state=42
)

In [None]:
# Re-fit the shared RF object on the 20-feature training part and score it.
RF.fit(X_train_20, y_train_20)

y_pred_rf_20 = RF.predict(X_test_20)

rf_20_f1 = f1_score(y_test_20, y_pred_rf_20)
rf_20_accuracy = accuracy_score(y_test_20, y_pred_rf_20)

print(f'Random forest F1 Score на 20 признаках {rf_20_f1}')
print(f'Random forest Accuracy на 20 признаках {rf_20_accuracy}')

Оставим лучшие 10 признаков

In [None]:
# Feature table restricted to the 10 top-ranked Raman shifts (plus `target`).
df_best_10_features = df_best_n_features(10)
df_best_10_features

In [None]:
# Split the 10-feature table with the same test fraction and seed as the full-feature split.
y_10 = df_best_10_features['target']
X_10 = df_best_10_features.drop(columns=['target'])

X_train_10, X_test_10, y_train_10, y_test_10 = train_test_split(
    X_10, y_10, test_size=0.2, random_state=42
)

In [None]:
# Re-fit the shared RF object on the 10-feature training part and score it.
RF.fit(X_train_10, y_train_10)

y_pred_rf_10 = RF.predict(X_test_10)

rf_10_f1 = f1_score(y_test_10, y_pred_rf_10)
rf_10_accuracy = accuracy_score(y_test_10, y_pred_rf_10)

print(f'Random forest F1 Score на 10 признаках {rf_10_f1}')
print(f'Random forest Accuracy на 10 признаках {rf_10_accuracy}')

Оставим лучшие 5 признаков

In [None]:
# Feature table restricted to the 5 top-ranked Raman shifts (plus `target`).
df_best_5_features = df_best_n_features(5)
df_best_5_features

In [None]:
# Split the 5-feature table with the same test fraction and seed as the full-feature split.
y_5 = df_best_5_features['target']
X_5 = df_best_5_features.drop(columns=['target'])

X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(
    X_5, y_5, test_size=0.2, random_state=42
)

In [None]:
# Re-fit the shared RF object on the 5-feature training part and score it.
RF.fit(X_train_5, y_train_5)

y_pred_rf_5 = RF.predict(X_test_5)

rf_5_f1 = f1_score(y_test_5, y_pred_rf_5)
rf_5_accuracy = accuracy_score(y_test_5, y_pred_rf_5)

print(f'Random forest F1 Score на 5 признаках {rf_5_f1}')
print(f'Random forest Accuracy на 5 признаках {rf_5_accuracy}')

Напишем функцию для вывода оценки метрики по количеству лучших признаков

In [None]:
def score_from_n_to_m_best_features(n, m):
  """Score the random forest for every feature count from n to m inclusive.

  For each count i the i top-ranked features are selected with
  `df_best_n_features`, split 80/20 with seed 42, and the notebook-global `RF`
  estimator is re-fitted 10 times on the same split; the scores are averaged.

  Args:
    n: smallest number of features to try.
    m: largest number of features to try (inclusive).

  Returns:
    DataFrame indexed by feature count with the columns
    'Random forest F1 Score' and 'Random forest Accuracy', rounded to 3 places.
  """
  n_repeats = 10

  bst_features = pd.DataFrame(index=range(n, m + 1),
                              columns=['Random forest F1 Score', 'Random forest Accuracy'],
                              dtype=float)

  for i in range (n, m + 1):

    f1_score_i = 0
    accuracy_i = 0

    df_best_i_features = df_best_n_features(i)

    # The split is seeded, hence identical on every repeat: build it once.
    # Only the fit differs between repeats (RF is fitted anew each time).
    X_i = df_best_i_features.drop(['target'], axis=1)
    y_i = df_best_i_features['target']
    X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X_i, y_i, test_size=0.2, random_state=42)

    for _ in range (n_repeats):
      RF.fit(X_train_i, y_train_i)
      y_pred_rf_i = RF.predict(X_test_i)
      f1_score_i += f1_score(y_test_i, y_pred_rf_i)
      accuracy_i += accuracy_score(y_test_i, y_pred_rf_i)

    # .loc writes into the frame itself; the chained form frame[col][i] = ...
    # may write to a temporary copy and is a no-op under pandas Copy-on-Write.
    bst_features.loc[i, 'Random forest F1 Score'] = round(f1_score_i / n_repeats, 3)
    bst_features.loc[i, 'Random forest Accuracy'] = round(accuracy_i / n_repeats, 3)

  return bst_features

Рассмотрим максимум 100 лучших признаков

In [None]:
# Averaged random-forest scores for 1 to 100 top-ranked features.
bst_features_1_100 = score_from_n_to_m_best_features(1, 100)
bst_features_1_100

In [None]:
# Random-forest F1 and accuracy against the number of selected features (1-100).
fig, ax = plt.subplots(figsize=(20, 10))

ax.set_ylabel("Oценка Random Forest", fontsize=20)
ax.set_xlabel("Количество отобранных признаков", fontsize=20)
ax.set_title('Оценка метрик в зависимости от признаков')

ax.plot(bst_features_1_100.index, bst_features_1_100['Random forest F1 Score'], label='Random Forest F1 Score')
ax.plot(bst_features_1_100.index, bst_features_1_100['Random forest Accuracy'], label='Random Forest Accuracy')

ax.legend(prop={'size': 30})

Сократим с 1-100 до 5-30

In [None]:
# Averaged random-forest scores for 5 to 30 top-ranked features.
bst_features_5_30 = score_from_n_to_m_best_features(5, 30)
bst_features_5_30

In [None]:
# Random-forest F1 and accuracy against the number of selected features (5-30).
fig, ax = plt.subplots(figsize=(11, 4))

ax.set_ylabel("Oценка")
ax.set_xlabel("Количество отобранных признаков")
ax.set_title('Оценка метрик в зависимости от признаков')

ax.plot(bst_features_5_30.index, bst_features_5_30['Random forest F1 Score'], label='Random forest F1 Score')
ax.plot(bst_features_5_30.index, bst_features_5_30['Random forest Accuracy'], label='Random forest Accuracy')

ax.legend()

Сократим с 5-30 до 5-15

In [None]:
# Averaged random-forest scores for 5 to 15 top-ranked features.
bst_features_5_15 = score_from_n_to_m_best_features(5, 15)
bst_features_5_15

In [None]:
# Random-forest F1 and accuracy against the number of selected features (5-15).
fig, ax = plt.subplots(figsize=(20, 10))

ax.set_ylabel("Oценка Random Forest", fontsize=20)
ax.set_xlabel("Количество отобранных признаков", fontsize=20)
ax.set_title('Оценка метрик в зависимости от признаков')

ax.plot(bst_features_5_15.index, bst_features_5_15['Random forest F1 Score'], label='Random Forest F1 Score')
ax.plot(bst_features_5_15.index, bst_features_5_15['Random forest Accuracy'], label='Random Forest Accuracy')

ax.legend(prop={'size': 30})

Видно падение на 7, поэтому рассмотрим 6–12

In [None]:
# Averaged random-forest scores for 6 to 12 top-ranked features.
bst_features_6_12 = score_from_n_to_m_best_features(6, 12)
bst_features_6_12

In [None]:
# Random-forest F1 and accuracy against the number of selected features (6-12).
fig, ax = plt.subplots(figsize=(11, 4))

ax.set_ylabel("Oценка")
ax.set_xlabel("Количество отобранных признаков")
ax.set_title('Оценка метрик в зависимости от признаков')

ax.plot(bst_features_6_12.index, bst_features_6_12['Random forest F1 Score'], label='Random forest F1 Score')
ax.plot(bst_features_6_12.index, bst_features_6_12['Random forest Accuracy'], label='Random forest Accuracy')

ax.legend()

Серия экспериментов с усреднёнными результатами показывает, что оптимальным выбором будет 7 либо 10 признаков

Рассмотрим метрики на 7 и 10 признаках

In [None]:
# Feature table restricted to the 7 top-ranked Raman shifts (plus `target`).
df_best_7_features = df_best_n_features(7)
df_best_7_features

In [None]:
# Split the 7-feature table with the same test fraction and seed as the full-feature split.
y_7 = df_best_7_features['target']
X_7 = df_best_7_features.drop(columns=['target'])

X_train_7, X_test_7, y_train_7, y_test_7 = train_test_split(
    X_7, y_7, test_size=0.2, random_state=42
)

In [None]:
# Random forest on the 7 top-ranked features.
RF.fit(X_train_7, y_train_7)
y_pred_rf_7 = RF.predict(X_test_7)

# The confusion matrix now uses this experiment's own labels (y_test_7).  It
# previously used y_test from the full-feature split, which only matched
# because both splits share the same seed and row order.
cm_rf_7 = pd.DataFrame(confusion_matrix(y_test_7, y_pred_rf_7))
# Row = true class, column = predicted class: `sensitivity` is the
# healthy-class recall and `specificity` the sick-class recall.
sensitivity = cm_rf_7[0][0] / (cm_rf_7[0][0] + cm_rf_7[1][0])
specificity = cm_rf_7[1][1] / (cm_rf_7[0][1] + cm_rf_7[1][1])

# Summary-table order: accuracy, F1, sick-class recall, healthy-class recall.
RF_7_metric = [accuracy_score(y_test_7, y_pred_rf_7), f1_score(y_test_7, y_pred_rf_7), specificity, sensitivity]
RF_7_metric

In [None]:
# Random forest on the 10 top-ranked features (split built in an earlier cell).
RF.fit(X_train_10, y_train_10)
y_pred_rf_10 = RF.predict(X_test_10)

# The confusion matrix now uses this experiment's own labels (y_test_10).  It
# previously used y_test from the full-feature split, which only matched
# because both splits share the same seed and row order.
cm_rf_10 = pd.DataFrame(confusion_matrix(y_test_10, y_pred_rf_10))
# Row = true class, column = predicted class: `sensitivity` is the
# healthy-class recall and `specificity` the sick-class recall.
sensitivity = cm_rf_10[0][0] / (cm_rf_10[0][0] + cm_rf_10[1][0])
specificity = cm_rf_10[1][1] / (cm_rf_10[0][1] + cm_rf_10[1][1])

# Summary-table order: accuracy, F1, sick-class recall, healthy-class recall.
RF_10_metric = [accuracy_score(y_test_10, y_pred_rf_10), f1_score(y_test_10, y_pred_rf_10), specificity, sensitivity]
RF_10_metric

In [None]:
# Append the 7- and 10-feature random-forest results to the summary table.
res_all['RF_best_7'] = RF_7_metric
res_all['RF_best_10'] = RF_10_metric

# Round the table that was just extended; the previous `res.round(3)` referred
# to an undefined name and would have discarded the two new columns.
res_all = res_all.round(3)
res_all

In [None]:
# Keep only the random-forest columns by dropping every other model's column.
res_RF_best = res_all.drop(columns=[
    'LR', 'SVM', 'CB',
    'ENS_1', 'ENS_2', 'ENS_3', 'ENS_4', 'ENS_5',
    'K_means_1', 'K_means_2', 'K_means_3',
])

In [None]:
# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs('results', exist_ok=True)
res_RF_best.to_csv('results/result_RF_best.csv')

## 5.2. ExhaustiveFeatureSelector

Рассмотрим все комбинации от 6 до 8 из 10 самых информативных Raman Shift

In [None]:
import joblib
import sklearn

# Compatibility shim: expose the standalone joblib package under the legacy
# attribute sklearn.externals.joblib (presumably needed by the mlxtend import
# further down — TODO confirm).
sklearn.externals.joblib = joblib

print(sklearn.externals.joblib)

In [None]:
import sys

# Register joblib under the legacy module path as well, so that
# `import sklearn.externals.joblib` resolves to it.
sys.modules['sklearn.externals.joblib'] = joblib

In [None]:
# Rebuild the 10-feature table and its split for the exhaustive search below.
df_best_10_features = df_best_n_features(10)

y_10 = df_best_10_features['target']
X_10 = df_best_10_features.drop(columns=['target'])

X_train_10, X_test_10, y_train_10, y_test_10 = train_test_split(
    X_10, y_10, test_size=0.2, random_state=42
)

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import GradientBoostingClassifier

# Create the ExhaustiveFeatureSelector: every subset of 6 to 8 of the 10
# candidate features is scored with a random forest, using F1 and 2-fold CV.
efs = ExhaustiveFeatureSelector(RandomForestClassifier(),
        min_features=6,
        max_features=8,
        scoring='f1',
        cv=2)

efs = efs.fit(X_train_10, y_train_10)

# Print the selected features (column labels of the best subset).
selected_features = X_train_10.columns[list(efs.best_idx_)]
print(selected_features)

In [None]:
# Feature table restricted to the subset chosen by the exhaustive search.
# Columns are selected directly; the previous version transposed the whole
# table twice (via a temporary df_t) to obtain the same result.
df_efs = df.loc[:, list(selected_features)].copy()
df_efs['target'] = df['target']
df_efs

In [None]:
# Split the EFS-selected table with the same test fraction and seed as the other experiments.
y_efs1 = df_efs['target']
X_efs1 = df_efs.drop(columns=['target'])

X_efs1_train, X_efs1_test, y_efs1_train, y_efs1_test = train_test_split(
    X_efs1, y_efs1, test_size=0.2, random_state=42
)

In [None]:
# Fresh random forest on the first EFS-selected feature subset.
RF = RandomForestClassifier()
RF.fit(X_efs1_train, y_efs1_train)
y_pred_efs1 = RF.predict(X_efs1_test)

# Row = true class, column = predicted class (0 = healthy, 1 = sick):
# `sensitivity` is the healthy-class recall, `specificity` the sick-class recall.
cm_rf_efs = pd.DataFrame(confusion_matrix(y_efs1_test, y_pred_efs1))
sensitivity = cm_rf_efs.loc[0, 0] / cm_rf_efs.loc[0].sum()
specificity = cm_rf_efs.loc[1, 1] / cm_rf_efs.loc[1].sum()

# Summary-table order: accuracy, F1, sick-class recall, healthy-class recall.
RF_EFS_1_metric = [
    accuracy_score(y_efs1_test, y_pred_efs1),
    f1_score(y_efs1_test, y_pred_efs1),
    specificity,
    sensitivity,
]
RF_EFS_1_metric

Рассмотрим все комбинации от 8 до 10 из 12 самых информативных Raman Shift

In [None]:
# Build the 12-feature table and its split for the second exhaustive search.
df_best_12_features = df_best_n_features(12)

y_12 = df_best_12_features['target']
X_12 = df_best_12_features.drop(columns=['target'])

X_train_12, X_test_12, y_train_12, y_test_12 = train_test_split(
    X_12, y_12, test_size=0.2, random_state=42
)

In [None]:
# Create the ExhaustiveFeatureSelector: every subset of 8 to 10 of the 12
# candidate features is scored with a random forest and 2-fold CV.
# NOTE(review): this search scores by ROC AUC while the 10-feature search
# above used F1 — confirm the difference is intended.
efs = ExhaustiveFeatureSelector(RandomForestClassifier(),
        min_features=8,
        max_features=10,
        scoring='roc_auc',
        cv=2)

efs = efs.fit(X_train_12, y_train_12)

# Print the selected features (column labels of the best subset).
selected_features = X_train_12.columns[list(efs.best_idx_)]
print(selected_features)

In [None]:
# The 12 top-ranked features, for comparison with the subset printed above.
feature_importances.head(12)

In [None]:
# Feature table restricted to the subset chosen by the second exhaustive search.
# Columns are selected directly; the previous version transposed the whole
# table twice (via a temporary df_t) to obtain the same result.
df_efs = df.loc[:, list(selected_features)].copy()
df_efs['target'] = df['target']
df_efs

In [None]:
# Split the EFS-selected table with the same test fraction and seed as the other experiments.
y_efs2 = df_efs['target']
X_efs2 = df_efs.drop(columns=['target'])

X_efs2_train, X_efs2_test, y_efs2_train, y_efs2_test = train_test_split(
    X_efs2, y_efs2, test_size=0.2, random_state=42
)

In [None]:
# Re-fit the shared RF object on the second EFS-selected feature subset.
RF.fit(X_efs2_train, y_efs2_train)
y_pred_efs2 = RF.predict(X_efs2_test)

# Row = true class, column = predicted class (0 = healthy, 1 = sick):
# `sensitivity` is the healthy-class recall, `specificity` the sick-class recall.
cm_rf_efs = pd.DataFrame(confusion_matrix(y_efs2_test, y_pred_efs2))
sensitivity = cm_rf_efs.loc[0, 0] / cm_rf_efs.loc[0].sum()
specificity = cm_rf_efs.loc[1, 1] / cm_rf_efs.loc[1].sum()

# Summary-table order: accuracy, F1, sick-class recall, healthy-class recall.
RF_EFS_2_metric = [
    accuracy_score(y_efs2_test, y_pred_efs2),
    f1_score(y_efs2_test, y_pred_efs2),
    specificity,
    sensitivity,
]
RF_EFS_2_metric

In [None]:
# Append the two exhaustive-search random-forest results to the summary table.
res_all['RF_EFS_1_6'] = RF_EFS_1_metric
res_all['RF_EFS_2_9'] = RF_EFS_2_metric

# Round the table that was just extended; the previous `res.round(3)` referred
# to an undefined name and would have discarded the two new columns.
res_all = res_all.round(3)
res_all

## 5.3. RFE

In [None]:
from sklearn import preprocessing

from sklearn.feature_selection import RFE

In [None]:
def rfe_from_n_to_m_best_features_on_k_features(n, m, k):
  """Score RFE around logistic regression for every kept-feature count in n..m.

  The frame returned by ``df_best_n_features(k)`` is split once into train and
  test parts (80/20, ``random_state=42``). For each ``i`` in ``n..m`` an ``RFE``
  wrapper around ``LogisticRegression`` is fitted on the training part with
  ``n_features_to_select=i`` and evaluated on the test part.

  Parameters
  ----------
  n, m : int
      Inclusive bounds for the number of features RFE keeps.
  k : int
      Argument passed to ``df_best_n_features`` to build the candidate pool.

  Returns
  -------
  pandas.DataFrame
      Indexed by ``i`` in ``n..m`` with the unrounded test-set scores in the
      columns ``'RFE F1 Score'`` and ``'RFE Accuracy'``.
  """
  rfe_bst_features = pd.DataFrame(index=range(n, m + 1), columns=['RFE F1 Score', 'RFE Accuracy'])

  df_best_k_features = df_best_n_features(k)

  X_k = df_best_k_features.drop(['target'], axis=1)
  y_k = df_best_k_features['target']

  X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(X_k, y_k, test_size=0.2, random_state=42)

  for i in range(n, m + 1):

    # NOTE(review): max_iter=10 is kept as in the original; it is a small
    # iteration budget for lbfgs and may stop before convergence — confirm it is intended.
    _lr = LogisticRegression(class_weight='balanced', solver='lbfgs', random_state=42, n_jobs=-1, max_iter=10)

    rfe = RFE(_lr, n_features_to_select=i)
    rfe.fit(X_train_k, y_train_k)
    y_pred_rfe = rfe.predict(X_test_k)

    # Write with .loc[row, column]: the chained form frame[column][row] = value
    # assigns into a temporary under pandas Copy-on-Write and leaves the frame empty.
    rfe_bst_features.loc[i, 'RFE F1 Score'] = f1_score(y_test_k, y_pred_rfe)
    rfe_bst_features.loc[i, 'RFE Accuracy'] = accuracy_score(y_test_k, y_pred_rfe)

  return rfe_bst_features

In [None]:
# RFE scores for 1..50 kept features out of a 50-feature pool.
bst_features_1_50 = rfe_from_n_to_m_best_features_on_k_features(1, 50, 50)
bst_features_1_50

In [None]:
# Test-set F1 and accuracy of RFE against the number of kept features (1..50).
plt.figure(figsize=(20, 10))

# The plotted scores come from rfe.predict, i.e. from the LogisticRegression
# wrapped by RFE, not from a random forest, so the axis is labelled accordingly.
plt.ylabel("Оценка RFE (LogisticRegression)", fontsize=20)
plt.xlabel("Количество отобранных признаков", fontsize=20)

plt.title('Оценка метрик в зависимости от признаков')

plt.plot(bst_features_1_50.index, bst_features_1_50['RFE F1 Score'], label='RFE F1 Score')
plt.plot(bst_features_1_50.index, bst_features_1_50['RFE Accuracy'], label='RFE Accuracy')

plt.legend(prop={'size': 30})

In [None]:
# RFE scores for 3..30 kept features out of a 50-feature pool.
# NOTE(review): the "_5_30" suffix does not match the lower bound 3 passed here.
bst_features_5_30 = rfe_from_n_to_m_best_features_on_k_features(3, 30, 50)
bst_features_5_30

In [None]:
# Zoomed view of the RFE test-set scores stored in bst_features_5_30.
plt.figure(figsize=(20, 10))

# The plotted scores come from rfe.predict, i.e. from the LogisticRegression
# wrapped by RFE, not from a random forest, so the axis is labelled accordingly.
plt.ylabel("Оценка RFE (LogisticRegression)", fontsize=20)
plt.xlabel("Количество отобранных признаков", fontsize=20)

plt.title('Оценка метрик в зависимости от признаков')

plt.plot(bst_features_5_30.index, bst_features_5_30['RFE F1 Score'], label='RFE F1 Score')
plt.plot(bst_features_5_30.index, bst_features_5_30['RFE Accuracy'], label='RFE Accuracy')

plt.legend(prop={'size': 30})

Теперь посмотрим, какие признаки отбирались чаще всего.

In [None]:
def get_best_features(n, m, k):
  """Count how often RFE keeps each candidate feature.

  The frame returned by ``df_best_n_features(k)`` is split 80/20
  (``random_state=42``). For each ``i`` in ``n..m`` an ``RFE`` wrapper around
  ``LogisticRegression`` is fitted on the training part with
  ``n_features_to_select=i`` and its ``support_`` mask is added to a running
  per-feature counter.

  Parameters
  ----------
  n, m : int
      Inclusive bounds for the number of features RFE keeps.
  k : int
      Argument passed to ``df_best_n_features`` to build the candidate pool.

  Returns
  -------
  pandas.DataFrame
      One row per candidate column, indexed 1..k by column position, with a
      single integer column ``'Sum'`` holding the number of runs in which that
      column was kept.
  """
  df_best_k_features = df_best_n_features(k)
  X_k = df_best_k_features.drop(['target'], axis=1)
  y_k = df_best_k_features['target']

  # Only the training part is needed: the selection mask is read from the fit itself.
  X_train_k, _, y_train_k, _ = train_test_split(X_k, y_k, test_size=0.2, random_state=42)

  # Accumulate positionally in a NumPy array. Adding a 0-based Series to a
  # 1-based frame column aligns on index labels, which shifts every count by
  # one feature and turns the last row into NaN.
  selection_counts = np.zeros(X_train_k.shape[1], dtype=int)

  for i in range(n, m + 1):

    # NOTE(review): max_iter=10 is kept as in the original; confirm the low
    # iteration budget is intended.
    _lr = LogisticRegression(class_weight='balanced', solver='lbfgs', random_state=42, n_jobs=-1, max_iter=10)

    rfe = RFE(_lr, n_features_to_select=i)
    rfe.fit(X_train_k, y_train_k)

    selection_counts += rfe.support_.astype(int)

  return pd.DataFrame({'Sum': selection_counts}, index=range(1, len(selection_counts) + 1))

In [None]:
# Count RFE selections for 5..15 kept features out of a 20-feature pool.
# NOTE(review): the "_5_20" suffix does not match the (5, 15, 20) arguments.
get_bst_features_5_20 = get_best_features(5, 15, 20)
feat = feature_importances.head(20)
best_20_ftr_list = list(feat['Raman Shift'])
# Positional relabel: row p of the counts receives the p-th Raman Shift of feature_importances.
# assumes the pool columns are ordered like feature_importances.head(20) — TODO confirm
get_bst_features_5_20.index = best_20_ftr_list
get_bst_features_5_20 = get_bst_features_5_20.sort_values(by='Sum', ascending=False)
get_bst_features_5_20.head(10)

In [None]:
# Labels of the 10 most frequently selected features.
get_bst_features_5_20.head(10).index

In [None]:
# NOTE(review): hard-coded Raman Shift labels, presumably copied from the output of
# get_bst_features_5_20.head(10).index; they go stale if that ranking changes.
ftr_list = [2276, 2310, 2188, 2313, 2297, 2254, 2296, 2317, 2262, 3273]
# Select those columns via transpose / row selection / transpose back.
df_t = df.T
df_rfe = df_t.loc[ftr_list]
df_rfe = df_rfe.T
# Re-attach the label column from the full frame.
df_rfe['target'] = df.target
df_rfe

In [None]:
# Predictors / label for the RFE-selected feature set.
X_rfe = df_rfe.drop(['target'], axis=1)
y_rfe = df_rfe['target']

# 80/20 split with a fixed seed.
X_rfe_train, X_rfe_test, y_rfe_train, y_rfe_test = train_test_split(X_rfe, y_rfe, test_size=0.2, random_state=42)

In [None]:
# Fresh random forest (default hyperparameters, no fixed seed) on the RFE feature set.
RF = RandomForestClassifier()
RF.fit(X_rfe_train, y_rfe_train)
y_pred_rfe = RF.predict(X_rfe_test)

# A DataFrame is indexed column-first, so cm[c][r] is the count of rows whose
# true label has index r and whose predicted label has index c.
cm_rf_rfe = pd.DataFrame(confusion_matrix(y_rfe_test, y_pred_rfe))
# NOTE(review): as written, `sensitivity` is the recall of the first label and
# `specificity` the recall of the second label; confirm this matches how `target` is encoded.
sensitivity = cm_rf_rfe[0][0] / (cm_rf_rfe[0][0] + cm_rf_rfe[1][0])
specificity = cm_rf_rfe[1][1] / (cm_rf_rfe[0][1] + cm_rf_rfe[1][1])

# Metric order: accuracy, F1, specificity, sensitivity.
RF_RFE_metric = [accuracy_score(y_rfe_test, y_pred_rfe), f1_score(y_rfe_test, y_pred_rfe), specificity, sensitivity]
RF_RFE_metric

In [None]:
# Add the RFE metrics as a new column of the comparison table.
res_all['RFE'] = RF_RFE_metric
# NOTE(review): res_all is rebound to a rounded copy of `res`; unless both names refer
# to the same object the 'RFE' column added above is lost — confirm the intended source.
res_all = res.round(3)
res_all

In [None]:
# Reduced view of the results table: baseline-model columns removed and the
# feature-selection columns given shorter names. `res` itself is not modified.
_res = (
    res
    .drop(columns=['LR', 'SVM', 'CB', 'ENS_1', 'ENS_2', 'ENS_5', 'ENS_4', 'K_means_1', 'K_means_2', 'K_means_3'])
    .rename(columns={
        'ВИ_7_лучших': 'RF_ВИ_7',
        'ВИ_10_лучших': 'RF_ВИ_10',
        'RF_EFS_1_6_признаков': 'RF_EFS_6',
        'RF_EFS_2_9_признаков': 'RF_EFS_9',
    })
)
_res

## 5.4. Random Forest Importance

In [None]:
# Create a random forest classifier with the chosen hyperparameters
model = RandomForestClassifier(n_estimators=100)

# Fit the model on the data set, where X and y are the inputs and the targets respectively.
model.fit(X, y)

# Read the per-feature importances from the fitted forest
importances = model.feature_importances_

# Build a separate frame for visualisation
final_df = pd.DataFrame({"Raman Shift" : pd.DataFrame(X).columns, "Importances" : importances})
final_df

In [None]:
# Importance of every feature, plotted against its Raman Shift label.
final_df.plot(x = 'Raman Shift', y = 'Importances', figsize=(20, 8))

In [None]:
# Rank features from most to least important; read by df_best_n_features_rfi.
final_df_sort = final_df.sort_values(by=['Importances'], ascending=False)
final_df_sort

In [None]:
def df_best_n_features_rfi(n):
  """Return the n most important columns of ``df`` together with the target.

  The column labels are the first ``n`` ``'Raman Shift'`` values of the global
  ``final_df_sort`` (sorted by descending random-forest importance).

  Parameters
  ----------
  n : int
      Number of top-ranked features to keep.

  Returns
  -------
  pandas.DataFrame
      A new frame with the selected feature columns, in ranking order,
      followed by a ``'target'`` column taken from ``df``.
  """
  best_n_ftr_list = list(final_df_sort.head(n)['Raman Shift'])

  # Select the columns directly instead of transposing the whole frame twice;
  # this is cheaper and keeps the original column dtypes. The copy keeps the
  # 'target' assignment below from touching `df`.
  best_features_df = df.loc[:, best_n_ftr_list].copy()
  best_features_df['target'] = df.target

  return best_features_df

In [None]:
def score_from_n_to_m_best_features_rfi(n, m, n_repeats=10):
  """Average random-forest test scores for the top-i importance-ranked features.

  For every ``i`` in ``n..m`` the frame from ``df_best_n_features_rfi(i)`` is
  split 80/20 (``random_state=42``); a fresh ``RandomForestClassifier`` is then
  trained and scored ``n_repeats`` times and the scores are averaged.

  Parameters
  ----------
  n, m : int
      Inclusive bounds for the number of top-ranked features.
  n_repeats : int, default 10
      Number of forests trained per feature count.

  Returns
  -------
  pandas.DataFrame
      Indexed by ``i`` in ``n..m`` with the mean scores, rounded to three
      decimals, in the columns ``'RFI F1 Score'`` and ``'RFI Accuracy'``.
  """
  bst_features = pd.DataFrame(index=range(n, m + 1), columns=['RFI F1 Score', 'RFI Accuracy'])

  for i in range(n, m + 1):

    df_best_i_features = df_best_n_features_rfi(i)
    X_i = df_best_i_features.drop(['target'], axis=1)
    y_i = df_best_i_features['target']

    # The seed is fixed, so every repeat would get the identical split; compute
    # it once. The repeats therefore average over the forest's randomness only.
    X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X_i, y_i, test_size=0.2, random_state=42)

    f1_total = 0
    accuracy_total = 0

    for _ in range(n_repeats):
      forest = RandomForestClassifier()
      forest.fit(X_train_i, y_train_i)
      y_pred_rf_i = forest.predict(X_test_i)
      f1_total += f1_score(y_test_i, y_pred_rf_i)
      accuracy_total += accuracy_score(y_test_i, y_pred_rf_i)

    # .loc[row, column] writes into the frame itself; the chained form
    # frame[column][row] = value does not under pandas Copy-on-Write.
    bst_features.loc[i, 'RFI F1 Score'] = round(f1_total / n_repeats, 3)
    bst_features.loc[i, 'RFI Accuracy'] = round(accuracy_total / n_repeats, 3)

  return bst_features

In [None]:
# Averaged random-forest scores for the top 1..50 importance-ranked features.
rfi_1_50 = score_from_n_to_m_best_features_rfi(1, 50)
rfi_1_50

In [None]:
# Random-forest F1 and accuracy against the number of importance-ranked features (1..50).
fig, ax = plt.subplots(figsize=(20, 10))

ax.plot(rfi_1_50.index, rfi_1_50['RFI F1 Score'], label='Random forest RFI F1 Score')
ax.plot(rfi_1_50.index, rfi_1_50['RFI Accuracy'], label='Random forest RFI Accuracy')

ax.set_title('Оценка метрик в зависимости от признаков')
ax.set_xlabel("Количество отобранных признаков", fontsize=20)
ax.set_ylabel("Oценка Random Forest", fontsize=20)

ax.legend(prop={'size': 30})

In [None]:
# Same scoring restricted to 7..15 features for a closer look.
rfi_7_15 = score_from_n_to_m_best_features_rfi(7, 15)
rfi_7_15

In [None]:
# Random-forest F1 and accuracy against the number of importance-ranked features (7..15).
fig, ax = plt.subplots(figsize=(20, 10))

ax.plot(rfi_7_15.index, rfi_7_15['RFI F1 Score'], label='Random forest RFI F1 Score')
ax.plot(rfi_7_15.index, rfi_7_15['RFI Accuracy'], label='Random forest RFI Accuracy')

ax.set_title('Оценка метрик в зависимости от признаков')
ax.set_xlabel("Количество отобранных признаков", fontsize=20)
ax.set_ylabel("Oценка Random Forest", fontsize=20)

ax.legend(prop={'size': 20})

In [None]:
# Frame with the 9 top-ranked features plus 'target'.
rfi_9_features = df_best_n_features_rfi(9)
rfi_9_features

In [None]:
# Column labels of the 9-feature frame (the selected Raman Shifts and 'target').
rfi_9_features.columns

In [None]:
# Predictors / label for the 9-feature RFI experiment.
X_rfi = rfi_9_features.drop(['target'], axis=1)
y_rfi = rfi_9_features['target']

# 80/20 split with a fixed seed.
X_rfi_train, X_rfi_test, y_rfi_train, y_rfi_test = train_test_split(X_rfi, y_rfi, test_size=0.2, random_state=42)

In [None]:
# Fresh random forest (default hyperparameters, no fixed seed) on the 9 RFI features.
RF = RandomForestClassifier()
RF.fit(X_rfi_train, y_rfi_train)
y_pred_rfi = RF.predict(X_rfi_test)

# A DataFrame is indexed column-first, so cm[c][r] is the count of rows whose
# true label has index r and whose predicted label has index c.
cm_rf_rfi = pd.DataFrame(confusion_matrix(y_rfi_test, y_pred_rfi))
# NOTE(review): as written, `sensitivity` is the recall of the first label and
# `specificity` the recall of the second label; confirm this matches how `target` is encoded.
sensitivity = cm_rf_rfi[0][0] / (cm_rf_rfi[0][0] + cm_rf_rfi[1][0])
specificity = cm_rf_rfi[1][1] / (cm_rf_rfi[0][1] + cm_rf_rfi[1][1])

# Metric order: accuracy, F1, specificity, sensitivity.
RF_RFI_metric = [accuracy_score(y_rfi_test, y_pred_rfi), f1_score(y_rfi_test, y_pred_rfi), specificity, sensitivity]
RF_RFI_metric

In [None]:
# Add the RFI metrics as a column of the results table and round every value to 3 decimals.
res['RFI'] = RF_RFI_metric
res = res.round(3)
res

In [None]:
# Reduced view of the results table: baseline-model columns removed and the
# feature-selection columns given shorter names. `res` itself is not modified.
_res = (
    res
    .drop(columns=['LR', 'SVM', 'CB', 'ENS_1', 'ENS_2', 'ENS_5', 'ENS_4', 'K_means_1', 'K_means_2', 'K_means_3'])
    .rename(columns={
        'ВИ_7_лучших': 'RF_ВИ_7',
        'ВИ_10_лучших': 'RF_ВИ_10',
        'RF_EFS_1_6_признаков': 'RF_EFS_6',
        'RF_EFS_2_9_признаков': 'RF_EFS_9',
    })
)
_res

## 5.5. PCA

In [None]:
from sklearn.decomposition import PCA

# All feature columns, without the label.
_data = df.drop(['target'], axis=1)

# Project onto 10 principal components.
# Note: the PCA is fitted on every row, including rows that a later cell puts in the test split.
PCA_output = PCA(n_components=10)
PCA_output.fit(_data)
sklearn_transformed_data = PCA_output.transform(_data)

In [None]:
# Wrap the projected array in a frame (default integer index and column labels).
_PCA = pd.DataFrame(sklearn_transformed_data)
# The label is aligned by index label — assumes df has a default 0..n-1 index; TODO confirm.
_PCA['target'] = df.target
_PCA

In [None]:
# Predictors (principal components) / label, then an 80/20 split with a fixed seed.
X_pca = _PCA.drop(['target'], axis=1)
y_pca = _PCA['target']
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y_pca, test_size=0.2, random_state=42)

In [None]:
# Fresh random forest (default hyperparameters, no fixed seed) on the PCA components.
RF = RandomForestClassifier()
RF.fit(X_train_pca, y_train_pca)

In [None]:
# Evaluate the PCA-based forest on the hold-out rows.
y_pred_rf = RF.predict(X_test_pca)

print(f'Random forest F1 Score {f1_score(y_test_pca, y_pred_rf)}')
print(f'Random forest Accuracy {accuracy_score(y_test_pca, y_pred_rf)}')

In [None]:
def pca_from_n_to_m_components(n, m, n_repeats=10):
  """Average random-forest test metrics for PCA with n..m components.

  For every component count ``i`` in ``n..m`` the feature columns of ``df``
  are projected with ``PCA(n_components=i)``, split 80/20
  (``random_state=42``) and used to train a fresh ``RandomForestClassifier``.
  This is repeated ``n_repeats`` times and the metrics are averaged.

  Note: the PCA is fitted on all rows before the split, so the projection has
  already seen the rows used for testing.

  Parameters
  ----------
  n, m : int
      Inclusive bounds for the number of principal components.
  n_repeats : int, default 10
      Number of PCA + forest fits per component count.

  Returns
  -------
  pandas.DataFrame
      Indexed by ``i`` in ``n..m``; columns ``'Random forest PCA Accuracy'``,
      ``'Random forest PCA F1 Score'``, ``'Random forest PCA Specificity'`` and
      ``'Random forest PCA Sensitivity'`` hold the means rounded to 3 decimals.
  """
  pca_scores = pd.DataFrame(index=range(n, m + 1), columns=['Random forest PCA Accuracy', 'Random forest PCA F1 Score', 'Random forest PCA Specificity', 'Random forest PCA Sensitivity'])

  _data = df.drop(['target'], axis=1)

  for i in range(n, m + 1):

    f1_total = 0
    accuracy_total = 0
    sensitivity_total = 0
    specificity_total = 0

    for _ in range(n_repeats):

      # The projection is refitted on every repeat, as in the original code.
      # NOTE(review): if the PCA solver is deterministic for this data, this
      # fit could be moved out of the repeat loop — confirm before hoisting.
      projection = PCA(n_components=i)
      projection.fit(_data)
      _PCA = pd.DataFrame(projection.transform(_data))
      _PCA['target'] = df.target

      X_pca = _PCA.drop(['target'], axis=1)
      y_pca = _PCA['target']
      X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y_pca, test_size=0.2, random_state=42)

      forest = RandomForestClassifier()
      forest.fit(X_train_pca, y_train_pca)
      y_pred_pca = forest.predict(X_test_pca)

      # Rows of the confusion matrix are true labels, columns predicted labels.
      cm = confusion_matrix(y_test_pca, y_pred_pca)
      recall_first_label = cm[0, 0] / (cm[0, 0] + cm[0, 1])
      recall_second_label = cm[1, 1] / (cm[1, 0] + cm[1, 1])

      f1_total += f1_score(y_test_pca, y_pred_pca)
      accuracy_total += accuracy_score(y_test_pca, y_pred_pca)
      # NOTE(review): the original mapping is preserved — 'Sensitivity' gets the
      # recall of the first label and 'Specificity' the recall of the second;
      # confirm this matches how `target` is encoded.
      sensitivity_total += recall_first_label
      specificity_total += recall_second_label

    # .loc[row, column] writes into the frame itself; the chained form
    # frame[column][row] = value does not under pandas Copy-on-Write.
    pca_scores.loc[i, 'Random forest PCA F1 Score'] = round(f1_total / n_repeats, 3)
    pca_scores.loc[i, 'Random forest PCA Accuracy'] = round(accuracy_total / n_repeats, 3)
    pca_scores.loc[i, 'Random forest PCA Specificity'] = round(specificity_total / n_repeats, 3)
    pca_scores.loc[i, 'Random forest PCA Sensitivity'] = round(sensitivity_total / n_repeats, 3)

  return pca_scores

In [None]:
# Averaged random-forest metrics for 1..50 principal components.
pca_1_to_50 = pca_from_n_to_m_components(1, 50)
pca_1_to_50

In [None]:
# Random-forest F1 and accuracy against the number of principal components (1..50).
fig, ax = plt.subplots(figsize=(20, 10))

ax.plot(pca_1_to_50.index, pca_1_to_50['Random forest PCA F1 Score'], label='Random forest PCA F1 Score')
ax.plot(pca_1_to_50.index, pca_1_to_50['Random forest PCA Accuracy'], label='Random forest PCA Accuracy')

ax.set_title('Оценка метрик в зависимости от количества компонент в PCA')
ax.set_xlabel("Количество компонент", fontsize=20)
ax.set_ylabel("Oценка Random Forest", fontsize=20)

ax.legend(prop={'size': 30})

In [None]:
# Same metrics restricted to 5..20 components for a closer look.
pca_5_to_20 = pca_from_n_to_m_components(5, 20)
pca_5_to_20

In [None]:
# Random-forest F1 and accuracy against the number of principal components (5..20).
fig, ax = plt.subplots(figsize=(20, 8))

ax.plot(pca_5_to_20.index, pca_5_to_20['Random forest PCA F1 Score'], label='Random forest PCA F1 Score')
ax.plot(pca_5_to_20.index, pca_5_to_20['Random forest PCA Accuracy'], label='Random forest PCA Accuracy')

ax.set_title('Оценка метрик в зависимости от количества компонент в PCA')
ax.set_xlabel("Количество компонент", fontsize=20)
ax.set_ylabel("Oценка Random Forest", fontsize=20)

ax.legend(prop={'size': 30})

In [None]:
# Transpose so that component counts become columns and metrics become rows.
# Note: this cell rebinds its own input, so running it a second time transposes the frame back.
pca_5_to_20 = pca_5_to_20.T

In [None]:
# Add the metric columns for 8 and 13 components; the values are taken by
# column label from the transposed pca_5_to_20 and assigned positionally.
res['PCA_8'] = list(pca_5_to_20[8])
res['PCA_13'] = list(pca_5_to_20[13])

res = res.round(3)
res

In [None]:
# Reduced view of the results table: baseline-model columns removed and the
# feature-selection columns given shorter names. `res` itself is not modified.
_res = (
    res
    .drop(columns=['LR', 'SVM', 'CB', 'ENS_1', 'ENS_2', 'ENS_5', 'ENS_4', 'K_means_1', 'K_means_2', 'K_means_3'])
    .rename(columns={
        'ВИ_7_лучших': 'RF_ВИ_7',
        'ВИ_10_лучших': 'RF_ВИ_10',
        'RF_EFS_1_6_признаков': 'RF_EFS_6',
        'RF_EFS_2_9_признаков': 'RF_EFS_9',
    })
)
_res

In [None]:
# Disabled export of the results table.
# NOTE(review): compression_opts is not defined in the visible part of the notebook.
#res.to_csv('all_result.zip', index=False, compression=compression_opts) 

In [None]:
# NOTE(review): this replaces the `res` table built in the cells above with the
# contents of result.csv — confirm the file holds the same columns.
res = pd.read_csv('result.csv')
# Row labels (in Russian): accuracy, F1 score, share of correctly detected sick,
# share of correctly detected healthy.
res.index = ['Точность', 'F1 мера', 'Доля верно обнаруженных больных', 'Доля верно обнаруженных здоровых']
res

## 5.6. Объединим EFS, RFI и PCA

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Feature columns without the label.
_data = df.drop(['target'], axis=1)

# 13-component PCA fitted on, and applied to, all rows.
PCA_output = PCA(n_components=13)
PCA_output.fit(_data)
sklearn_transformed_data = PCA_output.transform(_data)
# Frame of the projected data with default integer index and column labels.
_PCA = pd.DataFrame(sklearn_transformed_data)
_PCA

In [None]:
# Hard-coded Raman Shift labels for the two selections.
# NOTE(review): presumably copied from earlier EFS / RFI outputs — confirm they are current.
efs_9 = [2276, 2254, 2282, 2316, 2310, 3273, 2313, 2271, 2322]
rfi_9 = [2186, 2142, 3537, 2177, 2152, 2149, 3544, 2147, 2185]
# Union of both selections; being a set, it does not keep the list order.
features_efs_rfi = set(rfi_9 + efs_9)
features_efs_rfi

In [None]:
# Select the union of the EFS and RFI features from the full frame.
df_t = df.T
# pandas >= 2.0 raises TypeError when a set is used as a .loc indexer, so pass a
# sorted list; sorting also makes the resulting column order deterministic.
df_ens = df_t.loc[sorted(features_efs_rfi)]
df_ens = df_ens.T
# Re-attach the label column from the full frame.
df_ens['target'] = df.target
df_ens

In [None]:
# Put the PCA components next to the selected features, matching rows by index label
# (join="inner" keeps only labels present in both frames).
# Note: df_ens is rebuilt from itself, so re-running this cell adds the PCA columns again.
df_ens = pd.concat([_PCA, df_ens], axis=1, join="inner")
df_ens

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Predictors (PCA components plus selected Raman Shifts) / label.
X_ens = df_ens.drop(['target'], axis=1)
y_ens = df_ens['target']

# 80/20 split with a fixed seed.
X_ens_train, X_ens_test, y_ens_train, y_ens_test = train_test_split(X_ens, y_ens, test_size=0.2, random_state=42)

In [None]:
# Fresh random forest (default hyperparameters, no fixed seed) on the combined feature set.
RF = RandomForestClassifier()
RF.fit(X_ens_train, y_ens_train)
y_pred_ens = RF.predict(X_ens_test)

# A DataFrame is indexed column-first, so cm[c][r] is the count of rows whose
# true label has index r and whose predicted label has index c.
cm_rf_ens = pd.DataFrame(confusion_matrix(y_ens_test, y_pred_ens))
# NOTE(review): as written, `sensitivity` is the recall of the first label and
# `specificity` the recall of the second label; confirm this matches how `target` is encoded.
sensitivity = cm_rf_ens[0][0] / (cm_rf_ens[0][0] + cm_rf_ens[1][0])
specificity = cm_rf_ens[1][1] / (cm_rf_ens[0][1] + cm_rf_ens[1][1])

# Metric order: accuracy, F1, specificity, sensitivity.
RF_ENS_metric = [accuracy_score(y_ens_test, y_pred_ens), f1_score(y_ens_test, y_pred_ens), specificity, sensitivity]
RF_ENS_metric

In [None]:
# Add the combined EFS + RFI + PCA metrics as a column and round every value to 3 decimals.
res['EFS_RFI_PCA'] = RF_ENS_metric
res = res.round(3)
res

In [None]:
# Reduced view of the results table: baseline-model columns removed and the
# feature-selection columns given shorter names. `res` itself is not modified.
_res = (
    res
    .drop(columns=['LR', 'SVM', 'CB', 'ENS_1', 'ENS_2', 'ENS_5', 'ENS_4', 'K_means_1', 'K_means_2', 'K_means_3'])
    .rename(columns={
        'ВИ_7_лучших': 'RF_ВИ_7',
        'ВИ_10_лучших': 'RF_ВИ_10',
        'RF_EFS_1_6_признаков': 'RF_EFS_6',
        'RF_EFS_2_9_признаков': 'RF_EFS_9',
    })
)
_res

# 6. Наиболее важные Raman Shifts

In [None]:
# Display the full data frame (feature columns plus 'target').
df

In [None]:
# Distribution of the column labelled 2282, one box per value of 'target'.
df.boxplot(column=[2282], by='target', grid= False , color='red', figsize=(5, 10)) 