Библиотеки

In [None]:
import os
import typing as tp
from itertools import product
import warnings

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

import cv2
import tifffile as tiff

import sklearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix

Пути и прочие константы

In [None]:
IMG_PATH = '/home/jupyter-igor_busov/Seed/Hyperspectral/data/pigment/'
IMG_PATH_WHITE = IMG_PATH + 'white/'
IMG_PATH_COLORED = IMG_PATH + 'colored/'
CALIBR_WHITE_PATH_WHITE = IMG_PATH_WHITE + 'White_session_000_000_cube.tiff'
CALIBR_BLACK_PATH_WHITE = IMG_PATH_WHITE + 'Black_session_000_004_snapshot_cube.tiff'
CALIBR_WHITE_PATH_COLORED = IMG_PATH_COLORED + 'White_session_000_000_snapshot_cube.tiff'
CALIBR_BLACK_PATH_COLORED = IMG_PATH_COLORED + 'Black_session_000_001_snapshot_cube.tiff'

$\textbf{Код}$

Таблица с информацией о пигментном составе цветных изображений

In [None]:
colored_pigment = pd.read_csv(IMG_PATH + 'colored_pigment.csv')
colored_pigment.head()

In [None]:
needed_columns = [ 'меланин', 'Unnamed: 3', 'антоцианы', 'Unnamed: 5', 'Unnamed: 8']

In [None]:
# Keep only the pigment columns and the file-name column, drop the first data
# row (presumably a second header line of the spreadsheet - TODO confirm),
# give the columns English names and renumber the rows from 0.
colored_pigment = colored_pigment[needed_columns].iloc[1:].rename(columns={ 'меланин': 'melanin_scales', 
                                                                    'Unnamed: 3': 'melanin_pericarp',
                                                                    'антоцианы': 'anthocyanins_scales',
                                                                    'Unnamed: 5': 'anthocyanins_pericarp',
                                                                    'Unnamed: 8': 'file_name'
                                                                }).reset_index(drop=True)
# Encode the textual answers 'нет' / 'да' as 0 / 1.
colored_pigment = colored_pigment.replace(['нет', 'да'], [0, 1])
colored_pigment.head()

In [None]:
# Combine the per-tissue flags (scales, pericarp) into one flag per pigment:
# the two 0/1 columns are summed and the result is capped at 1, so the flag
# is set when the pigment is present in at least one of the two columns.
colored_pigment['melanin'] = (colored_pigment['melanin_scales']
                              + colored_pigment['melanin_pericarp']).clip(0,1)
colored_pigment['anthocyanins'] = (colored_pigment['anthocyanins_scales']
                              + colored_pigment['anthocyanins_pericarp']).clip(0,1)
colored_pigment.head()

класс, описывающий гиперспектральное изображение с основными используемыми в этой работе методами 

In [None]:
class Hyper_Img:
    """
    Hyperspectral image with the basic methods used in this notebook.

    On construction the cube at ``path`` is read and calibrated against the
    black/white reference cubes, segmented with a grey-level threshold, and
    reduced to one Savitzky-Golay-smoothed median per spectral channel.

    Attributes:
        path: path of the source TIFF cube.
        savgol_par: positional arguments forwarded to ``savgol_filter``
            after the data (window length, polynomial order).
        img: calibrated cube, indexed in this class as [row, column, channel].
        widht: ``img.shape[0]`` (the misspelt name is kept for existing callers).
        height: ``img.shape[1]``.
        pixels: (row, column) pairs whose thresholded value is non-zero.
        medians: smoothed per-channel medians taken over ``pixels``.
        melanin, anthocyanins: flags from the pigment table, or a negative
            status code (see ``_get_pigment``).
        pigment: text label derived from the two flags
            (``'unknown'`` when they hold a negative status code).
    """

    def __init__(self, path: str, threshold_value: float = 7.5,
                 savgol_par: tp.Tuple[int, int] = (9, 3)) -> None:
        self.savgol_par = savgol_par
        self._threshold_value = threshold_value
        self.path = path
        self.img = self._get_tiff()
        self.widht = self.img.shape[0]
        self.height = self.img.shape[1]
        self.pixels = self._get_pixels()
        self.medians = self._get_medians()
        self.melanin, self.anthocyanins = self._get_pigment()
        self.pigment: str = self._pigment_label(self.melanin, self.anthocyanins)

    @staticmethod
    def _pigment_label(melanin: int, anthocyanins: int) -> str:
        """Translate the two pigment flags into the text label stored in ``pigment``."""
        # Negative values are status codes, not flags. They are truthy, so
        # without this guard they would be labelled as "both pigments".
        if melanin < 0 or anthocyanins < 0:
            return 'unknown'
        if melanin and anthocyanins:
            return 'melanin and anthocyanins'
        if melanin:
            return 'only melanin'
        if anthocyanins:
            return 'only anthocyanins'
        return 'white'

    @staticmethod
    def wave_len(x: int, step: int = 4, begin_wave_len: int = 450) -> int:
        """
        Convert a wavelength into a channel index.

        Channels are assumed to start at ``begin_wave_len`` and to be spaced
        ``step`` apart (same unit as ``x``); the result is floored.
        """
        return int((x - begin_wave_len) // step)

    def _is_white_session(self) -> bool:
        """True when every '/'-separated part of IMG_PATH_WHITE occurs in ``path``."""
        return set(IMG_PATH_WHITE.split('/')).issubset(set(self.path.split('/')))

    def _get_tiff(self) -> np.ndarray:
        """
        Read the cube and calibrate it as (img - black) / (white - black).

        The black/white reference cubes are chosen by the directory the image
        comes from. Values below the black reference are set to 0.
        """
        img = tiff.imread(self.path)
        if self._is_white_session():
            bl_img = tiff.imread(CALIBR_BLACK_PATH_WHITE)
            wh_img = tiff.imread(CALIBR_WHITE_PATH_WHITE)
        else:
            bl_img = tiff.imread(CALIBR_BLACK_PATH_COLORED)
            wh_img = tiff.imread(CALIBR_WHITE_PATH_COLORED)
        new_img = np.where(bl_img > img, 0, img - bl_img)
        # NOTE(review): where white == black this divides by zero (nan/inf),
        # and if the cubes are unsigned integers `wh_img - bl_img` wraps
        # around where white < black - confirm the dtype of the TIFF data.
        return new_img / (wh_img - bl_img)

    def _get_pigment(self) -> tp.Tuple[int, int]:
        """
        Return the (melanin, anthocyanins) flags of the image.

        Negative pairs are status codes:
            (-3, -3)  the medians contain NaN;
            (-1, -1)  the medians are all close to 1 or all close to 0
                      (reported by ``__repr__`` as a calibration image);
            (-2, -2)  the file name is not listed in ``colored_pigment``.
        Images from the white directory are (0, 0); all others take their
        flags from the first matching row of ``colored_pigment``.

        Raises:
            NameError: if the path does not contain every part of IMG_PATH.
        """
        if not set(IMG_PATH.split('/')).issubset(set(self.path.split('.')[0].split('/'))):
            raise NameError('Error in path')

        if np.any(np.isnan(self.medians)):
            return -3, -3

        if np.allclose(self.medians, np.ones(len(self.medians), dtype=float)) or \
           np.allclose(self.medians, np.zeros(len(self.medians), dtype=float)):
            return -1, -1

        if self._is_white_session():
            return 0, 0

        # The table stores file names without the 'snapshot' / 'cube.tiff' parts.
        name: str = '_'.join([s for s in self.path.split('/')[-1].split('_')
                              if s != 'snapshot' and s != 'cube.tiff'])

        rows = colored_pigment[colored_pigment.file_name == name]
        if rows.empty:
            return -2, -2

        return rows.melanin.iloc[0], rows.anthocyanins.iloc[0]

    def _get_pixels(self) -> tp.List[tp.Tuple[int, int]]:
        """Return the (row, column) pairs, in row-major order, that pass the threshold."""
        # `threshold_bgr` is a property that re-renders and re-thresholds the
        # whole image on every access, so it is evaluated exactly once here.
        mask = self.threshold_bgr
        return [(int(x), int(y)) for x, y in np.argwhere(mask != 0)]

    def _get_medians(self) -> np.ndarray:
        """Return the per-channel median over ``pixels``, smoothed with ``savgol_filter``."""
        if len(self.pixels) == 0:
            # The median of an empty selection is NaN in every channel.
            medians = np.full(self.img.shape[2], np.nan)
        else:
            coords = np.asarray(self.pixels, dtype=int).reshape(-1, 2)
            spectra = self.img[coords[:, 0], coords[:, 1], :]  # one row per pixel
            medians = np.median(spectra, axis=0)
        return savgol_filter(medians, *self.savgol_par)

    @property
    def bgr(self) -> np.ndarray:
        """Render three channels of the cube as an 8-bit image in B, G, R order."""

        #To accurately display colors, you need to choose constants

        im_bgr = np.zeros((self.widht, self.height, 3), dtype=np.uint8)
        for position, wave in enumerate((450, 510, 630)):  # blue, green, red
            band = self.img[:, :, Hyper_Img.wave_len(wave)]
            band = (band / band.max()) * 255
            im_bgr[:, :, position] = np.clip(band, 0, 255).astype(np.uint8)

        return im_bgr

    @property
    def threshold_bgr(self) -> np.ndarray:
        """Binary mask (0 / 255) of the grey-scale rendering, cut at the threshold value."""
        im_black = cv2.cvtColor(self.bgr, cv2.COLOR_BGR2GRAY)
        _, im_thr = cv2.threshold(im_black, self._threshold_value, 255, cv2.THRESH_BINARY)
        return im_thr

    def __repr__(self) -> str:
        """
        Return a one-line description of the pigment.

        Side effect: draws the colour rendering and the segmentation mask, so
        that displaying an instance in a notebook cell also shows both images.
        """
        fig, axes = plt.subplots(1, 2)

        # imshow expects RGB, so the channel order of the BGR image is reversed.
        axes[0].imshow(self.bgr[:, :, ::-1], cmap = 'gray')
        axes[0].set_title('rgb visualization')

        axes[1].imshow(self.threshold_bgr, cmap = 'gray')
        axes[1].set_title('segmentation')

        if self.melanin == -1:
            return 'pigment: white or black image for calibration'
        elif self.melanin < 0:
            return 'unknown'
        return f'pigment: {self.pigment}'
     

Примеры изображений (визуализация не точно передает цвета)

In [None]:
Hyper_Img(CALIBR_WHITE_PATH_COLORED)

In [None]:
Hyper_Img(IMG_PATH_COLORED + 'session_001_001_snapshot_cube.tiff')

In [None]:
Hyper_Img(IMG_PATH_COLORED + 'session_001_004_snapshot_cube.tiff')

In [None]:
Hyper_Img(IMG_PATH_COLORED + 'session_001_000_snapshot_cube.tiff')

In [None]:
Hyper_Img(IMG_PATH_COLORED + 'session_001_007_snapshot_cube.tiff')

In [None]:
Hyper_Img(IMG_PATH_WHITE + 'session_001_013_snapshot_cube.tiff')

In [None]:
Hyper_Img(IMG_PATH_WHITE + 'session_001_037_snapshot_cube.tiff')

Функция, возвращающая необходимые имена в директории

In [None]:
def all_tiff_cube_img(path: str) -> tp.List[str]:
    """
    Collect the paths of all hyperspectral cube files below ``path``.

    A file is kept when its last extension is ``tiff`` and the part of its
    name before the first dot ends with ``_cube`` (or is exactly ``cube``).

    Args:
        path: directory that is walked recursively.

    Returns:
        The matching file paths, sorted so that the order does not depend on
        the order in which the file system lists directory entries.
    """
    img_names: tp.List[str] = []
    for dirname, _, filenames in os.walk(path):
        for filename in filenames:
            parts = filename.split('.')
            if parts[-1] != 'tiff':
                continue
            if parts[0].split('_')[-1] != 'cube':
                continue
            img_names.append(os.path.join(dirname, filename))

    return sorted(img_names)

In [None]:
warnings.simplefilter('ignore')

считываем все необходимые изображения 

In [None]:
#hyper_imgs - list with all hyperspectral images
# Each cube is loaded and processed exactly once; images whose melanin flag is
# a negative status code (calibration / unknown / invalid) are dropped.
hyper_imgs: tp.List[Hyper_Img] = [img for img in map(Hyper_Img, all_tiff_cube_img(IMG_PATH))
                                   if img.melanin >= 0]

$\textbf{Графики медиан для каждого канала}$

In [None]:
def get_all_medians(hyper_imgs: tp.List['Hyper_Img'], step: int = 4,
                    begin_wave_len: int = 450) -> pd.DataFrame:
    """
    Create a long-format DataFrame of the per-channel medians for plotting.

    Args:
        hyper_imgs: images to include; samples with ``melanin == -1`` are skipped.
        step: wavelength distance between neighbouring channels.
        begin_wave_len: wavelength assigned to channel 0.

    Returns:
        DataFrame with one row per (sample, channel) and the columns
        'Wavelength', 'Median', 'Sample' (position of the sample in
        ``hyper_imgs``) and 'Pigment'.
    """
    points: tp.List[tp.List[tp.Any]] = []

    for sample_number, sample in enumerate(hyper_imgs):

        if sample.melanin == -1:
            continue

        # The wavelength axis follows the number of channels of the sample
        # instead of assuming a fixed channel count.
        wavelengths = np.arange(len(sample.medians)) * step + begin_wave_len
        for wavelength, median in zip(wavelengths, sample.medians):
            points.append([wavelength, median, sample_number, sample.pigment])

    return pd.DataFrame(points, columns = ['Wavelength', 'Median', 'Sample', 'Pigment'])

In [None]:
df_all_med = get_all_medians(hyper_imgs)
df_all_med.sample(7)

In [None]:
plt.figure(figsize=(15,10))
sns.lineplot(data=df_all_med, x='Wavelength', y='Median', hue='Pigment')

$\textbf{PCA}$

In [None]:
#Пайплайн
feature_pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])

In [None]:
def get_med_df(hyper_imgs: tp.List[Hyper_Img]) -> pd.DataFrame:
    """
    Build the wide table used for PCA.

    Each row holds the channel medians of one sample followed by its pigment
    label; the channel columns are named 450, 454, ... (138 columns in total)
    and the last column is 'Pigment'.
    """
    channel_columns = list(450 + 4 * np.arange(138))

    records = []
    for sample in hyper_imgs:
        record = list(sample.medians)
        record.append(sample.pigment)
        records.append(record)

    return pd.DataFrame(records, columns = channel_columns + ['Pigment'])

In [None]:
df = get_med_df(hyper_imgs)
df.sample(7)

In [None]:
X = df.drop(['Pigment'], axis = 1)
X.head()

In [None]:
y = df[['Pigment']]
y.head()

обучение и процент дисперсии для каждой компоненты

In [None]:
X = feature_pipe.fit_transform(X)
feature_pipe['pca'].explained_variance_ratio_

визуализация

In [None]:
# Pair the first two principal components of every sample with its pigment label.
# The triples are built straight from the rows: wrapping (ndarray, str) pairs in
# np.array() creates a ragged array, which NumPy >= 1.24 rejects with ValueError.
lst_of_value = [(components[0], components[1], pigment)
                for components, pigment in zip(X[:, :2], y['Pigment'])]
pd.DataFrame(lst_of_value, columns = ['1', '2', 'Pigment']).sample(7)

In [None]:
plt.figure(figsize=(15,10))
# The third column holds the pigment label, so it is named 'Pigment' here as in
# the rest of the notebook (it also becomes the legend title).
sns.scatterplot(data=pd.DataFrame(lst_of_value, 
                                  columns=['1', '2', 'Pigment']), x='1', y='2', hue='Pigment')

$\textbf{Классификация}$

In [None]:
def get_med_df_bin_classific(hyper_imgs: tp.List[Hyper_Img], class_name: str) -> pd.DataFrame:
    """
    Build the table for a binary classification task.

    Each row holds the channel medians of one sample followed by the target:
    1 when ``class_name`` occurs as a substring of the sample's pigment
    label, 0 otherwise. The channel columns are named 450, 454, ...
    (138 columns) and the target column is 'Pigment'.
    """
    records = []
    for sample in hyper_imgs:
        target = 1 if class_name in sample.pigment else 0
        records.append([*sample.medians, target])

    header = [*(np.arange(0, 138) * 4 + 450), 'Pigment']
    return pd.DataFrame(records, columns = header)

In [None]:
def get_med_df_multi_classific(hyper_imgs: tp.List[Hyper_Img]) -> pd.DataFrame:
    """
    Build the table for the four-class classification task.

    Samples with a negative ``melanin`` value are left out. The target column
    'Pigment' encodes: 0 - no pigment, 1 - melanin only, 2 - anthocyanins
    only, 3 - both. The channel columns are named 450, 454, ... (138 columns).
    """
    # (has melanin, has anthocyanins) -> class code
    class_codes = {
        (False, False): 0,
        (True, False): 1,
        (False, True): 2,
        (True, True): 3,
    }

    records = []
    for sample in hyper_imgs:
        if not sample.melanin >= 0:
            continue
        code = class_codes[(bool(sample.melanin), bool(sample.anthocyanins))]
        records.append(list(sample.medians) + [code])

    header = list(np.arange(0, 138) * 4 + 450) + ['Pigment']
    return pd.DataFrame(records, columns = header)

Стратифицированная кросс-валидация

In [None]:
cv = StratifiedKFold(n_splits=3)

$\textbf{Классификация по наличию меланина}$

информация о данных

In [None]:
df = get_med_df_bin_classific(hyper_imgs, 'melanin')
df.sample(7)

количество образцов каждого класса

In [None]:
df_count = df.groupby('Pigment', as_index=True).count()[[450]].rename(columns={450:'Count'}).reset_index()
df_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=df_count, x='Pigment', y='Count')

разделение выборки на обучающую и тестовую

In [None]:
X = df.drop(['Pigment'], axis = 1)
X.head()

In [None]:
y = df[['Pigment']]
y.head()

In [None]:
# A fixed random_state keeps the split, and the metrics reported below, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
test_count = y_test.reset_index().groupby('Pigment').count().reset_index().rename(columns={'index':'Count'})
test_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=test_count, x='Pigment', y='Count')

In [None]:
train_count = y_train.reset_index().groupby('Pigment').count().reset_index().rename(columns={'index':'Count'})
train_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=train_count, x='Pigment', y='Count')

Логистическая регрессия

In [None]:
logistic_regr_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('logistic_regr', LogisticRegressionCV(cv=cv))])

In [None]:
X_train = logistic_regr_pipe['feature'].fit_transform(X_train)
X_test = logistic_regr_pipe['feature'].transform(X_test)

In [None]:
logistic_regr_pipe['logistic_regr'].fit(X_train, y_train)

In [None]:
logistic_regr_pipe['logistic_regr'].intercept_

Random Forest

In [None]:
# An integer min_samples_split must be at least 2; with the value 1 every such
# grid candidate fails to fit and is scored as NaN by GridSearchCV.
parameters_random_forest = { 'max_depth':[2, 7], 'min_samples_split': [2, 5], 
                            'min_samples_leaf': [1, 5]}

In [None]:
clf_forest = GridSearchCV(RandomForestClassifier(n_estimators=150), parameters_random_forest, cv=cv)
clf_forest.fit(X_train, y_train)

In [None]:
clf_forest.best_params_

In [None]:
random_forest_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('random_forest', clf_forest.best_estimator_)])

Градиентный бустинг

In [None]:
# Learning rates 1, 1/2, ..., 1/512.
# NOTE(review): the previous grid was 2**0 .. 2**9 (up to 512), which looks like
# a sign error in the exponent - confirm that shrinking rates were intended.
parameters_boosting = { 'learning_rate':np.power(2.0, -np.arange(10))}

In [None]:
clf_boosting = GridSearchCV(GradientBoostingClassifier(n_estimators=150), parameters_boosting, cv=cv)
clf_boosting.fit(X_train, y_train)

In [None]:
clf_boosting.best_params_

In [None]:
gradient_boosting_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('gradient_boosting', clf_boosting.best_estimator_)])

Результаты

таблица с результатами моделей по основным метрикам

In [None]:
def get_metrics(models: tp.List[sklearn.base.BaseEstimator], models_name: tp.List[str],
                X: np.array, y: np.array) -> pd.DataFrame:
    """
    Score fitted binary classifiers on one data set.

    Args:
        models: fitted estimators; each is asked to ``predict(X)``.
        models_name: display names, paired with ``models`` by position.
        X: feature matrix passed to every model.
        y: true labels.

    Returns:
        DataFrame with one row per model and the columns
        'model', 'accuracy', 'f1', 'precision', 'recall'.
    """
    scorers = (accuracy_score, f1_score, precision_score, recall_score)

    rows = []
    for name, model in zip(models_name, models):
        predicted = model.predict(X)
        rows.append((name, *(scorer(y, predicted) for scorer in scorers)))

    return pd.DataFrame(rows, columns = ['model', 'accuracy', 'f1', 'precision', 'recall'])

Результаты на тренировочной выборке

In [None]:
get_metrics([logistic_regr_pipe['logistic_regr'], random_forest_pipe['random_forest'], 
             gradient_boosting_pipe['gradient_boosting']], 
             ['Логистическая регрессия', 'Random Forest', 'Градиентный бустинг'], X_train, y_train)

Результаты на тестовой выборке

In [None]:
get_metrics([logistic_regr_pipe['logistic_regr'], random_forest_pipe['random_forest'], 
             gradient_boosting_pipe['gradient_boosting']], 
             ['Логистическая регрессия', 'Random Forest', 'Градиентный бустинг'], X_test, y_test)

$\textbf{Классификация по наличию антоцианов}$

In [None]:
df = get_med_df_bin_classific(hyper_imgs, 'anthocyanins')
df.sample(7)

количество образцов каждого класса

In [None]:
df_count = df.groupby('Pigment', as_index=True).count()[[450]].rename(columns={450:'Count'}).reset_index()
df_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=df_count, x='Pigment', y='Count')

разделение выборки на обучающую и тестовую

In [None]:
X = df.drop(['Pigment'], axis = 1)
X.head()

In [None]:
y = df[['Pigment']]
y.head()

In [None]:
# A fixed random_state keeps the split, and the metrics reported below, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
test_count = y_test.reset_index().groupby('Pigment').count().reset_index().rename(columns={'index':'Count'})
test_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=test_count, x='Pigment', y='Count')

In [None]:
train_count = y_train.reset_index().groupby('Pigment').count().reset_index().rename(columns={'index':'Count'})
train_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=train_count, x='Pigment', y='Count')

Логистическая регрессия

In [None]:
logistic_regr_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('logistic_regr', LogisticRegressionCV(cv=cv))])

In [None]:
X_train = logistic_regr_pipe['feature'].fit_transform(X_train)
X_test = logistic_regr_pipe['feature'].transform(X_test)

In [None]:
logistic_regr_pipe['logistic_regr'].fit(X_train, y_train)

In [None]:
logistic_regr_pipe['logistic_regr'].intercept_

Random Forest

In [None]:
# An integer min_samples_split must be at least 2; with the value 1 every such
# grid candidate fails to fit and is scored as NaN by GridSearchCV.
parameters_random_forest = { 'max_depth':[2, 5], 'min_samples_split': [2, 5], 
                            'min_samples_leaf': [1, 5]}

In [None]:
clf_forest = GridSearchCV(RandomForestClassifier(n_estimators=150), parameters_random_forest, cv=cv)
clf_forest.fit(X_train, y_train)

In [None]:
clf_forest.best_params_

In [None]:
random_forest_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('random_forest', clf_forest.best_estimator_)])

Градиентный бустинг

In [None]:
# Learning rates 1, 1/2, ..., 1/512.
# NOTE(review): the previous grid was 2**0 .. 2**9 (up to 512), which looks like
# a sign error in the exponent - confirm that shrinking rates were intended.
parameters_boosting = { 'learning_rate':np.power(2.0, -np.arange(10))}

In [None]:
clf_boosting = GridSearchCV(GradientBoostingClassifier(n_estimators=150), parameters_boosting, cv=cv)
clf_boosting.fit(X_train, y_train)

In [None]:
clf_boosting.best_params_

In [None]:
gradient_boosting_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('gradient_boosting', clf_boosting.best_estimator_)])

Результаты

Результаты на тренировочной выборке

In [None]:
get_metrics([logistic_regr_pipe['logistic_regr'], random_forest_pipe['random_forest'], 
             gradient_boosting_pipe['gradient_boosting']], 
             ['Логистическая регрессия', 'Random Forest', 'Градиентный бустинг'], X_train, y_train)

Результаты на тестовой выборке

In [None]:
get_metrics([logistic_regr_pipe['logistic_regr'], random_forest_pipe['random_forest'], 
             gradient_boosting_pipe['gradient_boosting']], 
             ['Логистическая регрессия', 'Random Forest', 'Градиентный бустинг'], X_test, y_test)

$\textbf{Классификация по наличию пигмента}$

информация о данных

In [None]:
df = get_med_df_bin_classific(hyper_imgs, 'white')
df.sample(7)

количество образцов каждого класса

In [None]:
df_count = df.groupby('Pigment', as_index=True).count()[[450]].rename(columns={450:'Count'}).reset_index()
df_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=df_count, x='Pigment', y='Count')

разделение выборки на обучающую и тестовую

In [None]:
X = df.drop(['Pigment'], axis = 1)
X.head()

In [None]:
y = df[['Pigment']]
y.head()

In [None]:
# A fixed random_state keeps the split, and the metrics reported below, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
test_count = y_test.reset_index().groupby('Pigment').count().reset_index().rename(columns={'index':'Count'})
test_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=test_count, x='Pigment', y='Count')

In [None]:
train_count = y_train.reset_index().groupby('Pigment').count().reset_index().rename(columns={'index':'Count'})
train_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=train_count, x='Pigment', y='Count')

Логистическая регрессия

In [None]:
logistic_regr_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('logistic_regr', LogisticRegressionCV(cv=cv))])

In [None]:
X_train = logistic_regr_pipe['feature'].fit_transform(X_train)
X_test = logistic_regr_pipe['feature'].transform(X_test)

In [None]:
logistic_regr_pipe['logistic_regr'].fit(X_train, y_train)

In [None]:
logistic_regr_pipe['logistic_regr'].intercept_

Random Forest

In [None]:
# An integer min_samples_split must be at least 2; with the value 1 every such
# grid candidate fails to fit and is scored as NaN by GridSearchCV.
parameters_random_forest = { 'max_depth':[2, 7], 'min_samples_split': [2, 5], 
                            'min_samples_leaf': [1, 5]}

In [None]:
clf_forest = GridSearchCV(RandomForestClassifier(n_estimators=150), parameters_random_forest, cv=cv)
clf_forest.fit(X_train, y_train)

In [None]:
clf_forest.best_params_

In [None]:
random_forest_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('random_forest', clf_forest.best_estimator_)])

Градиентный бустинг

In [None]:
# Learning rates 1, 1/2, ..., 1/512.
# NOTE(review): the previous grid was 2**0 .. 2**9 (up to 512), which looks like
# a sign error in the exponent - confirm that shrinking rates were intended.
parameters_boosting = { 'learning_rate':np.power(2.0, -np.arange(10))}

In [None]:
clf_boosting = GridSearchCV(GradientBoostingClassifier(n_estimators=150), parameters_boosting, cv=cv)
clf_boosting.fit(X_train, y_train)

In [None]:
clf_boosting.best_params_

In [None]:
gradient_boosting_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('gradient_boosting', clf_boosting.best_estimator_)])

Результаты

Результаты на тренировочной выборке

In [None]:
get_metrics([logistic_regr_pipe['logistic_regr'], random_forest_pipe['random_forest'], 
             gradient_boosting_pipe['gradient_boosting']], 
             ['Логистическая регрессия', 'Random Forest', 'Градиентный бустинг'], X_train, y_train)

Результаты на тестовой выборке

In [None]:
get_metrics([logistic_regr_pipe['logistic_regr'], random_forest_pipe['random_forest'], 
             gradient_boosting_pipe['gradient_boosting']], 
             ['Логистическая регрессия', 'Random Forest', 'Градиентный бустинг'], X_test, y_test)

$\textbf{Мультиклассовая классификация}$

In [None]:
df = get_med_df_multi_classific(hyper_imgs)
df.sample(7)

In [None]:
df_count = df.groupby(['Pigment'], as_index=True).count()[[450]].rename(columns={450:'Count'}).reset_index()
df_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=df_count, x='Pigment', y='Count')

разделение выборки на обучающую и тестовую

In [None]:
X = df.drop(['Pigment'], axis = 1)
X.head()

In [None]:
y = df[['Pigment']]
y.sample(5)

In [None]:
# A fixed random_state keeps the split, and the metrics reported below, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
test_count = y_test.reset_index().groupby('Pigment').count().reset_index().rename(columns={'index':'Count'})
test_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=test_count, x='Pigment', y='Count')

In [None]:
train_count = y_train.reset_index().groupby(['Pigment']).count().reset_index().rename(columns={'index':'Count'})
train_count

In [None]:
plt.figure(figsize=(9,9))
sns.barplot(data=train_count, x='Pigment', y='Count')

Логистическая регрессия

In [None]:
logistic_regr_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('logistic_regr', LogisticRegressionCV(cv=cv))])

In [None]:
X_train = logistic_regr_pipe['feature'].fit_transform(X_train)
X_test = logistic_regr_pipe['feature'].transform(X_test)

In [None]:
logistic_regr_pipe['logistic_regr'].fit(X_train, y_train)

In [None]:
logistic_regr_pipe['logistic_regr'].intercept_

In [None]:
# Passing the full label set keeps the matrix 4x4 even when a class is missing
# from both y_test and the predictions; otherwise the DataFrame below fails.
conf_matrix = confusion_matrix(y_test, logistic_regr_pipe['logistic_regr'].predict(X_test),
                               labels=[0, 1, 2, 3])
conf_matrix = pd.DataFrame(conf_matrix, columns=[0, 1, 2, 3])
conf_matrix

In [None]:
plt.figure(figsize=(9,7))
sns.heatmap(conf_matrix, annot=True)

Random Forest

In [None]:
parameters_random_forest = { 'max_depth':[2, 5], 'min_samples_leaf': [1, 5]}

In [None]:
clf_forest = GridSearchCV(RandomForestClassifier(n_estimators=150), 
                          parameters_random_forest, cv=cv)
clf_forest.fit(X_train, y_train)

In [None]:
clf_forest.best_params_

In [None]:
random_forest_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('random_forest', clf_forest.best_estimator_)])

In [None]:
 random_forest_pipe['random_forest'].n_classes_

In [None]:
# Passing the full label set keeps the matrix 4x4 even when a class is missing
# from both y_test and the predictions; otherwise the DataFrame below fails.
conf_matrix = confusion_matrix(y_test, random_forest_pipe['random_forest'].predict(X_test),
                               labels=[0, 1, 2, 3])
conf_matrix = pd.DataFrame(conf_matrix, columns=[0, 1, 2, 3])
conf_matrix

In [None]:
plt.figure(figsize=(9,7))
sns.heatmap(conf_matrix, annot=True)

Градиентный бустинг

In [None]:
# Learning rates 1, 1/2, ..., 1/512.
# NOTE(review): the previous grid was 2**0 .. 2**9 (up to 512), which looks like
# a sign error in the exponent - confirm that shrinking rates were intended.
parameters_boosting = { 'learning_rate':np.power(2.0, -np.arange(10))}

In [None]:
clf_boosting = GridSearchCV(GradientBoostingClassifier(n_estimators=150), parameters_boosting, cv=cv)
clf_boosting.fit(X_train, y_train)

In [None]:
clf_boosting.best_params_

In [None]:
gradient_boosting_pipe = Pipeline([('feature', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=5))])),
                               ('gradient_boosting', clf_boosting.best_estimator_)])

In [None]:
 gradient_boosting_pipe['gradient_boosting'].n_classes_

In [None]:
# Passing the full label set keeps the matrix 4x4 even when a class is missing
# from both y_test and the predictions; otherwise the DataFrame below fails.
conf_matrix = confusion_matrix(y_test, gradient_boosting_pipe['gradient_boosting'].predict(X_test),
                               labels=[0, 1, 2, 3])
conf_matrix = pd.DataFrame(conf_matrix, columns=[0, 1, 2, 3])
conf_matrix

In [None]:
plt.figure(figsize=(9,7))
sns.heatmap(conf_matrix, annot=True)

Результаты

In [None]:
def muticlass_predict_to_multilabel(y: np.array) -> tp.Tuple[np.array, np.array]:
    """
    Split class codes 0..3 into two binary indicator vectors.

    The first vector is 1 for the codes 1 and 3, the second is 1 for the
    codes 2 and 3; any other code gives 0 in both. Both vectors are float
    arrays with one entry per element of ``y``.
    """
    size = len(y)
    first_flags = np.zeros(size)
    second_flags = np.zeros(size)

    for position, code in enumerate(y):
        if code == 3:
            first_flags[position] = second_flags[position] = 1
            continue
        if code == 2:
            second_flags[position] = 1
            continue
        if code == 1:
            first_flags[position] = 1

    return first_flags, second_flags

In [None]:
def get_average_metrics(models: tp.List[sklearn.base.BaseEstimator], models_name: tp.List[str],
                X: np.array, y: np.array) -> pd.DataFrame:
    """
    Score multi-class models as two binary tasks and average the results.

    True and predicted class codes are split into two indicator vectors with
    ``muticlass_predict_to_multilabel``; every metric is computed for each
    indicator separately and the two values are averaged.

    Returns:
        DataFrame with one row per model and the columns
        'model', 'accuracy', 'f1', 'precision', 'recall'.
    """
    true_labels = muticlass_predict_to_multilabel(y)
    predicted_labels = [muticlass_predict_to_multilabel(model.predict(X)) for model in models]

    def averaged(metric) -> tp.List[float]:
        # Mean of the metric over the indicator vectors, one value per model.
        return [sum(metric(true_labels[i], predicted) for i, predicted in enumerate(per_model))
                / len(true_labels)
                for per_model in predicted_labels]

    accuracy = averaged(accuracy_score)
    f1 = averaged(f1_score)
    precision = averaged(precision_score)
    recall = averaged(recall_score)

    return pd.DataFrame(zip(models_name, accuracy, f1, precision, recall), 
                        columns = ['model', 'accuracy', 'f1', 'precision', 'recall'])

Результаты на тренировочной выборке

In [None]:
get_average_metrics([logistic_regr_pipe['logistic_regr'], random_forest_pipe['random_forest'], 
             gradient_boosting_pipe['gradient_boosting']], 
             ['Логистическая регрессия', 'Random Forest', 'Градиентный бустинг'], X_train, y_train['Pigment'])

Результаты на тестовой выборке

In [None]:
get_average_metrics([logistic_regr_pipe['logistic_regr'], random_forest_pipe['random_forest'], 
             gradient_boosting_pipe['gradient_boosting']], 
             ['Логистическая регрессия', 'Random Forest', 'Градиентный бустинг'], X_test, y_test['Pigment'])