### Подключение библиотек и загрузка датасета.

In [38]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import qualitative

In [39]:
FILE_PATH = "http://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"

df = pd.read_csv(FILE_PATH)

### Исследование данных

In [40]:
display(df.head())

df.info()

display(df.describe())

print(f"\nКоличество пропущенных значений: {df.isnull().sum().sum()}")

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,...,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,154.228641,197.104918,116.324631,0.00622,4.4e-05,0.003306,0.003446,0.00992,0.029709,0.282251,...,0.046993,0.024847,21.885974,0.753846,0.498536,0.718099,-5.684397,0.22651,2.381826,0.206552
std,41.390065,91.491548,43.521413,0.004848,3.5e-05,0.002968,0.002759,0.008903,0.018857,0.194877,...,0.030459,0.040418,4.425764,0.431878,0.103942,0.055336,1.090208,0.083406,0.382799,0.090119
min,88.333,102.145,65.476,0.00168,7e-06,0.00068,0.00092,0.00204,0.00954,0.085,...,0.01364,0.00065,8.441,0.0,0.25657,0.574282,-7.964984,0.006274,1.423287,0.044539
25%,117.572,134.8625,84.291,0.00346,2e-05,0.00166,0.00186,0.004985,0.016505,0.1485,...,0.024735,0.005925,19.198,1.0,0.421306,0.674758,-6.450096,0.174351,2.099125,0.137451
50%,148.79,175.829,104.315,0.00494,3e-05,0.0025,0.00269,0.00749,0.02297,0.221,...,0.03836,0.01166,22.085,1.0,0.495954,0.722254,-5.720868,0.218885,2.361532,0.194052
75%,182.769,224.2055,140.0185,0.007365,6e-05,0.003835,0.003955,0.011505,0.037885,0.35,...,0.060795,0.02564,25.0755,1.0,0.587562,0.761881,-5.046192,0.279234,2.636456,0.25298
max,260.105,592.03,239.17,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,...,0.16942,0.31482,33.047,1.0,0.685151,0.825288,-2.434031,0.450493,3.671155,0.527367



Количество пропущенных значений: 0


In [41]:
status_counts = df['status'].value_counts()

fig = go.Figure(data=[go.Pie(
    labels=['Болен (1)', 'Здоров (0)'],
    values=status_counts.values,
    hole=.4,
    marker_colors=['#EF553B', '#636EFA'],
    pull=[0.05, 0]
)])

fig.update_layout(
    title_text='<b>Распределение пациентов</b>',
    title_x=0.5,
    font=dict(family="Arial, sans-serif", size=14),
    legend_title_text='Статус',
    width=600
)

fig.show()

Наблюдается дисбаланс классов. Больных почти в 3 раза больше чем здоровых.

In [42]:
features_to_plot = ['Jitter:DDP', 'Shimmer:APQ5', 'MDVP:Fo(Hz)', 'HNR']

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[f"<b>{f}</b>" for f in features_to_plot]
)

colors = qualitative.Vivid

for i, feature in enumerate(features_to_plot):
    row, col = (i // 2) + 1, (i % 2) + 1
    fig.add_trace(
        go.Histogram(
            x=df[feature],
            name=feature,
            marker_color=colors[i]
        ),
        row=row,
        col=col
    )

fig.update_layout(
    title_text="<b>Распределение ключевых признаков</b>",
    title_x=0.5,
    showlegend=False,
    height=600,
    width=900,
    font=dict(family="Arial, sans-serif", size=12)
)

fig.show()

Признаки имеют разный масштаб. Необходима нормализация данных.

In [43]:
corr_matrix = df.drop('name', axis=1).corr()

fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='RdBu_r',
    zmin=-1, zmax=1
))

fig.update_layout(
    title='<b>Корреляции признаков</b>',
    title_x=0.5,
    width=1000,
    height=1000,
    xaxis_tickangle=-45
)

fig.show()

Многие признаки каррелируют друг с другом. Например, различные виды Jitter и Shimmer.
spread1, spread2 и PPE также показывают заментную корреляцию.

### Нормализация данных. Разделение выборки.

In [44]:
X = df.drop(['name', 'status'], axis=1)
y = df['status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [45]:
neg_count = y_train.value_counts()[0]
pos_count = y_train.value_counts()[1]

scale_pos_weight = neg_count / pos_count

### Создание и обучение модели.

In [46]:
model_xgb = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_estimators=300,
    learning_rate=0.1962901820525862,
    subsample=0.9537725990269001,
    colsample_bytree=0.7338995278165954,
    gamma=0.2593377070905543,
    reg_lambda=0.1974641704725266,
)

model_xgb.fit(X_train_scaled, y_train)
print("Модель обучена!")

Модель обучена!


In [47]:
def create_evaluation_dashboard(model, X_test, y_test, target_names=['Здоров (0)', 'Паркинсон (1)']):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\nТочность модели на тестовой выборке: {accuracy*100:.2f}%")
    print("\nОтчет о классификации:")
    print(classification_report(y_test, y_pred, target_names=target_names))

    cm = confusion_matrix(y_test, y_pred)

    fig = make_subplots(
        rows=1,
        cols=2,
        column_widths=[0.55, 0.45],
        specs=[[{'type': 'heatmap'}, {'type': 'domain'}]],
        subplot_titles=("<b>Матрица ошибок</b>", "<b>Точность модели</b>")
    )

    labels = ['Истинный Негатив', 'Ложный Позитив', 'Ложный Негатив', 'Истинный Позитив']
    counts = cm.flatten()
    annotations_text = [f'{label}<br>{count}' for label, count in zip(labels, counts)]
    annotations_text = np.array(annotations_text).reshape(2, 2)

    heatmap = go.Heatmap(
        z=cm,
        x=[f'Предсказано: {name}' for name in target_names],
        y=[f'Реальность: {name}' for name in target_names],
        text=annotations_text,
        texttemplate="%{text}",
        colorscale='Blues',
        showscale=False
    )
    fig.add_trace(heatmap, row=1, col=1)

    indicator = go.Indicator(
        mode="gauge+number",
        value=accuracy * 100,
        number={'suffix': "%", 'font': {'size': 38}},
        gauge={
            'axis': {'range': [80, 100]},
            'bar': {'color': "rgba(0, 0, 128, 0.7)"},
            'steps': [
                {'range': [0, 90], 'color': '#EA4335'},
                {'range': [90, 95], 'color': '#FBBC05'},
                {'range': [95, 100], 'color': '#34A853'}
            ],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.9,
                'value': 90
            }
        }
    )
    fig.add_trace(indicator, row=1, col=2)

    fig.update_layout(
        title_text="<b>Оценка производительности</b>",
        title_x=0.5,
        font=dict(family="Arial, sans-serif", size=14, color="black"),
        height=600,
        width=1200
    )
    fig.update_xaxes(title_text="Предсказанный класс", row=1, col=1)
    fig.update_yaxes(title_text="Истинный класс", row=1, col=1)

    return fig


evaluation_fig = create_evaluation_dashboard(model_xgb, X_test_scaled, y_test)

evaluation_fig.show()


Точность модели на тестовой выборке: 97.44%

Отчет о классификации:
               precision    recall  f1-score   support

   Здоров (0)       0.91      1.00      0.95        10
Паркинсон (1)       1.00      0.97      0.98        29

     accuracy                           0.97        39
    macro avg       0.95      0.98      0.97        39
 weighted avg       0.98      0.97      0.97        39



### Сохранение модели

In [48]:
import joblib

joblib.dump(model_xgb, 'is_parkinsons_model.joblib')

['is_parkinsons_model.joblib']