In [88]:
import numpy as np
import pandas as pd
import time
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


In [89]:
file_path = "dataset/wine.data"

columns = [
    "Class", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium",
    "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins",
    "Color intensity", "Hue", "OD280/OD315", "Proline"
]

# Загружаем датасет
df = pd.read_csv(file_path, header=None, names=columns)


X = df.iloc[:, 1:] 
y = df.iloc[:, 0]   



In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Параметры для GridSearch
# Param_grid в контексте библиотеки scikit-learn — словарь, задающий сетку параметров для поиска.
# Этот параметр используется в инструменте GridSearchCV,
# который позволяет выполнять исчерпывающий поиск по
# заданным значениям параметров для оценщика.
param_grid = {
    'DecisionTree': {'classifier__max_depth': [3, 5, 7, 13]},
    'RandomForest': {'classifier__n_estimators': [50, 100], 'classifier__max_depth': [5, 10]},
    'NaiveBayes': {'classifier__var_smoothing': [1e-9]},
    'SVM': {'classifier__C': [0.1, 1, 10], 'classifier__kernel': ['linear', 'rbf']}
}


In [91]:

# Модели и пайплайны
models = {
    'DecisionTree': Pipeline([('scaler', StandardScaler()), ('classifier', DecisionTreeClassifier())]),
    'RandomForest': Pipeline([('scaler', StandardScaler()), ('classifier', RandomForestClassifier())]),
    'NaiveBayes': Pipeline([('scaler', StandardScaler()), ('classifier', GaussianNB())]),
    'SVM': Pipeline([('scaler', StandardScaler()), ('classifier', SVC())])
}




In [92]:
# Сравнение методов в исходном пространстве
results = []
for model_name in models:
    start = time.time()
    grid = GridSearchCV(models[model_name], param_grid[model_name], cv=10)
    grid.fit(X_train, y_train)
    train_time = time.time() - start

    start = time.time()
    y_pred = grid.predict(X_test)
    predict_time = time.time() - start

    results.append({
        'Model': model_name,
        'Best Params': grid.best_params_,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Train Time': train_time,
        'Predict Time': predict_time
    })


In [93]:
# Сравнение с разной размерностью через PCA
dims = [2,5,10]
pca_results = []
for dim in dims:
    pca = PCA(n_components=dim)
    X_pca = pca.fit_transform(StandardScaler().fit_transform(X))
    X_train_pca, X_test_pca, y_train_pca, y_test_pca = \
        train_test_split(X_pca, y, test_size=0.3, random_state=42)

    for model_name in models:
        model = models[model_name]  # Используем весь пайплайн
        params = param_grid[model_name]
        grid = GridSearchCV(model, params, cv=10)

        start = time.time()
        grid.fit(X_train_pca, y_train_pca)
        train_time = time.time() - start

        start = time.time()
        y_pred = grid.predict(X_test_pca)
        predict_time = time.time() - start

        pca_results.append({
            'Dimension': dim,
            'Model': model_name,
            'Accuracy': accuracy_score(y_test_pca, y_pred),
            'Train Time': train_time,
            'Predict Time': predict_time
        })

# Вывод результатов
pd.set_option('display.max_columns', None)    # Снимаем ограничения с вывода
pd.set_option('display.max_colwidth', None)   # Снимаем ограничения с вывода

print("Результаты в исходном пространстве:")
print(pd.DataFrame(results))

print("\nРезультаты с PCA:")
print(pd.DataFrame(pca_results))



Результаты в исходном пространстве:
          Model                                                   Best Params  \
0  DecisionTree                                  {'classifier__max_depth': 3}   
1  RandomForest  {'classifier__max_depth': 5, 'classifier__n_estimators': 50}   
2    NaiveBayes                          {'classifier__var_smoothing': 1e-09}   
3           SVM          {'classifier__C': 1, 'classifier__kernel': 'linear'}   

   Accuracy  Train Time  Predict Time  
0  0.962963    0.198278      0.002301  
1  1.000000    2.457509      0.003444  
2  1.000000    0.039218      0.001673  
3  0.981481    0.215555      0.013025  

Результаты с PCA:
    Dimension         Model  Accuracy  Train Time  Predict Time
0           2  DecisionTree  0.981481    0.103575      0.000000
1           2  RandomForest  0.981481    2.269911      0.002006
2           2    NaiveBayes  0.981481    0.011958      0.000000
3           2           SVM  0.981481    0.142569      0.000000
4           5  Deci

In [94]:
# hyperparam_results_df = pd.DataFrame(hyperparam_results).T
# dims = [10,20,30]
# pca_results = []
# for dim in dims:
#     pca = PCA(n_components=dim)
#     X_pca = pca.fit_transform(StandardScaler().fit_transform(X))
#     X_train_pca, X_test_pca, y_train_pca, y_test_pca = \
#     train_test_split(X_pca, y, test_size = 0.3, random_state = 42)

#     for model_name in models:
#         model = models[model_name]
#         params = param_grid[model_name]
#         grid = GridSearchCV(model, params, cv = 10)

#         start = time.time()
#         grid.fit(X_train_pca, y_train_pca)
#         train_time = time.time() - start

#         start = time.time()
#         y_pred = grid.predict(X_test_pca)
#         predict_time = time.time() - start

#         pca_results.append({
#             'Размерность': dim,
#             'Метод': model_name,
#             'Точность': accuracy_score(y_test_pca, y_pred),
#             'Время': train_time,
#             'Предикт': predict_time,
#         })