<a href="https://colab.research.google.com/github/isaevadaryna/Machine-Learning/blob/main/%D0%9B%D0%B0%D0%B1_4%2C_%D0%86%D1%81%D0%B0%D1%94%D0%B2%D0%B0_%D0%94_%D0%9E_%2C_%D0%97%D0%B0%D0%B2%D0%B4%D0%B0%D0%BD%D0%BD%D1%8F_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

sns.set(style='whitegrid')
RANDOM_STATE = 42

In [2]:
data = load_breast_cancer(as_frame=True)
df_bc = data.frame.copy()
X = df_bc.drop(columns=['target'])
y = df_bc['target']

In [3]:
print("Перші 5 рядків:\n", df_bc.head())
print("\nНазви колонок:\n", df_bc.columns.tolist())
print("\nТипи даних:\n", df_bc.dtypes)
print("\nПропуски:\n", df_bc.isnull().sum())
print("\nРозмір даних:", df_bc.shape)

Перші 5 рядків:
    mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
print("X_train.shape:", X_train.shape, "X_test.shape:", X_test.shape)

X_train.shape: (455, 30) X_test.shape: (114, 30)


In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
lr = LogisticRegression(max_iter=10000)
lr_params = {'C':[0.01,0.1,1,10,100], 'solver':['liblinear','lbfgs']}
lr_grid = GridSearchCV(lr, lr_params, cv=5, scoring='accuracy')
lr_grid.fit(X_train_scaled, y_train)
y_pred_lr = lr_grid.predict(X_test_scaled)
print("Logistic Regression best params:", lr_grid.best_params_)

Logistic Regression best params: {'C': 0.1, 'solver': 'liblinear'}


In [8]:
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_params = {'max_depth':[3,5,7,10,None], 'min_samples_split':[2,5,10]}
dt_grid = GridSearchCV(dt, dt_params, cv=5, scoring='accuracy')
dt_grid.fit(X_train_scaled, y_train)
y_pred_dt = dt_grid.predict(X_test_scaled)
print("Decision Tree best params:", dt_grid.best_params_)

Decision Tree best params: {'max_depth': 3, 'min_samples_split': 2}


In [9]:
rf = RandomForestClassifier(random_state=RANDOM_STATE)
rf_params = {'n_estimators':[50,100,200], 'max_depth':[5,10,None], 'min_samples_split':[2,5,10]}
rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring='accuracy')
rf_grid.fit(X_train_scaled, y_train)
y_pred_rf = rf_grid.predict(X_test_scaled)
print("Random Forest best params:", rf_grid.best_params_)

Random Forest best params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}


In [10]:
def evaluate_classification(y_true, y_pred, name):
    print(f"\n{name} Accuracy: {accuracy_score(y_true, y_pred):.3f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))

evaluate_classification(y_test, y_pred_lr, "Logistic Regression")
evaluate_classification(y_test, y_pred_dt, "Decision Tree")
evaluate_classification(y_test, y_pred_rf, "Random Forest")


Logistic Regression Accuracy: 0.991
Confusion Matrix:
 [[42  1]
 [ 0 71]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.99      1.00      0.99        71

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114


Decision Tree Accuracy: 0.947
Confusion Matrix:
 [[39  4]
 [ 2 69]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.91      0.93        43
           1       0.95      0.97      0.96        71

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


Random Forest Accuracy: 0.965
Confusion Matrix:
 [[40  3]
 [ 1 70]]
Classification Report:
               precision    recall  f1-score   support

      

In [11]:
best_model = rf_grid  # припустимо, що Random Forest найкращий
y_pred_best = best_model.predict(X_test_scaled)

results_df = pd.DataFrame({
    'True': y_test.values,
    'Predicted': y_pred_best
})
print(results_df.head(20))

    True  Predicted
0      1          1
1      0          0
2      0          0
3      1          1
4      1          1
5      0          0
6      0          0
7      0          0
8      1          0
9      1          1
10     1          1
11     0          0
12     1          1
13     0          0
14     1          1
15     0          0
16     1          1
17     1          1
18     1          1
19     0          0
