<a href="https://colab.research.google.com/github/isaevadaryna/Machine-Learning/blob/main/%D0%9B%D0%B0%D0%B1_4%2C_%D0%86%D1%81%D0%B0%D1%94%D0%B2%D0%B0_%D0%94_%D0%9E_%2C_%D0%97%D0%B0%D0%B2%D0%B4%D0%B0%D0%BD%D0%BD%D1%8F_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Seed passed as random_state to the train/test split and to estimators below,
# so that re-running the notebook reproduces the same split and models.
RANDOM_STATE = 42

In [4]:
# Read the Titanic sheet through Google Sheets' CSV export endpoint, then print
# a quick structural overview: preview, dtypes, missing-value counts and shape.
sheet_url = "https://docs.google.com/spreadsheets/d/1UC_7oeJsAVItfwdqfey267eLSBx7MsiiXSr_bYTHokU/export?format=csv"
df_titanic = pd.read_csv(sheet_url)

overview = (
    ("Перші 5 рядків:\n", df_titanic.head()),
    ("\nТипи даних:\n", df_titanic.dtypes),
    ("\nПропуски:\n", df_titanic.isnull().sum()),
    ("\nРозмір даних:", df_titanic.shape),
)
for heading, summary in overview:
    print(heading, summary)

Перші 5 рядків:
    PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  

Типи даних:
 PassengerId      int64
Survived         int64
Pclass   

In [5]:
# --- Missing values ---
# Continuous features are imputed with their median. 'Fare' is included because
# it is one of the selected features below: StandardScaler passes NaN through
# unchanged, so an un-imputed value would reach the models as NaN.
# NOTE(review): the medians/mode are computed on all rows, before the
# train/test split, so the hold-out rows influence the imputed values.
for numeric_col in ('Age', 'Fare'):
    df_titanic[numeric_col] = df_titanic[numeric_col].fillna(df_titanic[numeric_col].median())
df_titanic['Embarked'] = df_titanic['Embarked'].fillna(df_titanic['Embarked'].mode()[0])

# --- Categorical -> numeric ---
# LabelEncoder assigns integer codes following the sorted order of the labels.
df_titanic['Sex'] = LabelEncoder().fit_transform(df_titanic['Sex'])
df_titanic['Embarked'] = LabelEncoder().fit_transform(df_titanic['Embarked'])

# --- Feature matrix and target ---
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
target = 'Survived'
X = df_titanic[features]
y = df_titanic[target]

# --- Scaling ---
# NOTE(review): this scaler is fitted on every row, i.e. before the train/test
# split. The training cell fits a fresh StandardScaler on the training rows
# only, which is the one the models actually rely on.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Hold out 20% of the rows for the final evaluation; the seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [8]:
# Wrap the split arrays in DataFrames so the columns carry the feature names.
X_train = pd.DataFrame(X_train, columns=features)
X_test = pd.DataFrame(X_test, columns=features)

# Safety net for any NaN that survived preprocessing. The inputs were already
# standardized before the split, so 0 corresponds to the per-column mean.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Fit the scaler on the training rows only and reuse it for the test rows.
# NOTE(review): the scaler is fitted outside GridSearchCV, so each validation
# fold contributes to the scaling statistics; wrap scaler + model in a Pipeline
# if strictly unbiased CV scores are needed.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One fold count for every model so that their CV scores are comparable.
CV_FOLDS = 5


def run_grid_search(estimator, param_grid, X_fit, y_fit, X_eval, cv=CV_FOLDS):
    """Tune ``estimator`` with an accuracy-scored grid search and predict.

    Parameters
    ----------
    estimator : scikit-learn classifier to tune.
    param_grid : dict mapping parameter names to lists of candidate values.
    X_fit, y_fit : training features and labels used for the search.
    X_eval : features to predict on with the refitted best estimator.
    cv : number of cross-validation folds.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV, predictions for X_eval)``.
    """
    search = GridSearchCV(estimator, param_grid, cv=cv, scoring='accuracy')
    search.fit(X_fit, y_fit)
    return search, search.predict(X_eval)


# Logistic Regression (random_state matters for the liblinear solver)
lr = LogisticRegression(max_iter=10000, random_state=RANDOM_STATE)
lr_params = {'C':[0.01,0.1,1,10], 'solver':['liblinear','lbfgs']}
lr_grid, y_pred_lr = run_grid_search(lr, lr_params, X_train_scaled, y_train, X_test_scaled)

# Decision Tree
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_params = {'max_depth':[3,5,7,10,None], 'min_samples_split':[2,5,10]}
dt_grid, y_pred_dt = run_grid_search(dt, dt_params, X_train_scaled, y_train, X_test_scaled)

# Random Forest
rf = RandomForestClassifier(random_state=RANDOM_STATE)
rf_params = {'n_estimators':[50,100,200], 'max_depth':[5,10,None], 'min_samples_split':[2,5,10]}
rf_grid, y_pred_rf = run_grid_search(rf, rf_params, X_train_scaled, y_train, X_test_scaled)

In [9]:
def evaluate_classification(y_true, y_pred, name):
    """Print accuracy, confusion matrix and classification report for a model.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels, aligned with ``y_true``.
    name : display name of the model, used in the accuracy line.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    accuracy = accuracy_score(y_true, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.3f}")

    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(y_true, y_pred)
    print("Classification Report:\n", report)

# Report hold-out metrics for every tuned model, in the order they were trained.
# NOTE(review): the saved output shows 1.000 accuracy for all three models, and
# in the data preview Survived coincides with Sex on every displayed row;
# confirm that the target is not derived from a feature.
for model_name, predictions in (
    ("Logistic Regression", y_pred_lr),
    ("Decision Tree", y_pred_dt),
    ("Random Forest", y_pred_rf),
):
    evaluate_classification(y_test, predictions, model_name)


Logistic Regression Accuracy: 1.000
Confusion Matrix:
 [[50  0]
 [ 0 34]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84


Decision Tree Accuracy: 1.000
Confusion Matrix:
 [[50  0]
 [ 0 34]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84


Random Forest Accuracy: 1.000
Confusion Matrix:
 [[50  0]
 [ 0 34]]
Classification Report:
               precision    recall  f1-score   support

      

In [10]:
# Number of hold-out rows shown in the side-by-side comparison.
SAMPLE_SIZE = 10

# The grid searches were fitted on X_train_scaled, so the sample has to come
# from X_test_scaled (same scaler, same array type). Using X_test here would
# feed the model inputs that skipped the train-fitted scaler.
X_sample = X_test_scaled[:SAMPLE_SIZE]
# y_test keeps the original row labels after the split, so slice by position.
y_sample_true = y_test.iloc[:SAMPLE_SIZE]
best_model = rf_grid  # model chosen for the demonstration
y_sample_pred = best_model.predict(X_sample)

results_sample = pd.DataFrame({
    'True': y_sample_true.to_numpy(),
    'Predicted': y_sample_pred,
})
print(f"\nПрогноз для {SAMPLE_SIZE} випадків:\n", results_sample)


Прогноз для 10 випадків:
    True  Predicted
0     0          0
1     1          1
2     0          0
3     0          0
4     1          1
5     0          0
6     1          1
7     0          0
8     1          1
9     0          0


