In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('GlobalYouTubeStatistics.csv', delimiter=',', encoding='latin1')

In [11]:
from sklearn.model_selection import train_test_split

x = data.drop('subscribers', axis=1)
y = data['subscribers']

# Переведення категоріальних даних у числові
x_changed = pd.get_dummies(x, drop_first=True)
x_changed.head()

# Випадкове розбиття на навчальний та тестовий набори
X_train, X_test, y_train, y_test = train_test_split(x_changed, y, test_size=0.3)

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Створюємо модель дерева рішень
model = DecisionTreeClassifier()

# Навчаємо модель на навчальних даних
model.fit(X_train, y_train)

In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Оцінка якості моделі
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)
precision = precision_score(y_test, test_predict, average='macro', zero_division=0)
recall = recall_score(y_test, test_predict, average='macro', zero_division=0)
f1 = f1_score(y_test, test_predict, average='macro', zero_division=0)

# Точність
print("Precision:", precision)

# Повнота
print("Recall:", recall)

# Точність і повнота в одному числі
print("F1 Score:", f1)

Precision: 0.2736322188449848
Recall: 0.3001899696048632
F1 Score: 0.27445621288006505


In [14]:
# Матриця плутанини
conf_matrix = confusion_matrix(y_test, test_predict)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[4 0 0 ... 0 0 0]
 [0 8 0 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [15]:
from sklearn.metrics import accuracy_score

# Підбір оптимальних параметрів моделі

best_train_accuracy = 0
best_test_accuracy = 0
best_max_depth = 1

for depth in range(1, 11):
    model = DecisionTreeClassifier(max_depth=depth)
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    
    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    
    if test_accuracy > best_test_accuracy:
        best_test_accuracy = test_accuracy
        best_train_accuracy = train_accuracy
        best_max_depth = depth

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)
print("------------------------------")
print("Best test accuracy:", best_test_accuracy)
print("Best train accuracy:", best_train_accuracy)
print("Best max depth:", best_max_depth)

Train accuracy: 0.5086206896551724
Test accuracy: 0.38461538461538464
------------------------------
Best test accuracy: 0.38461538461538464
Best train accuracy: 0.5086206896551724
Best max depth: 10


In [16]:
# Перебудова дерева
rebuild_model = DecisionTreeClassifier(max_depth=best_max_depth)
rebuild_model.fit(X_train, y_train)
rebuild_train_prediction = rebuild_model.predict(X_train)
rebuild_test_prediction = rebuild_model.predict(X_test)

In [17]:
# Оцінка параметрів перебудованої моделі
print("Train accuracy:", accuracy_score(y_train, rebuild_train_prediction))
print("Test accuracy:", accuracy_score(y_test, rebuild_test_prediction))

Train accuracy: 0.3089080459770115
Test accuracy: 0.22742474916387959
