In [1]:
import pandas
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Students performance in exams
# https://www.kaggle.com/spscientist/students-performance-in-exams

data = pandas.read_csv('C:\\Users\\aoglo\\Documents\\university\\NIR\\practice_4\\StudentsPerformance.csv')
data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [3]:
data_sel = data.loc[:, data.columns.isin(['test preparation course',  
                                          'gender', 
                                          'parental level of education', 
                                          'math score'])]

data_sel = data_sel.dropna()
data_sel['gender'] = np.where(data_sel['gender'] == 'male', 1, 0)

# Изменяем значения параметра parental level of education на
# high school/some high school -> 0
# some college -> 1
# associate's degree -> 2
# bachelor's degree -> 3
# master's degree -> 4

cleanup_lvl_edu = {
    'parental level of education':{
        'high school': 0,
        'some high school': 0,
        'some college': 1,
        'associate\'s degree': 2,
        'bachelor\'s degree': 3,
        'master\'s degree': 4
    }}

data_sel = data_sel.replace(cleanup_lvl_edu)
data_sel['test preparation course'] = np.where(data_sel['test preparation course'] == 'completed', 1, 0)

test_preparation_course = data_sel.loc[:, data_sel.columns.isin(['test preparation course'])]
X = data_sel.loc[:, data_sel.columns.isin(['gender', 
                                          'parental level of education', 
                                          'math score'])]

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_validation, y_train, y_validation = train_test_split(X, test_preparation_course, test_size=.33, random_state=1)

T = DecisionTreeClassifier(random_state=241, max_depth=4)
T = T.fit(x_train, y_train)
data_sel.head()

Unnamed: 0,gender,parental level of education,test preparation course,math score
0,0,3,0,72
1,0,1,1,69
2,0,4,0,90
3,1,2,0,47
4,1,1,0,76


In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
#обычное решающее дерево, автоматический подбор удачных параметров дерева

grid_search_cv = GridSearchCV(cv=3, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, random_state=42,
            splitter='best'), n_jobs=None,
       param_grid={'max_depth': list(range(2, 30)), 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)
grid_search_cv.fit(x_train, y_train)
grid_search_cv.best_estimator_

Fitting 3 folds for each of 84 candidates, totalling 252 fits


In [10]:
from sklearn.model_selection import cross_val_score

print('precision:'+str(np.average(cross_val_score(grid_search_cv.best_estimator_, x_validation, y_validation, scoring='precision'))))
print('recall:'+str(np.average(cross_val_score(grid_search_cv.best_estimator_, x_validation, y_validation, scoring='recall'))))
print('f1:'+str(np.average(cross_val_score(grid_search_cv.best_estimator_, x_validation, y_validation, scoring='f1'))))

precision:0.36980769230769234
recall:0.23550724637681159
f1:0.2846880570409982


In [15]:
# Импорт необходимых библиотек
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score


param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Создание классификатора Случайный Лес
clf = RandomForestClassifier()

# Использование GridSearch для выбора лучших параметров
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy')
grid_search.fit(x_train, y_train.values.ravel())

# Получение лучших параметров
best_params = grid_search.best_params_
print("Лучшие параметры:", best_params)

# Создание классификатора с лучшими параметрами
best_clf = RandomForestClassifier(**best_params)

# Обучение классификатора на обучающей выборке с лучшими параметрами
best_clf.fit(x_train, y_train.values.ravel())

# Прогнозирование классов для тестовой выборки
y_pred = best_clf.predict(x_validation)

# Оценка качества классификатора
precision = precision_score(y_validation, y_pred)
recall = recall_score(y_validation, y_pred)
f1 = f1_score(y_validation, y_pred)

print("Метрики на тестовой выборке:")
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

# Лучшие параметры: {'max_depth': 50, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
# Метрики на тестовой выборке:
# Precision: 0.4142857142857143
# Recall: 0.24369747899159663
# F1: 0.3068783068783069

Лучшие параметры: {'max_depth': 50, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Метрики на тестовой выборке:
Precision: 0.4142857142857143
Recall: 0.24369747899159663
F1: 0.3068783068783069
