In [1]:
import pathlib

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics


In [2]:
df_train = pd.read_csv(pathlib.Path("../data/interim/bank_train.csv"))
df_test = pd.read_csv(pathlib.Path("../data/interim/bank_test.csv"))

In [3]:
list(df_train.columns)

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'target']

In [4]:
df_train.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
target        int64
dtype: object

In [5]:
numeric_features = [
    'age',
    'balance',
    'day',
    'campaign',
    'pdays',
    'previous',
]

In [6]:
categorical_features = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'campaign',
    'pdays',
    'previous',
]

In [7]:
#skirtingi pipelinai skirtingiems pozymiems
numeric_transformer_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), #impute -> nezinomas reiksmes panaikina idedamas mediana (stulpelio?)
    ('scaler', StandardScaler())])

categorical_transformer_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), #impute -> nezinomas reiksmes panaikina idedami konstanta, nes kategoriniai kintamieji
    ('onehot', OneHotEncoder(handle_unknown='ignore'))]) #ka negautume klaidos 'ignore'

#apdorojimo pipelinas
preprocessor_pipe = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_pipe, numeric_features),
        ('cat', categorical_transformer_pipe, categorical_features)])

In [13]:
X_train = df_train.drop('target', axis=1)
y_train = df_train['target']

X_test = df_test.drop('target', axis=1)
y_test = df_test['target']

In [15]:
#issikvieciam paramettrus
#help(RandomForestClassifier)

Help on class RandomForestClassifier in module sklearn.ensemble.forest:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators='warn', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None)
 |  
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is always the same as the original
 |  input sample size but the samples are drawn with replacement if
 |  `bootstrap=True` (default).
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimators

In [14]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor_pipe),
    ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=100))]) #100 medziu

clf.fit(X_train, y_train)
#turim apmokyta pipelina

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [16]:
#accuracy
print("model score: {:.3f}".format(clf.score(X_test, y_test)))

model score: 0.887


In [15]:
#accuracy
print("model accuracy: {:.3f}".format(metrics.accuracy_score(y_test, clf.predict(X_test))))

print("model precision: {:.3f}".format(metrics.precision_score(y_test, clf.predict(X_test))))

print("model recall: {:.3f}".format(metrics.recall_score(y_test, clf.predict(X_test))))

print("model F1: {:.3f}".format(metrics.f1_score(y_test, clf.predict(X_test))))

print("model AuROC: {:.3f}".format(metrics.roc_auc_score(y_test, clf.predict(X_test))))

model accuracy: 0.888
model precision: 0.578
model recall: 0.145
model F1: 0.232
model AuROC: 0.565


In [16]:
print(metrics.classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      5989
           1       0.58      0.15      0.23       793

    accuracy                           0.89      6782
   macro avg       0.74      0.57      0.59      6782
weighted avg       0.86      0.89      0.86      6782



In [22]:
#10
param_grid = {
    #'processing__num__imputer__strategy': ["median", "mean"], 
    'classifier__n_estimators': [5, 10, 30, 50, 100], #lab 3 praplesti parametru gardele
    'classifier__max_depth': [3, 7, 10, 5], #__ u apatiniai bruksneliai sckitlearn kazkas
}

grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                   

In [23]:
#11
print("model accuracy: {:.3f}".format(metrics.accuracy_score(y_test, grid_search.predict(X_test))))

print("model precision: {:.3f}".format(metrics.precision_score(y_test, grid_search.predict(X_test))))

print("model recall: {:.3f}".format(metrics.recall_score(y_test, grid_search.predict(X_test))))

print("model F1: {:.3f}".format(metrics.f1_score(y_test, grid_search.predict(X_test))))

print("model AuROC: {:.3f}".format(metrics.roc_auc_score(y_test, grid_search.predict(X_test))))

model accuracy: 0.883
model precision: 0.000
model recall: 0.000
model F1: 0.000
model AuROC: 0.500


In [21]:
# TODO: upsample / downnsample