In [11]:
import pathlib
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import json


# Read the interim train and test CSV files into DataFrames.
train_csv = pathlib.Path("../data/interim/bank_train.csv")
test_csv = pathlib.Path("../data/interim/bank_test.csv")

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)


# Columns routed through the numeric branch of the preprocessor
# (imputed and scaled).
numeric_features = [
    'age',
    'balance',
    'day',
    'campaign',
    'pdays',
    'previous',
]

# Columns routed through the categorical branch (imputed and one-hot encoded).
# 'campaign', 'pdays' and 'previous' used to be repeated here, which made the
# ColumnTransformer emit them twice: once scaled and once one-hot encoded per
# distinct value. They are numeric, so they now live only in numeric_features.
# NOTE(review): if the interim CSV carries another categorical column that the
# duplicated entries were meant to stand in for, add it here - confirm against
# the data.
categorical_features = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
]


# Different preprocessing pipelines for the different kinds of features.

# Numeric columns: replace missing values with the column median, then
# standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer_pipe = Pipeline(steps=numeric_steps)

# Categorical columns: replace missing values with the constant 'missing'
# (a median makes no sense for categories), then one-hot encode.
# handle_unknown='ignore' avoids an error when a category unseen during
# fitting shows up at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer_pipe = Pipeline(steps=categorical_steps)

# Preprocessing pipeline: send each column list through its own transformer.
column_branches = [
    ('num', numeric_transformer_pipe, numeric_features),
    ('cat', categorical_transformer_pipe, categorical_features),
]
preprocessor_pipe = ColumnTransformer(transformers=column_branches)


# Separate the 'target' label from the feature columns for both splits.
y_train = df_train['target']
X_train = df_train.drop(columns='target')

y_test = df_test['target']
X_test = df_test.drop(columns='target')



# Full model: the preprocessor followed by a random forest of 100 trees.
# random_state is pinned so that repeated runs (and the grid search that
# clones this pipeline) give reproducible results; without it every run
# grows a different forest.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor_pipe),
    ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=100, random_state=42)),
])

# Fit on the training split; from here on `clf` is a trained pipeline.
clf.fit(X_train, y_train)



#10
# Hyper-parameter grid for the search. Keys use scikit-learn's
# `<pipeline step name>__<parameter>` syntax (double underscore), so
# `classifier__max_depth` sets `max_depth` on the 'classifier' step.
param_grid = {
    'classifier__n_estimators': [5, 10, 30, 50, 100],  # lab 3: extend the parameter grid
    'classifier__max_depth': [3, 7, 10, 5],
}

# 10-fold cross-validated search scored by ROC AUC.
# The former `iid=False` argument is gone: the parameter was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Since 0.22 the default already averages scores across folds the way
# iid=False did.
grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

#11
# Evaluate the refitted best estimator on the held-out test split.
# Predict once and reuse the result instead of re-running the pipeline for
# every metric.
y_pred = grid_search.predict(X_test)

# ROC AUC is a ranking metric, so it needs continuous scores rather than hard
# 0/1 labels. Column 1 of predict_proba is the probability of the second
# class.
# NOTE(review): assumes a binary target whose positive class is the larger
# label (consistent with precision_score's default pos_label=1) - confirm.
y_score = grid_search.predict_proba(X_test)[:, 1]

accuracy = "model accuracy: {:.3f}".format(metrics.accuracy_score(y_test, y_pred))

precision = "model precision: {:.3f}".format(metrics.precision_score(y_test, y_pred))

recall = "model recall: {:.3f}".format(metrics.recall_score(y_test, y_pred))

f1 = "model F1: {:.3f}".format(metrics.f1_score(y_test, y_pred))

AuROC = "model AuROC: {:.3f}".format(metrics.roc_auc_score(y_test, y_score))

# Deliberately NOT named `metrics`: rebinding that name would shadow the
# imported sklearn `metrics` module and make this cell fail on a second run
# ('list' object has no attribute 'accuracy_score').
metric_report = [accuracy, precision, recall, f1, AuROC]

print(metric_report)

# Persist the formatted metric strings as a JSON list.
with open('metrics.json', 'w') as f:
    json.dump(metric_report, f)
    
# cv_results_ stores most of its values as numpy arrays, which the json module
# cannot serialise; convert those to plain lists and pass everything else
# through unchanged.
cv_results = {
    key: (value.tolist() if isinstance(value, np.ndarray) else value)
    for key, value in grid_search.cv_results_.items()
}

# Write the cross-validation results to disk. (A bare `json.dumps(...)` call
# used to sit here; its return value was discarded, so it has been removed.)
with open('data.json', 'w') as f:
    json.dump(cv_results, f)

In [2]:
# Rebuild a JSON-friendly copy of the grid-search results (numpy arrays become
# plain lists) and show it as the cell output.
cv_results = {
    name: (column.tolist() if isinstance(column, np.ndarray) else column)
    for name, column in grid_search.cv_results_.items()
}
cv_results

{'mean_fit_time': [0.30994420051574706,
  0.31385581493377684,
  0.369889497756958,
  0.5396544456481933,
  0.7931793212890625,
  0.3442150831222534,
  0.32898476123809817,
  0.5121245622634888,
  0.643914771080017,
  1.166583514213562,
  0.3872664928436279,
  0.35983474254608155,
  0.6748606204986572,
  0.9550361633300781,
  1.7876646041870117,
  0.44434094429016113,
  0.32131693363189695,
  0.39527115821838377,
  0.5837795734405518,
  0.9265502452850342],
 'std_fit_time': [0.005081527753952212,
  0.009614614213546892,
  0.028493760879328717,
  0.02407032301009455,
  0.058275565491222384,
  0.07460889440425227,
  0.013292685002816683,
  0.03629670019229629,
  0.04951847941671331,
  0.09234933032408549,
  0.13335275911951425,
  0.027660788396468714,
  0.06965170983590238,
  0.1066209479972248,
  0.322625971258905,
  0.2786624093399402,
  0.0135588815781254,
  0.035210246406538165,
  0.05213594422918728,
  0.12683313117394515],
 'mean_score_time': [0.12375040054321289,
  0.1241773843765

In [5]:
# json.dumps serialises an object to a JSON string; json.dump writes the
# serialised form straight to an open file.
pretty_json = json.dumps(cv_results, indent=4)
print(pretty_json)


with open('data.json', 'w') as out_file:
    json.dump(cv_results, out_file)

{
    "mean_fit_time": [
        0.30994420051574706,
        0.31385581493377684,
        0.369889497756958,
        0.5396544456481933,
        0.7931793212890625,
        0.3442150831222534,
        0.32898476123809817,
        0.5121245622634888,
        0.643914771080017,
        1.166583514213562,
        0.3872664928436279,
        0.35983474254608155,
        0.6748606204986572,
        0.9550361633300781,
        1.7876646041870117,
        0.44434094429016113,
        0.32131693363189695,
        0.39527115821838377,
        0.5837795734405518,
        0.9265502452850342
    ],
    "std_fit_time": [
        0.005081527753952212,
        0.009614614213546892,
        0.028493760879328717,
        0.02407032301009455,
        0.058275565491222384,
        0.07460889440425227,
        0.013292685002816683,
        0.03629670019229629,
        0.04951847941671331,
        0.09234933032408549,
        0.13335275911951425,
        0.027660788396468714,
        0.06965170983590238,
