In [6]:
import pandas as pd
import numpy as np

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

import pprint
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import accuracy_score

# Load the SMS corpus. read_table defaults to a tab separator; the file has no
# header row, so the two columns are named explicitly.
sms = pd.read_table('Dane/sms.tsv', header=None, names=['label', 'message'])

# Encode the textual class as an integer target: ham -> 0, spam -> 1.
sms['label'] = sms['label'].map({'ham': 0, 'spam': 1})

X = sms['message']  # raw message text; vectorised later inside each pipeline
y = sms['label']    # integer class labels

# Hold out 1000 messages for the final evaluation.
# - random_state makes the split (and therefore every score printed below)
#   reproducible between runs.
# - stratify=y keeps the ham/spam proportions identical in both parts, so the
#   held-out accuracy is not distorted by an unlucky draw of the rarer class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1000, random_state=42, stratify=y
)

# The four sequences below are parallel: position i of each one describes the
# same candidate model (display name, pipeline steps, search grid, on/off flag).

# Display names, kept as an ndarray so they can be filtered with the boolean
# `uses` mask further down.
names = np.array([
    "Naiwny Bayes",
    "Drzewo decyzyjne",
    "Regresja logistyczna",
    "SVM",
    "BaggingClassifier_tree",
    "BaggingClassifier_bayes",
    "RandomForest",
])

# Pipeline steps for each model. The final step is always called "model" so
# that the grids below can address its hyper-parameters as "model__<param>".
models = [
    [("model", MultinomialNB())],
    [("model", DecisionTreeClassifier())],
    # liblinear supports both the "l1" and "l2" penalties searched below; the
    # default solver of recent scikit-learn releases (lbfgs) rejects "l1".
    [("scaler", MaxAbsScaler()), ("model", LogisticRegression(solver="liblinear"))],
    [("scaler", MaxAbsScaler()), ("model", SVC())],
    # The wrapped estimator is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2, while the
    # first positional slot works on both old and new releases.
    [("model", BaggingClassifier(DecisionTreeClassifier()))],
    [("model", BaggingClassifier(MultinomialNB()))],
    [("model", RandomForestClassifier())],
]

# Hyper-parameter grids, one per model. An entry may be a single dict or a
# list of dicts (used for SVC so that kernel-specific parameters are only
# combined with the kernel they belong to).
param_grids = [
    {"model__alpha": [0.1, 1, 10], "model__fit_prior": [False, True]},
    {
        "model__criterion": ["gini", "entropy"],
        "model__min_samples_split": [2, 10, 100],
        "model__max_depth": [None, 2, 10, 100],
    },
    {"model__penalty": ["l1", "l2"], "model__C": [0.1, 1, 10]},
    [
        {"model__kernel": ["rbf"], "model__gamma": [0.1, 1]},
        {"model__kernel": ["poly"], "model__degree": [2, 3]},
    ],
    {"model__n_estimators": [2, 5, 100], "model__max_features": [0.5, 0.8, 1.0]},
    {"model__n_estimators": [2, 5, 100], "model__max_features": [0.5, 0.8, 1.0]},
    {"model__n_estimators": [2, 5, 100]},
]

# Switches selecting which models are actually tuned in this run.
uses = np.array([True, False, False, False, True, True, True])

# Sanity check: the four parallel configuration sequences must describe the
# same number of models. On a mismatch, report every length and abort.
config_lengths = {
    "names": len(names),
    "models": len(models),
    "param_grids": len(param_grids),
    "uses": len(uses),
}
if len(set(config_lengths.values())) > 1:
    for label, size in config_lengths.items():
        print(f"len({label}): {size}")
    raise ValueError("Listy nie mają tej samej długości!")

# First step shared by every pipeline: bag-of-words counts over the 3000 most
# frequent terms, with English stop words removed.
vectorizer = [("vectorizer", CountVectorizer(stop_words="english", max_features=3000))]

# Best fitted pipeline of every enabled model, in the order they are tuned.
best_models = []

for use, name, pipe, params in zip(uses, names, models, param_grids):
    if not use:
        continue
    print(f"Tunuje model: {name}")
    pipeline = Pipeline(vectorizer + pipe)
    # Cross-validated grid search on the training part only; the held-out
    # test set is not touched until the final evaluation below.
    gs = GridSearchCV(estimator=pipeline, param_grid=params, n_jobs=3)
    gs.fit(X_train, y_train)

    # Report score and timing statistics for every parameter combination.
    results = gs.cv_results_
    for mean, std, param, fit_time, score_time in zip(
        results["mean_test_score"],
        results["std_test_score"],
        results["params"],
        results["mean_fit_time"],
        results["mean_score_time"],
    ):
        print(f"{param}:\n mean: {np.round(mean, 4)}, std: {np.round(std,4)},\n fit_time: {np.round(fit_time, 4)}, score_time: {np.round(score_time,4)}\n")
    best_models.append(gs.best_estimator_)

# Keep `best_models` a 1-D object array, but fill it element by element.
# Pipeline objects implement __len__/__getitem__, so np.array(list_of_pipelines)
# may treat each pipeline as a sequence of its steps and produce a 2-D array
# (or fail on ragged lengths) instead of an array of pipelines.
fitted_pipelines = best_models
best_models = np.empty(len(fitted_pipelines), dtype=object)
for idx, fitted in enumerate(fitted_pipelines):
    best_models[idx] = fitted

print("\n\nTestuję:")
# names[uses] keeps only the enabled models, in the same order in which their
# best estimators were collected above.
for name, best_model in zip(names[uses], best_models):
    # accuracy_score expects (y_true, y_pred).
    print(f"{name}: {accuracy_score(y_test, best_model.predict(X_test))}")

Tunuje model: Naiwny Bayes
{'model__alpha': 0.1, 'model__fit_prior': False}:
 mean: 0.9705, std: 0.0023,
 fit_time: 0.1209, score_time: 0.0683

{'model__alpha': 0.1, 'model__fit_prior': True}:
 mean: 0.9836, std: 0.0011,
 fit_time: 0.1081, score_time: 0.0561

{'model__alpha': 1, 'model__fit_prior': False}:
 mean: 0.9639, std: 0.0023,
 fit_time: 0.1184, score_time: 0.0501

{'model__alpha': 1, 'model__fit_prior': True}:
 mean: 0.9843, std: 0.0028,
 fit_time: 0.1292, score_time: 0.0433

{'model__alpha': 10, 'model__fit_prior': False}:
 mean: 0.956, std: 0.0067,
 fit_time: 0.1182, score_time: 0.0436

{'model__alpha': 10, 'model__fit_prior': True}:
 mean: 0.9786, std: 0.0041,
 fit_time: 0.1139, score_time: 0.044

Tunuje model: BaggingClassifier_tree
{'model__max_features': 0.5, 'model__n_estimators': 2}:
 mean: 0.9541, std: 0.0021,
 fit_time: 0.3602, score_time: 0.046

{'model__max_features': 0.5, 'model__n_estimators': 5}:
 mean: 0.9718, std: 0.004,
 fit_time: 0.7593, score_time: 0.0759

{