In [1]:
import numpy as np
import pandas as pd
import os
from pprint import pprint
import sklearn.pipeline
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import expon, uniform
import string


In [2]:

def get_all_data_from_dir(dirname: str) -> tuple[list[str], list[str], list[str]]:
    # x_train = load_data_from_file(os.path.join(dirname, "x_train.csv"), "text")
    # y_train = load_data_from_file(os.path.join(dirname, "y_train.csv"), "is_positive_sentiment")
    # x_test = load_data_from_file(os.path.join(dirname, "x_test.csv"), "text")
    x_train = load_data_from_file("x_train.csv", "text")
    y_train = load_data_from_file("y_train.csv", "is_positive_sentiment")
    x_test = load_data_from_file(os.path.join(dirname, "x_test.csv"), "text")
    return remove_punctuation(x_train), y_train, remove_punctuation(x_test)

def remove_punctuation(strings: list[str]) -> list[str]:
    return [s.translate(str.maketrans('', '', string.punctuation)) for s in strings]

def load_data_from_file(filename: str, col: str) -> list[str]:
    csv_data = pd.read_csv(filename)
    # pprint(csv_data)
    list_of_sentences = csv_data[col].values.tolist()
    return list_of_sentences


In [27]:
x, y, x_test = get_all_data_from_dir("data_reviews")
x = np.array(x)
y = np.array(y, dtype=np.int32)
x_test = np.array(x_test)

pipeline = sklearn.pipeline.Pipeline([
    ("bow_feature_extractor", CountVectorizer(ngram_range=(1,1))),
    ("classifier", sklearn.linear_model.LogisticRegression(max_iter=2000)),
])

distributions = {
    "classifier__C": np.logspace(-5, 5, 50), 
    ""
    "bow_feature_extractor__min_df": np.arange(0, 10, 1), 
    "bow_feature_extractor__max_df": np.arange(0.9, 1.0, 0.01),
    "bow_feature_extractor__stop_words": ["english", None],

}

clf = sklearn.model_selection.RandomizedSearchCV(pipeline, distributions, n_iter=1000, verbose=3, n_jobs=-1)

clf.fit(x, y)


[CV 1/5] END bow_feature_extractor__max_df=0.9600000000000001, bow_feature_extractor__min_df=1, bow_feature_extractor__stop_words=english, classifier__C=0.004498432668969444;, score=0.781 total time=   0.0s
[CV 3/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=None, classifier__C=1.2648552168552958;, score=0.750 total time=   0.1s
[CV 3/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=english, classifier__C=0.0006866488450042998;, score=0.596 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.9600000000000001, bow_feature_extractor__min_df=1, bow_feature_extractor__stop_words=english, classifier__C=0.004498432668969444;, score=0.679 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.9700000000000001, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=None, classifier__C=3.2374575428176464;, score=0.829 total time=   0.1s
[C

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=6, bow_feature_extractor__stop_words=None, classifier__C=0.004498432668969444;, score=0.692 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=0.30888435964774846;, score=0.785 total time=   0.0s
[CV 3/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=0.30888435964774846;, score=0.723 total time=   0.0s
[CV 2/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=None, classifier__C=13.257113655901108;, score=0.806 total time=   0.1s
[CV 5/5] END bow_feature_extractor__max_df=0.9800000000000001, bow_feature_extractor__min_df=1, bow_feature_extractor__stop_words=None, classifier__C=54.286754393238596;, score=0.840 total

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END bow_feature_extractor__max_df=0.9600000000000001, bow_feature_extractor__min_df=3, bow_feature_extractor__stop_words=english, classifier__C=0.018420699693267165;, score=0.704 total time=   0.0s
[CV 2/5] END bow_feature_extractor__max_df=0.9800000000000001, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=english, classifier__C=2329.951810515372;, score=0.706 total time=   0.1s
[CV 2/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=None, classifier__C=0.018420699693267165;, score=0.738 total time=   0.0s
[CV 2/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=english, classifier__C=910.2981779915227;, score=0.735 total time=   0.0s
[CV 5/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=english, classifier__C=910.2981779915227;, score=0.708 total time=   0.1s
[CV 4/5] END bow_feat

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=1, bow_feature_extractor__stop_words=None, classifier__C=0.30888435964774846;, score=0.819 total time=   0.1s
[CV 5/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=5963.623316594637;, score=0.727 total time=   0.2s
[CV 5/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=None, classifier__C=3.2374575428176464;, score=0.821 total time=   0.1s
[CV 1/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=5963.623316594637;, score=0.750 total time=   0.2s
[CV 1/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=None, classifier__C=3.2374575428176464;, score=0.829 total time=   0.1s
[CV 2/5] END bow_feature_extractor__max_df=0.9, b

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END bow_feature_extractor__max_df=0.9700000000000001, bow_feature_extractor__min_df=8, bow_feature_extractor__stop_words=english, classifier__C=138.9495494373139;, score=0.698 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=english, classifier__C=3727.593720314938;, score=0.713 total time=   0.2s
[CV 3/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=None, classifier__C=0.001757510624854793;, score=0.642 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=3.2374575428176464;, score=0.767 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=3.2374575428176464;, score=0.733 total time=   0.1s
[CV 4/5] END bow_feature_ex

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=english, classifier__C=6.55128556859551e-05;, score=0.754 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.9600000000000001, bow_feature_extractor__min_df=6, bow_feature_extractor__stop_words=None, classifier__C=5.1794746792312125;, score=0.790 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.9700000000000001, bow_feature_extractor__min_df=2, bow_feature_extractor__stop_words=None, classifier__C=0.004498432668969444;, score=0.715 total time=   0.0s
[CV 3/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=5, bow_feature_extractor__stop_words=None, classifier__C=0.04714866363457394;, score=0.719 total time=   0.0s
[CV 2/5] END bow_feature_extractor__max_df=0.9600000000000001, bow_feature_extractor__min_df=6, bow_feature_extractor__stop_words=None, classifier__C=5.1794746792312125;, score=0.771 total time=   0.0s
[CV 5/5] E

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=None, classifier__C=2329.951810515372;, score=0.706 total time=   0.2s
[CV 4/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=8, bow_feature_extractor__stop_words=None, classifier__C=62505.51925273976;, score=0.704 total time=   0.2s
[CV 3/5] END bow_feature_extractor__max_df=0.9400000000000001, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=None, classifier__C=3727.593720314938;, score=0.677 total time=   0.1s
[CV 1/5] END bow_feature_extractor__max_df=0.9400000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=15264.179671752365;, score=0.748 total time=   0.2s
[CV 2/5] END bow_feature_extractor__max_df=0.9400000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=15264.179671752365;, score=0.719 total time=   0.3s
[CV 1/5]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END bow_feature_extractor__max_df=0.9800000000000001, bow_feature_extractor__min_df=3, bow_feature_extractor__stop_words=english, classifier__C=62505.51925273976;, score=0.690 total time=   0.5s
[CV 3/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=english, classifier__C=1e-05;, score=0.721 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=5, bow_feature_extractor__stop_words=english, classifier__C=62505.51925273976;, score=0.742 total time=   0.4s
[CV 4/5] END bow_feature_extractor__max_df=0.9800000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=222.29964825261956;, score=0.698 total time=   0.1s
[CV 4/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=english, classifier__C=1e-05;, score=0.752 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.92,

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=english, classifier__C=2.0235896477251556;, score=0.783 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=english, classifier__C=3727.593720314938;, score=0.740 total time=   0.2s
[CV 1/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=None, classifier__C=222.29964825261956;, score=0.744 total time=   0.1s
[CV 1/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=2, bow_feature_extractor__stop_words=english, classifier__C=0.0006866488450042998;, score=0.773 total time=   0.0s
[CV 5/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=english, classifier__C=2.0235896477251556;, score=0.808 total time=   0.0s
[CV 4/5] END bow_fe

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=2, bow_feature_extractor__stop_words=english, classifier__C=0.00010481131341546853;, score=0.773 total time=   0.0s
[CV 3/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=3, bow_feature_extractor__stop_words=None, classifier__C=0.49417133613238384;, score=0.750 total time=   0.0s
[CV 5/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=2, bow_feature_extractor__stop_words=english, classifier__C=0.00010481131341546853;, score=0.719 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=8, bow_feature_extractor__stop_words=english, classifier__C=0.002811768697974231;, score=0.654 total time=   0.0s
[CV 5/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=2, bow_feature_extractor__stop_words=None, classifier__C=0.1206792640639329;, score=0.804 total time=   0.0s
[CV 2/5] END bow_feature_e

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=english, classifier__C=1456.3484775012444;, score=0.721 total time=   0.1s
[CV 4/5] END bow_feature_extractor__max_df=0.9800000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=english, classifier__C=0.00016768329368110083;, score=0.650 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=5, bow_feature_extractor__stop_words=english, classifier__C=21.209508879201927;, score=0.731 total time=   0.0s
[CV 2/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=None, classifier__C=15264.179671752365;, score=0.792 total time=   0.1s
[CV 2/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=english, classifier__C=0.0002682695795279727;, score=0.698 total time=   0.0s
[CV 5/5] END bow_feature_ex

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=None, classifier__C=0.00016768329368110083;, score=0.617 total time=   0.0s
[CV 3/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=2, bow_feature_extractor__stop_words=english, classifier__C=86.85113737513521;, score=0.713 total time=   0.1s
[CV 5/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=None, classifier__C=2329.951810515372;, score=0.706 total time=   0.3s
[CV 3/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=english, classifier__C=0.30888435964774846;, score=0.756 total time=   0.0s
[CV 3/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=54.286754393238596;, score=0.694 total time=   0.1s
[CV 3/5] END bow_featur

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END bow_feature_extractor__max_df=0.92, bow_feature_extractor__min_df=8, bow_feature_extractor__stop_words=None, classifier__C=568.9866029018305;, score=0.723 total time=   0.1s
[CV 3/5] END bow_feature_extractor__max_df=0.9600000000000001, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=None, classifier__C=0.7906043210907702;, score=0.750 total time=   0.1s
[CV 3/5] END bow_feature_extractor__max_df=0.9700000000000001, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=english, classifier__C=3727.593720314938;, score=0.740 total time=   0.3s
[CV 4/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=english, classifier__C=568.9866029018305;, score=0.723 total time=   0.1s
[CV 4/5] END bow_feature_extractor__max_df=0.9400000000000001, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=None, classifier__C=33.9322177189533;, score=0.762 total time=   0.1s
[CV 1/5] END bow_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END bow_feature_extractor__max_df=0.9700000000000001, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=english, classifier__C=3727.593720314938;, score=0.740 total time=   0.4s
[CV 4/5] END bow_feature_extractor__max_df=0.9800000000000001, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=None, classifier__C=0.00042919342601287783;, score=0.660 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=6, bow_feature_extractor__stop_words=english, classifier__C=0.029470517025518096;, score=0.787 total time=   0.0s
[CV 5/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=6, bow_feature_extractor__stop_words=english, classifier__C=0.029470517025518096;, score=0.760 total time=   0.0s
[CV 2/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=6, bow_feature_extractor__stop_words=english, classifier__C=0.029470517025518096;, score=0.729 total time=   0.0s
[CV 4/5] EN

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=english, classifier__C=0.00042919342601287783;, score=0.713 total time=   0.0s
[CV 2/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=8, bow_feature_extractor__stop_words=None, classifier__C=0.19306977288832497;, score=0.754 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=5, bow_feature_extractor__stop_words=None, classifier__C=33.9322177189533;, score=0.727 total time=   0.1s
[CV 3/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=None, classifier__C=33.9322177189533;, score=0.679 total time=   0.1s
[CV 3/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=8, bow_feature_extractor__stop_words=None, classifier__C=0.19306977288832497;, score=0.717 total time=   0.0s
[CV 4/5] END bow_feature_extr

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=5, bow_feature_extractor__stop_words=None, classifier__C=222.29964825261956;, score=0.752 total time=   0.2s
[CV 2/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=8, bow_feature_extractor__stop_words=None, classifier__C=13.257113655901108;, score=0.738 total time=   0.1s
[CV 5/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=2, bow_feature_extractor__stop_words=None, classifier__C=355.64803062231283;, score=0.769 total time=   0.1s
[CV 2/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=2, bow_feature_extractor__stop_words=None, classifier__C=9540.954763499964;, score=0.762 total time=   0.1s
[CV 3/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=8, bow_feature_extractor__stop_words=None, classifier__C=13.257113655901108;, score=0.692 total time=   0.0s
[CV 1/5] END bow_feature_extracto

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=1, bow_feature_extractor__stop_words=None, classifier__C=4.094915062380427e-05;, score=0.669 total time=   0.0s
[CV 5/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=1, bow_feature_extractor__stop_words=None, classifier__C=4.094915062380427e-05;, score=0.677 total time=   0.0s
[CV 2/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=None, classifier__C=0.19306977288832497;, score=0.767 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=english, classifier__C=0.1206792640639329;, score=0.769 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=4, bow_feature_extractor__stop_words=english, classifier__C=0.00010481131341546853;, score=0.658 total time=   0.0s
[CV 4/5] END bo

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=english, classifier__C=0.30888435964774846;, score=0.710 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=english, classifier__C=0.30888435964774846;, score=0.744 total time=   0.0s
[CV 5/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=7, bow_feature_extractor__stop_words=english, classifier__C=0.30888435964774846;, score=0.769 total time=   0.0s
[CV 3/5] END bow_feature_extractor__max_df=0.9700000000000001, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=None, classifier__C=1456.3484775012444;, score=0.677 total time=   0.1s
[CV 1/5] END bow_feature_extractor__max_df=0.9700000000000001, bow_feature_extractor__min_df=9, bow_feature_extractor__stop_words=None, classifier__C=1456.3484775012444;, score=0.7

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END bow_feature_extractor__max_df=0.9400000000000001, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=None, classifier__C=13.257113655901108;, score=0.744 total time=   0.1s
[CV 3/5] END bow_feature_extractor__max_df=0.9400000000000001, bow_feature_extractor__min_df=5, bow_feature_extractor__stop_words=english, classifier__C=13.257113655901108;, score=0.735 total time=   0.1s
[CV 2/5] END bow_feature_extractor__max_df=0.9400000000000001, bow_feature_extractor__min_df=5, bow_feature_extractor__stop_words=english, classifier__C=13.257113655901108;, score=0.758 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=3, bow_feature_extractor__stop_words=english, classifier__C=5.1794746792312125;, score=0.787 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=5, bow_feature_extractor__stop_words=None, classifier__C=24420.53094548655;, score=0.681 total time=   0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=3, bow_feature_extractor__stop_words=None, classifier__C=13.257113655901108;, score=0.794 total time=   0.1s
[CV 4/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=5, bow_feature_extractor__stop_words=english, classifier__C=39069.39937054621;, score=0.717 total time=   0.3s
[CV 2/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=1, bow_feature_extractor__stop_words=None, classifier__C=1456.3484775012444;, score=0.798 total time=   0.1s
[CV 5/5] END bow_feature_extractor__max_df=0.9900000000000001, bow_feature_extractor__min_df=1, bow_feature_extractor__stop_words=None, classifier__C=1456.3484775012444;, score=0.835 total time=   0.1s
[CV 2/5] END bow_feature_extractor__max_df=0.9500000000000001, bow_feature_extractor__min_df=3, bow_feature_extractor__stop_words=None, classifier__C=13.257113655901108;, score=0.771 total t

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END bow_feature_extractor__max_df=0.91, bow_feature_extractor__min_df=3, bow_feature_extractor__stop_words=english, classifier__C=0.07543120063354623;, score=0.777 total time=   0.0s
[CV 4/5] END bow_feature_extractor__max_df=0.9, bow_feature_extractor__min_df=2, bow_feature_extractor__stop_words=english, classifier__C=54.286754393238596;, score=0.731 total time=   0.1s
[CV 4/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=english, classifier__C=2.559547922699533e-05;, score=0.765 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.9700000000000001, bow_feature_extractor__min_df=8, bow_feature_extractor__stop_words=None, classifier__C=0.49417133613238384;, score=0.785 total time=   0.0s
[CV 1/5] END bow_feature_extractor__max_df=0.93, bow_feature_extractor__min_df=0, bow_feature_extractor__stop_words=english, classifier__C=2.559547922699533e-05;, score=0.773 total time=   0.0s
[CV 5/5] END bow_feature_ex

In [24]:
clf.cv_results_


{'mean_fit_time': array([0.01336861, 0.02166481, 0.02912226, 0.0551404 , 0.01830974,
        0.02175751, 0.02653065, 0.03383536, 0.01884718, 0.18024564,
        0.02685347, 0.02777772, 0.12739534, 0.12624445, 0.01746197,
        0.05155711, 0.15871725, 0.02540178, 0.08226271, 0.02279053,
        0.07172875, 0.06217818, 0.0192977 , 0.02053633, 0.02319713,
        0.04978147, 0.03409448, 0.02040963, 0.02700863, 0.21709943,
        0.44626465, 0.19553051, 0.14683952, 0.03373976, 0.20569458,
        0.02337003, 0.10150118, 0.11397142, 0.05901713, 0.27361588,
        0.09590478, 0.12972875, 0.42204952, 0.12547812, 0.05368643,
        0.02444024, 0.05627956, 0.01619382, 0.31960988, 0.13970609,
        0.15935254, 0.02245722, 0.03402472, 0.09389944, 0.01734223,
        0.01756921, 0.02008777, 0.12060866, 0.02068396, 0.02378678,
        0.03527279, 0.03748884, 0.02468019, 0.11902399, 0.02088361,
        0.07522187, 0.11572404, 0.01771102, 0.03686895, 0.02491183,
        0.01946673, 0.06863217,

In [29]:
results = clf.cv_results_
valid_scores = results['mean_test_score']
param_scores = list(zip(results['params'], valid_scores)) # allows us to map params to scores

# sort scores by BEST at top
param_scores.sort(key=lambda x: x[1], reverse=True)

for params, score in param_scores:
    print(f"Mean Validation Score: {score}, Parameters: {params}")


Mean Validation Score: 0.8020833333333334, Parameters: {'classifier__C': 2.0235896477251556, 'bow_feature_extractor__stop_words': None, 'bow_feature_extractor__min_df': 0, 'bow_feature_extractor__max_df': 0.93}
Mean Validation Score: 0.8020833333333334, Parameters: {'classifier__C': 2.0235896477251556, 'bow_feature_extractor__stop_words': None, 'bow_feature_extractor__min_df': 1, 'bow_feature_extractor__max_df': 0.9600000000000001}
Mean Validation Score: 0.8, Parameters: {'classifier__C': 1.2648552168552958, 'bow_feature_extractor__stop_words': None, 'bow_feature_extractor__min_df': 0, 'bow_feature_extractor__max_df': 0.91}
Mean Validation Score: 0.8, Parameters: {'classifier__C': 1.2648552168552958, 'bow_feature_extractor__stop_words': None, 'bow_feature_extractor__min_df': 0, 'bow_feature_extractor__max_df': 0.92}
Mean Validation Score: 0.8, Parameters: {'classifier__C': 1.2648552168552958, 'bow_feature_extractor__stop_words': None, 'bow_feature_extractor__min_df': 0, 'bow_feature_ex

In [28]:
from sklearn.metrics import roc_auc_score

# Access mean test scores and parameters from clf.cv_results_
mean_test_scores = clf.cv_results_['mean_test_score']
params = clf.cv_results_['params']

# Iterate through the results and compute AUROC for each parameter combination
auroc_scores = []
for mean_score, param in zip(mean_test_scores, params):
    auroc = roc_auc_score(y, clf.predict_proba(x)[:, 1])  
    auroc_scores.append((param, auroc))

# Sort the AUROC scores by the highest value
sorted_auroc_scores = sorted(auroc_scores, key=lambda x: x[1], reverse=True)

# Print the AUROC scores for each parameter combination
for param, auroc in sorted_auroc_scores:
    print(f"Parameters: {param}, AUROC: {auroc:.4f}")


Parameters: {'classifier__C': 4.094915062380427e-05, 'bow_feature_extractor__stop_words': None, 'bow_feature_extractor__min_df': 5, 'bow_feature_extractor__max_df': 0.9500000000000001}, AUROC: 0.9978
Parameters: {'classifier__C': 2329.951810515372, 'bow_feature_extractor__stop_words': None, 'bow_feature_extractor__min_df': 5, 'bow_feature_extractor__max_df': 0.9900000000000001}, AUROC: 0.9978
Parameters: {'classifier__C': 222.29964825261956, 'bow_feature_extractor__stop_words': 'english', 'bow_feature_extractor__min_df': 3, 'bow_feature_extractor__max_df': 0.9400000000000001}, AUROC: 0.9978
Parameters: {'classifier__C': 568.9866029018305, 'bow_feature_extractor__stop_words': None, 'bow_feature_extractor__min_df': 2, 'bow_feature_extractor__max_df': 0.9700000000000001}, AUROC: 0.9978
Parameters: {'classifier__C': 86.85113737513521, 'bow_feature_extractor__stop_words': None, 'bow_feature_extractor__min_df': 7, 'bow_feature_extractor__max_df': 0.9}, AUROC: 0.9978
Parameters: {'classifier_