In [1]:
from brfss_functions import load_brfss_data, split_brfss_dataset, \
    CustomAttributeDropper
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from basic_functions import display_scores

# Load the BRFSS data set and unpack the four pieces returned by the
# project's split helper (train/test inputs and their labels, per the names).
data = load_brfss_data()
train, train_labels, test, test_labels = split_brfss_dataset(data)

# Preprocessing chain: the project's attribute dropper runs first, then
# every remaining feature is standardized.
data_pipeline = Pipeline(steps=[
    ('attr_dropper', CustomAttributeDropper()),
    ('std_scaler', StandardScaler()),
])

# The pipeline is fit on the training split only; `test` is left untouched.
# NOTE(review): the scaler statistics come from the whole training split, so
# any cross-validation run on `train_prepared` sees them in every fold —
# consider placing the estimator inside the pipeline instead.
train_prepared = data_pipeline.fit_transform(train)

In [2]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with stochastic gradient descent. SGD shuffles the
# training data after each epoch, so the seed is pinned to keep the
# cross-validation scores identical across kernel restarts.
sgd_clf = SGDClassifier(random_state=42)

# 3-fold cross-validation scored with balanced accuracy.
sgd_scores = cross_val_score(
    estimator=sgd_clf,
    X=train_prepared,
    y=train_labels,
    scoring='balanced_accuracy',
    cv=3)

display_scores(sgd_scores)

Scores: [0.33333333 0.33333333 0.33333333]
Mean: 0.3333333333333333
Standard deviation: 0.0


In [3]:
from sklearn.svm import LinearSVC

# Linear support vector classifier with a raised iteration cap (10,000).
# The seed is pinned because the dual coordinate-descent solver shuffles
# the data; it has no effect when the solver runs with dual=False.
lin_svc = LinearSVC(max_iter=10_000, random_state=42)

# 3-fold cross-validation scored with balanced accuracy.
lin_svc_scores = cross_val_score(
    estimator=lin_svc,
    X=train_prepared,
    y=train_labels,
    scoring='balanced_accuracy',
    cv=3)

display_scores(lin_svc_scores)

Scores: [0.35995087 0.3603847  0.35696911]
Mean: 0.359101561239723
Standard deviation: 0.0015182348616734535


In [6]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a pinned seed so repeated runs give the same scores.
forest_clf = RandomForestClassifier(random_state=42)

# 3-fold cross-validation on the prepared training data, scored with
# balanced accuracy.
forest_scores = cross_val_score(
    forest_clf,
    train_prepared,
    train_labels,
    scoring='balanced_accuracy',
    cv=3,
)

display_scores(forest_scores)

Scores: [0.39319569 0.39419173 0.39116487]
Mean: 0.39285076016749815
Standard deviation: 0.001259549699395266


In [8]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched: the first varies only the number of trees
# (leaving the estimator's own bootstrap setting in place), the second
# repeats the same sweep with bootstrapping switched off.
forest_param_grid = [
    {'n_estimators': [100, 200, 400]},
    {'n_estimators': [100, 200, 400], 'bootstrap': [False]},
]

# Exhaustive search with 5-fold cross-validation, ranked by balanced
# accuracy; training-fold scores are recorded alongside the validation ones.
forest_grid_search = GridSearchCV(
    forest_clf,
    forest_param_grid,
    scoring='balanced_accuracy',
    cv=5,
    return_train_score=True,
)

forest_grid_search.fit(train_prepared, train_labels)

# Report the best-scoring parameter combination.
print(forest_grid_search.best_params_)


{'bootstrap': False, 'n_estimators': 400}
