In [None]:
from pprint import pprint
from time import time
import logging
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import joblib
import numpy as np

# Configure the root logger: timestamped records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)

# One-step pipeline holding a logistic-regression classifier.  The step name
# 'logreg' is the prefix used by the 'logreg__C' entry of the search space.
pipeline = Pipeline(steps=[('logreg', LogisticRegression(C=1))])

# Search space sampled by RandomizedSearchCV, keyed '<step name>__<parameter>'.
# Candidate C values run from 0.01 to 9.99 in steps of 0.01.  The grid starts
# at 0.01 rather than 0.0 because LogisticRegression requires C (the inverse
# regularization strength) to be strictly positive; drawing C=0 would make
# that candidate's fit fail.  Building the grid from integers keeps its
# length and endpoints exact instead of relying on float-step accumulation.
parameters = {
    'logreg__C': np.arange(1, 1000) * 0.01,
}

def select_useful_columns(x, min_count=20, max_count=500):
    """Return the indices of columns that are neither too rare nor too common.

    A column is kept when the number of rows holding a strictly positive
    value in it lies strictly between ``min_count`` and ``max_count``.

    Args:
        x: Dense 2-D array-like of shape (n_rows, n_columns).
        min_count: Exclusive lower bound on the per-column positive count.
        max_count: Exclusive upper bound on the per-column positive count.

    Returns:
        A 1-D integer array of the selected column indices, ascending.
        It is empty when no column qualifies.
    """
    # One vectorized pass over the matrix instead of a Python loop that
    # calls np.where once per column.
    positive_counts = np.count_nonzero(np.asarray(x) > 0, axis=0)
    keep = (positive_counts > min_count) & (positive_counts < max_count)
    return np.flatnonzero(keep)


if __name__ == "__main__":
    # The stored feature object must expose ``.toarray()`` -- presumably a
    # SciPy sparse matrix; confirm against the code that wrote the file.
    features = joblib.load('./all_mp_2018_hashtags.allx')
    x = features.toarray()

    # Labels are stored as a mapping; only its values are used.
    # NOTE(review): this assumes the mapping's iteration order matches the
    # row order of the feature matrix -- verify against the producer.
    labels_by_key = joblib.load('./all_health_topic_labels.y')
    y = list(labels_by_key.values())

    # Drop columns that are positive in too few or too many rows.
    useful_x = x[:, select_useful_columns(x)]

    x_tfidf = TfidfTransformer().fit_transform(useful_x)

    # Randomized (not exhaustive) search over the C grid with 5-fold CV.
    # No random_state is passed, so the sampled candidates vary per run.
    grid_search = RandomizedSearchCV(pipeline, parameters, cv=5,
                                     n_jobs=4, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(x_tfidf.toarray(), y)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Report only the parameters that were part of the search space.
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
