# LogisticRegression

### Base imports

In [1]:
import os

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', 100)

import matplotlib
%matplotlib inline
from matplotlib import pylab as plt
import seaborn as sns

### ML imports

In [2]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold

In [3]:
from sklearn.linear_model import LogisticRegression as estimator_class

### Local imports

In [4]:
from disease_ml.data import load_data
from disease_ml.utils import make_submission

### Constants

In [5]:
# Label recorded in the submission metadata and passed to make_submission.
EXPERIMENT_TITLE = 'Logreg'

# True  -> tune hyper-parameters with GridSearchCV over estimator_param_grid.
# False -> cross-validate the fixed estimator_params once and refit on all data.
DO_GRID_SEARCH = False

# Scorer name handed to both cross_val_score and GridSearchCV.
SCORING = 'neg_log_loss'
# Shared splitter: 6 stratified, shuffled folds with a fixed seed so that
# scores are comparable between runs.
CV = StratifiedKFold(n_splits=6, shuffle=True, random_state=0)

# Worker count passed as n_jobs to cross_val_score / GridSearchCV.
N_JOBS = 3

# Passed to make_submission; presumably the output directory for submission
# files -- confirm against disease_ml.utils.
SUBMISSIONS_DIR = 'submissions/'

### Loading data

In [6]:
# Preprocessing options forwarded verbatim to load_data(); the same dict is
# stored later in the submission metadata, so keep it in sync with what was
# actually used to build the features.
data_params = dict(
    imputer_strategy='most_frequent',
    scale=True,
    ohe=('category',),
    # Disabled options, kept for reference:
    #     hashing_trick=True,
    #     binary=False,
    #     ngram_range=(1, 3),
    feature_types=('numeric', 'category', 'ordered_category'),
)

x_train, y_train, x_test = load_data(**data_params)

In [7]:
# Sanity check: report the dimensions of every array returned by load_data.
for array_name, array in (('x_train', x_train),
                          ('y_train', y_train),
                          ('x_test', x_test)):
    print('{}.shape={}'.format(array_name, array.shape))

x_train.shape=(4099, 1313)
y_train.shape=(4099,)
x_test.shape=(1366, 1313)


### Training

In [8]:
# Instantiate with no arguments so the cell output shows the estimator's
# default hyper-parameters for the installed library version.
estimator_class()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# Fixed hyper-parameters, used when DO_GRID_SEARCH is False.
# The solver is pinned explicitly: the L1 penalty is only supported by some
# solvers, and relying on the library default breaks on scikit-learn releases
# whose default solver is 'lbfgs' (which rejects penalty='l1'). 'liblinear'
# is the default shown for the version this notebook was run with, so the
# results are unchanged there.
estimator_params = {
    'C': 0.1,
    'penalty': 'l1',
    'solver': 'liblinear'
}

# Search space, used when DO_GRID_SEARCH is True: C over the powers of ten
# from 10^-3 to 10^3 inclusive, with the same penalty/solver pair as above.
estimator_param_grid = {
    'C': [10 ** power for power in range(-3, 3 + 1)],
    'penalty': ['l1'],
    'solver': ['liblinear']
}

In [10]:
# Either path leaves behind `score` (mean CV score), a fitted `estimator`
# and the `estimator_params` that produced it.
if not DO_GRID_SEARCH:
    # Score the fixed configuration with the shared CV splitter.
    cv_scores = cross_val_score(
        estimator_class(**estimator_params), x_train, y_train,
        scoring=SCORING, cv=CV, n_jobs=N_JOBS, verbose=2,
    )
    score = cv_scores.mean()

    print('Score: {:.5} | Std: {:.5}'.format(score, cv_scores.std()))
    print(cv_scores)

    # cross_val_score only fits clones, so train a fresh model on all data.
    estimator = estimator_class(**estimator_params)
    estimator.fit(x_train, y_train)
else:
    # Exhaustive search over estimator_param_grid; refit=True retrains the
    # best candidate on the full training set.
    gs = GridSearchCV(
        estimator=estimator_class(), param_grid=estimator_param_grid,
        scoring=SCORING, cv=CV, n_jobs=N_JOBS, refit=True, verbose=2,
    )
    gs.fit(x_train, y_train)

    estimator_params = gs.best_params_
    estimator = gs.best_estimator_
    score = gs.best_score_

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.8s
[CV]  ................................................................
[CV] ................................................. , total=   0.8s
[CV] ................................................. , total=   1.1s
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.4s
[CV] ................................................. , total=   0.7s
[CV] ................................................. , total=   0.5s


[Parallel(n_jobs=3)]: Done   6 out of   6 | elapsed:    2.3s finished


Score: -0.22291 | Std: 0.0087605
[-0.21566406 -0.21609174 -0.21227547 -0.23273212 -0.23488423 -0.22583247]


### Printing results of training

In [11]:
# Report every grid-search candidate, highest mean test score first.
# `gs` only exists when the grid search ran, hence the guard.
if DO_GRID_SEARCH:
    print('Grid search results:')
    # argsort is ascending; reverse it to get a descending-score ordering.
    cv_results_indexes_sorted = np.argsort(gs.cv_results_['mean_test_score'])[::-1]
    mean_test_scores = gs.cv_results_['mean_test_score'][cv_results_indexes_sorted]
    std_test_scores = gs.cv_results_['std_test_score'][cv_results_indexes_sorted]
    # Distinct names for the array and the loop variable: the loop no longer
    # rebinds the sequence it is iterating over.
    sorted_candidate_params = np.array(gs.cv_results_['params'])[cv_results_indexes_sorted]

    for test_score, std, candidate_params in zip(mean_test_scores, std_test_scores,
                                                 sorted_candidate_params):
        print('- score={:.5}, std={:.5} | params={}'.format(test_score, std, candidate_params))

### Submitting results

In [12]:
# Submit the second column (index 1) of predict_proba for the test set.
class_probabilities = estimator.predict_proba(x_test)
predictions = class_probabilities[:, 1]

# Metadata describing how this run was configured and how it scored.
experiment_info = {
    'title': EXPERIMENT_TITLE,
    'do_grid_search': DO_GRID_SEARCH,
    'scoring': SCORING,
    'k_folds': str(CV),
    'score': score,
    'n_jobs': N_JOBS
}

# Full record handed to make_submission: run settings, data preprocessing
# options and the fitted estimator's hyper-parameters.
params = {
    'experiment': experiment_info,
    'data': data_params,
    'estimator': estimator.get_params()
}

make_submission(
    predictions, SUBMISSIONS_DIR, EXPERIMENT_TITLE,
    estimator, params, score,
    add_to_blending=True,
)