In [1]:
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer, Imputer
from copy import deepcopy

In [4]:
from sklearn.impute import SimpleImputer

In [91]:
import warnings
warnings.filterwarnings("ignore")

# Перебор гиперпараметров модели

In [2]:
df = pd.read_csv('https://gist.githubusercontent.com/braingineer/5d15057ac482ee0130b6d0e6f9cc9311/raw/d4eefaecc98b342ec578cf3512184556e8856750/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# собираем пайплайн с помощью sklearn_pandas
name_to_tfidf = Pipeline([ ('name_vect', CountVectorizer()) , ('name_tfidf', TfidfTransformer()) ])
ticket_to_tfidf = Pipeline([ ('ticket_vect', CountVectorizer()) , ('ticket_tfidf', TfidfTransformer()) ])
age_filling = SimpleImputer(strategy="constant", fill_value=0)

full_mapper = DataFrameMapper([
    ('Name', name_to_tfidf ),
    ('Ticket', ticket_to_tfidf ),
    ('Sex', LabelBinarizer()),
    (['Age'], age_filling),
    ('Fare', None),
    ])


In [42]:
# build full pipeline
full_pipeline  = Pipeline([
    ('mapper', full_mapper),
    ('clf', SGDClassifier(max_iter=15, warm_start=True))
])

    

In [43]:
full_params = {'clf__alpha': [1e-2,1e-3,1e-4],
               'clf__loss':['modified_huber','hinge'],
               'clf__penalty':['l2','l1'],
              }

In [44]:
gs_clf = GridSearchCV(full_pipeline, full_params, n_jobs=-1, cv=5)
gs_clf.fit(df, df['Survived'])



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('Name', Pipeline(memory=None,
     steps=[('name_vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=Tr...om_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=True))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'clf__alpha': [0.01, 0.001, 0.0001], 'clf__loss': ['modified_huber', 'hinge'], 'clf__penalty': ['l2', 'l1']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
print("Best score: %0.3f" % gs_clf.best_score_)
print("Best parameters set:")
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(full_params.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.742
Best parameters set:
	clf__alpha: 0.01
	clf__loss: 'hinge'
	clf__penalty: 'l1'


# Перебор гиперпараметров с помощью hyperopt

In [36]:
import numpy as np

In [37]:
name_to_tfidf = Pipeline([ ('name_vect', CountVectorizer()) , ('name_tfidf', TfidfTransformer()) ])
ticket_to_tfidf = Pipeline([ ('ticket_vect', CountVectorizer()) , ('ticket_tfidf', TfidfTransformer()) ])
age_filling = SimpleImputer(strategy="constant", fill_value=0)

def get_mapper(params):
    return DataFrameMapper([
        ('Name', [CountVectorizer(analyzer=params["name_vect"]), TfidfTransformer()]),
        ('Ticket', ticket_to_tfidf ),
        ('Sex', LabelBinarizer()),
        (['Age'], age_filling),
        ('Fare', None),
    ])

In [38]:
from sklearn.model_selection import cross_val_score

def objective(params):
    pipeline = Pipeline([
        ('mapper', get_mapper(params)),
        ('clf', SGDClassifier(n_iter=15, warm_start=True))
    ])
    
    metric =np.mean(cross_val_score(pipeline, df, df["Survived"], cv=5))

    return {'loss': metric, 'status': STATUS_OK }

In [39]:
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

HYPEROPT_ALGO = tpe.suggest  #  tpe.suggest OR hyperopt.rand.suggest
N_HYPEROPT_PROBES = 20
trials = Trials()

space = {
    'clf__alpha': hp.uniform('lambda_l1', 1e-4, 1e-2 ),
    'clf__loss':hp.choice('clf__loss', ['modified_huber','hinge']),
    'clf__penalty':hp.choice('clf__penalty', ['l2','l1']),
    # параметры пайплайна
    'name_vect':hp.choice('name_tfidf', ['char', 'char_wb']),    
}


In [40]:
best = hyperopt.fmin(fn=objective, space=space,
                     algo=HYPEROPT_ALGO,
                     max_evals=N_HYPEROPT_PROBES,
                     trials=trials,
                     verbose=1)

  5%|▌         | 1/20 [00:00<00:03,  5.00it/s, best loss: 0.6835504596623787]








 10%|█         | 2/20 [00:00<00:03,  4.99it/s, best loss: 0.6455813151844448]









 15%|█▌        | 3/20 [00:00<00:03,  5.23it/s, best loss: 0.6186517634133358]









 20%|██        | 4/20 [00:00<00:03,  5.15it/s, best loss: 0.6186517634133358]








 25%|██▌       | 5/20 [00:00<00:02,  5.09it/s, best loss: 0.6186517634133358]









 30%|███       | 6/20 [00:01<00:02,  5.30it/s, best loss: 0.6186517634133358]









 35%|███▌      | 7/20 [00:01<00:02,  5.36it/s, best loss: 0.6186517634133358]








 40%|████      | 8/20 [00:01<00:02,  5.22it/s, best loss: 0.6186517634133358]








 45%|████▌     | 9/20 [00:01<00:02,  5.12it/s, best loss: 0.6186517634133358]








 55%|█████▌    | 11/20 [00:02<00:01,  5.26it/s, best loss: 0.602612715073869]









 60%|██████    | 12/20 [00:02<00:01,  5.17it/s, best loss: 0.602612715073869]








 65%|██████▌   | 13/20 [00:02<00:01,  5.10it/s, best loss: 0.602612715073869]








 70%|███████   | 14/20 [00:02<00:01,  5.06it/s, best loss: 0.602612715073869]








 75%|███████▌  | 15/20 [00:02<00:01,  4.84it/s, best loss: 0.602612715073869]









 80%|████████  | 16/20 [00:03<00:00,  4.88it/s, best loss: 0.602612715073869]









 85%|████████▌ | 17/20 [00:03<00:00,  4.91it/s, best loss: 0.602612715073869]









 90%|█████████ | 18/20 [00:03<00:00,  5.16it/s, best loss: 0.602612715073869]









100%|██████████| 20/20 [00:03<00:00,  5.47it/s, best loss: 0.602612715073869]










In [41]:
best

{'clf__loss': 0,
 'clf__penalty': 0,
 'lambda_l1': 0.004761311163494659,
 'name_tfidf': 1}

# Перебор гиперпараметров с помощью ParamGrid

In [3]:
from sklearn.model_selection import ParameterGrid
import json 
import operator

In [4]:
params_dict = ParameterGrid(
    {
        'clf': ParameterGrid({'alpha': [1e-2,1e-3,1e-4],}),
        'mapper': ParameterGrid({"name_vect": ParameterGrid({"analyzer":["char", "char_wb"]}),
                                 "ticket_vect": ParameterGrid({"analyzer":["char", "char_wb"]})
                                }),
    }
)

In [5]:
list(params_dict)[1]

{'clf': {'alpha': 0.01},
 'mapper': {'name_vect': {'analyzer': 'char'},
  'ticket_vect': {'analyzer': 'char_wb'}}}

In [87]:
def get_mapper(params):
    return DataFrameMapper([
        ('Name', [CountVectorizer(**params["name_vect"]), TfidfTransformer()]),
        ('Ticket', [CountVectorizer(**params["ticket_vect"]), TfidfTransformer()]),
        ('Sex', LabelBinarizer()),
        (['Age'], SimpleImputer(strategy="constant", fill_value=0)),
        ('Fare', None),
    ])

In [93]:
def objective(params):
    pipeline = Pipeline([
        ('mapper', get_mapper(params["mapper"])),
        ('clf', SGDClassifier(n_iter=15, warm_start=True, **params["clf"]))
    ])
    
    metric =np.mean(cross_val_score(pipeline, df, df["Survived"], cv=5))

    return metric

In [99]:
result = {}
for params in params_dict:
    scores = objective(params)
    result[json.dumps(params)] = np.mean(scores)
#     print(params)

In [103]:
print(max(result.items(), key=operator.itemgetter(1))[0])


{"clf": {"alpha": 0.001, "loss": "hinge"}, "mapper": {"name_vect": {"analyzer": "char"}, "ticket_vect": {"analyzer": "char_wb"}}}


### Сохраняем обученный пайплайн для сервиса

In [112]:
import pickle

In [107]:
best_params = json.loads(max(result.items(), key=operator.itemgetter(1))[0])

In [109]:
best_params["mapper"]

{'name_vect': {'analyzer': 'char'}, 'ticket_vect': {'analyzer': 'char_wb'}}

In [111]:
best_mapper = get_mapper(best_params["mapper"])
best_mapper.fit(df)

DataFrameMapper(default=False, df_out=False,
        features=[('Name', [CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        s...opy=True, fill_value=0, missing_values=nan,
       strategy='constant', verbose=0)), ('Fare', None)],
        input_df=False, sparse=False)

In [115]:
best_model = SGDClassifier(n_iter=15, warm_start=True, **best_params["clf"])
best_model.fit(best_mapper.transform(df), df["Survived"])

SGDClassifier(alpha=0.001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=15, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=True)

In [122]:
with open('artifacts/mapper.pkl', 'wb') as handle:
    pickle.dump(best_mapper, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('artifacts/model.pkl', 'wb') as handle:
    pickle.dump(best_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Время на процессинг

In [146]:
d = [df.loc[1].to_dict()]

In [149]:
%%timeit
pd.DataFrame(d)

1.06 ms ± 1.32 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [148]:
%%timeit
a = best_mapper.transform(pd.DataFrame(d))

3.59 ms ± 13.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
