In [1]:
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer, Imputer
from copy import deepcopy

In [2]:
from sklearn.impute import SimpleImputer

In [3]:
import warnings
warnings.filterwarnings("ignore")

# Перебор гиперпараметров модели

In [4]:
df = pd.read_csv('https://gist.githubusercontent.com/braingineer/5d15057ac482ee0130b6d0e6f9cc9311/raw/d4eefaecc98b342ec578cf3512184556e8856750/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
from sklearn.compose import ColumnTransformer

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
# собираем пайплайн с помощью ColumnTransformer
name_to_tfidf = Pipeline([ ('name_vect', CountVectorizer()) , ('name_tfidf', TfidfTransformer()) ])
ticket_to_tfidf = Pipeline([ ('ticket_vect', CountVectorizer()) , ('ticket_tfidf', TfidfTransformer()) ])
age_filling = SimpleImputer(strategy="constant", fill_value=0)

full_mapper = ColumnTransformer(transformers=[
    ('Name', name_to_tfidf, 'Name'),
    ('Ticket', ticket_to_tfidf, 'Ticket'),
#     ('Sex', LabelBinarizer(), 'Sex'), # doesn't work ((
    ('Sex', OneHotEncoder(), ['Sex']),
    ('Age', age_filling, ['Age']),
    ('Fare', 'passthrough', ['Fare']),
])


In [28]:
full_mapper.fit(df, df["Survived"])

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('Name',
                                 Pipeline(memory=None,
                                          steps=[('name_vect',
                                                  CountVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.int64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=1.0,
                                                    

In [10]:
a = full_mapper.transform(df)

In [11]:
a

<891x2208 sparse matrix of type '<class 'numpy.float64'>'
	with 7103 stored elements in Compressed Sparse Row format>

In [12]:
# build full pipeline
full_pipeline  = Pipeline([
    ('mapper', full_mapper),
    ('clf', SGDClassifier(max_iter=15, warm_start=True))
])

    

In [13]:
full_params = {'clf__alpha': [1e-2,1e-3,1e-4],
               'clf__loss':['modified_huber','hinge'],
               'clf__penalty':['l2','l1'],
              }

In [14]:
gs_clf = GridSearchCV(full_pipeline, full_params, n_jobs=-1, cv=5)
gs_clf.fit(df, df['Survived'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('mapper',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('Name',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('name_vect',
                                                                                          CountVectorizer(analyzer='word',
                                                                                                          binary=False,
                    

In [15]:
print("Best score: %0.3f" % gs_clf.best_score_)
print("Best parameters set:")
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(full_params.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.772
Best parameters set:
	clf__alpha: 0.01
	clf__loss: 'hinge'
	clf__penalty: 'l1'


# Перебор гиперпараметров с помощью hyperopt

In [16]:
import numpy as np

In [17]:
def get_mapper(params):
    name_to_tfidf = Pipeline([('name_vect', CountVectorizer(analyzer=params["name_vect"])),
                              ('name_tfidf', TfidfTransformer()) ])
    ticket_to_tfidf = Pipeline([ ('ticket_vect', CountVectorizer()) , ('ticket_tfidf', TfidfTransformer()) ])
    age_filling = SimpleImputer(strategy="constant", fill_value=0)    
    return ColumnTransformer(transformers=[
        ('Name', name_to_tfidf, 'Name'),
        ('Ticket', ticket_to_tfidf, 'Ticket'),
        ('Sex', OneHotEncoder(), ['Sex']),
        ('Age', age_filling, ['Age']),
        ('Fare', 'passthrough', ['Fare']),
    ])


In [21]:
from sklearn.model_selection import cross_val_score

def objective(params):
    pipeline = Pipeline([
        ('mapper', get_mapper(params)),
        ('clf', SGDClassifier(max_iter=15, warm_start=True))
    ])
    
    metric =np.mean(cross_val_score(pipeline, df, df["Survived"], cv=5))

    return {'loss': metric, 'status': STATUS_OK }

In [22]:
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

HYPEROPT_ALGO = tpe.suggest  #  tpe.suggest OR hyperopt.rand.suggest
N_HYPEROPT_PROBES = 20
trials = Trials()

space = {
    'clf__alpha': hp.uniform('lambda_l1', 1e-4, 1e-2 ),
    'clf__loss':hp.choice('clf__loss', ['modified_huber','hinge']),
    'clf__penalty':hp.choice('clf__penalty', ['l2','l1']),
    # параметры пайплайна
    'name_vect':hp.choice('name_tfidf', ['char', 'char_wb']),    
}


In [23]:
best = hyperopt.fmin(fn=objective, space=space,
                     algo=HYPEROPT_ALGO,
                     max_evals=N_HYPEROPT_PROBES,
                     trials=trials,
                     verbose=1)

100%|██████████| 20/20 [00:03<00:00,  6.41it/s, best loss: 0.572462955535294] 


In [24]:
best

{'clf__loss': 1,
 'clf__penalty': 0,
 'lambda_l1': 0.002126035974806336,
 'name_tfidf': 0}

# Перебор гиперпараметров с помощью ParamGrid

In [29]:
from sklearn.model_selection import ParameterGrid
import json 
import operator

In [30]:
params_dict = ParameterGrid(
    {
        'clf': ParameterGrid({'alpha': [1e-2,1e-3,1e-4],}),
        'mapper': ParameterGrid({"name_vect": ParameterGrid({"analyzer":["char", "char_wb"]}),
                                 "ticket_vect": ParameterGrid({"analyzer":["char", "char_wb"]})
                                }),
    }
)

In [31]:
list(params_dict)[1]

{'clf': {'alpha': 0.01},
 'mapper': {'name_vect': {'analyzer': 'char'},
  'ticket_vect': {'analyzer': 'char_wb'}}}

In [32]:
def get_mapper(params):
    name_to_tfidf = Pipeline([('name_vect', CountVectorizer(**params["name_vect"])),
                              ('name_tfidf', TfidfTransformer()) ])
    ticket_to_tfidf = Pipeline([('ticket_vect', CountVectorizer(**params["ticket_vect"])),
                                ('ticket_tfidf', TfidfTransformer()) ])
    age_filling = SimpleImputer(strategy="constant", fill_value=0)    
    return ColumnTransformer(transformers=[
        ('Name', name_to_tfidf, 'Name'),
        ('Ticket', ticket_to_tfidf, 'Ticket'),
        ('Sex', OneHotEncoder(), ['Sex']),
        ('Age', age_filling, ['Age']),
        ('Fare', 'passthrough', ['Fare']),
    ])


In [37]:
def objective(params):
    pipeline = Pipeline([
        ('mapper', get_mapper(params["mapper"])),
        ('clf', SGDClassifier(max_iter=15, warm_start=True, **params["clf"]))
    ])
    
    metric =np.mean(cross_val_score(pipeline, df, df["Survived"], cv=5))

    return metric

In [38]:
result = {}
for params in params_dict:
    scores = objective(params)
    result[json.dumps(params)] = np.mean(scores)
#     print(params)

In [39]:
print(max(result.items(), key=operator.itemgetter(1))[0])


{"clf": {"alpha": 0.01}, "mapper": {"name_vect": {"analyzer": "char_wb"}, "ticket_vect": {"analyzer": "char_wb"}}}


### Сохраняем обученный пайплайн для сервиса

In [40]:
import pickle

In [41]:
best_params = json.loads(max(result.items(), key=operator.itemgetter(1))[0])

In [42]:
best_params["mapper"]

{'name_vect': {'analyzer': 'char_wb'}, 'ticket_vect': {'analyzer': 'char_wb'}}

In [43]:
best_mapper = get_mapper(best_params["mapper"])
best_mapper.fit(df)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('Name',
                                 Pipeline(memory=None,
                                          steps=[('name_vect',
                                                  CountVectorizer(analyzer='char_wb',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.int64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=1.0,
                                                 

In [45]:
best_model = SGDClassifier(max_iter=15, warm_start=True, **best_params["clf"])
best_model.fit(best_mapper.transform(df), df["Survived"])

SGDClassifier(alpha=0.01, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=15,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=True)

In [47]:
with open('artifacts/mapper_sklearn.pkl', 'wb') as handle:
    pickle.dump(best_mapper, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('artifacts/model_sklearn.pkl', 'wb') as handle:
    pickle.dump(best_model, handle, protocol=pickle.HIGHEST_PROTOCOL)