# Grid-searching FeatureUnion transformer weights

In [48]:
from pipelines.alcohol import AlcoholPipeline

from data.dao import DataAccess, LabelGetter

from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV, ParameterGrid

In [68]:
# Load the dataset through the project's data-access layer (presumably a
# pandas DataFrame, judging by the method name — TODO confirm).
XX = DataAccess.get_as_dataframe()
# LabelGetter wraps that data; its get_* methods are called further down to
# obtain one (X, y) pair per labelling task.
LL = LabelGetter(XX)

Create the pipeline and fix the TF-IDF settings of its text branch.

In [110]:
# Build the project pipeline with "hour" and "dayofweek" time features,
# wrapping a logistic-regression classifier (C=50).
initialized_pipeline = AlcoholPipeline(time_features=["hour", "dayofweek"])
pipeline = initialized_pipeline.pipeline(LogisticRegression(C=50))

# Pin the TF-IDF settings of the text branch; the call is the cell's last
# expression, so the configured pipeline is displayed as the cell output.
pipeline.set_params(
    features__text__tfidf__analyzer='char',
    features__text__tfidf__lowercase=True,
    features__text__tfidf__max_features=93000,
    features__text__tfidf__ngram_range=(1, 5),
    features__text__tfidf__norm='l2',
)

Pipeline(steps=[('exploder', ExplodingRecordJoiner(user=['created_at', 'favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'verified'])), ('features', FeatureUnion(n_jobs=1,
       transformer_list=[('time', Pipeline(steps=[('getter', ItemGetter(key='created_at')), ('to_datetimeindex', ...',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

Generate the parameter grid of candidate transformer weights.

In [115]:
# Every combination of per-branch weights (3 * 2 * 3 * 3 = 54 candidates).
# The text branch has no 0 option, so it is never switched off.
weights = list(ParameterGrid({
    "time": [0, 1, 3],
    "text": [1, 3],
    "age": [0, 1, 3],
    "user": [0, 1, 3],
}))

In [116]:
# Each candidate weight dict is tried as the FeatureUnion's transformer_weights.
param_grid = dict(features__transformer_weights=weights)

In [121]:
# Collects (task name, fitted GridSearchCV) pairs, in the order the searches run.
clfs = list()

### Alcohol

In [122]:
# Grid-search the transformer weights on the alcohol labels, scored by F1.
X, y = LL.get_alcohol()
clfs.append((
    "alcohol",
    GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="f1",
        n_jobs=4,
        verbose=1,
    ).fit(X, y),
))

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 162 out of 162 | elapsed:  6.2min finished


### First Person

In [123]:
# Grid-search the transformer weights on the first-person labels, scored by F1.
X, y = LL.get_first_person()
clfs.append((
    "first_person",
    GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="f1",
        n_jobs=4,
        verbose=1,
    ).fit(X, y),
))

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   24.2s
[Parallel(n_jobs=4)]: Done 162 out of 162 | elapsed:  1.6min finished


### First Person Level

In [124]:
# Grid-search the transformer weights on the first-person-level labels.
# This task is scored with "f1_weighted" rather than plain "f1".
X, y = LL.get_first_person_label()
clfs.append((
    "first_person_level",
    GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="f1_weighted",
        n_jobs=4,
        verbose=1,
    ).fit(X, y),
))

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   21.0s
[Parallel(n_jobs=4)]: Done 162 out of 162 | elapsed:  1.4min finished


In [158]:
# Alcohol search: candidates ordered by mean validation score, lowest first.
sorted(clfs[0][1].grid_scores_, key=lambda candidate: candidate[1])

[mean: 0.81504, std: 0.01546, params: {'features__transformer_weights': {'time': 1, 'user': 3, 'age': 0, 'text': 3}},
 mean: 0.81513, std: 0.01618, params: {'features__transformer_weights': {'time': 3, 'user': 1, 'age': 0, 'text': 3}},
 mean: 0.81516, std: 0.01655, params: {'features__transformer_weights': {'time': 3, 'user': 3, 'age': 0, 'text': 3}},
 mean: 0.81520, std: 0.01448, params: {'features__transformer_weights': {'time': 1, 'user': 1, 'age': 0, 'text': 3}},
 mean: 0.81529, std: 0.01605, params: {'features__transformer_weights': {'time': 1, 'user': 3, 'age': 3, 'text': 3}},
 mean: 0.81542, std: 0.01586, params: {'features__transformer_weights': {'time': 1, 'user': 1, 'age': 3, 'text': 3}},
 mean: 0.81551, std: 0.01500, params: {'features__transformer_weights': {'time': 1, 'user': 1, 'age': 1, 'text': 3}},
 mean: 0.81551, std: 0.01621, params: {'features__transformer_weights': {'time': 3, 'user': 1, 'age': 3, 'text': 3}},
 mean: 0.81563, std: 0.01536, params: {'features__transf

In [157]:
# First-person search: candidates ordered by mean validation score, lowest first.
sorted(clfs[1][1].grid_scores_, key=lambda candidate: candidate[1])

[mean: 0.65389, std: 0.02195, params: {'features__transformer_weights': {'time': 3, 'user': 3, 'age': 0, 'text': 1}},
 mean: 0.65426, std: 0.02088, params: {'features__transformer_weights': {'time': 3, 'user': 3, 'age': 3, 'text': 3}},
 mean: 0.65431, std: 0.02168, params: {'features__transformer_weights': {'time': 3, 'user': 1, 'age': 0, 'text': 1}},
 mean: 0.65437, std: 0.02162, params: {'features__transformer_weights': {'time': 3, 'user': 1, 'age': 0, 'text': 3}},
 mean: 0.65475, std: 0.02350, params: {'features__transformer_weights': {'time': 1, 'user': 3, 'age': 0, 'text': 1}},
 mean: 0.65613, std: 0.02380, params: {'features__transformer_weights': {'time': 1, 'user': 1, 'age': 0, 'text': 1}},
 mean: 0.65664, std: 0.02242, params: {'features__transformer_weights': {'time': 3, 'user': 3, 'age': 1, 'text': 3}},
 mean: 0.65715, std: 0.02023, params: {'features__transformer_weights': {'time': 1, 'user': 3, 'age': 3, 'text': 1}},
 mean: 0.65718, std: 0.02186, params: {'features__transf

In [156]:
# First-person-level search: candidates ordered by mean validation score, lowest first.
sorted(clfs[2][1].grid_scores_, key=lambda candidate: candidate[1])

[mean: 0.39950, std: 0.04315, params: {'features__transformer_weights': {'time': 1, 'user': 3, 'age': 0, 'text': 1}},
 mean: 0.39970, std: 0.04349, params: {'features__transformer_weights': {'time': 3, 'user': 3, 'age': 0, 'text': 1}},
 mean: 0.40129, std: 0.02910, params: {'features__transformer_weights': {'time': 3, 'user': 1, 'age': 1, 'text': 1}},
 mean: 0.40132, std: 0.02914, params: {'features__transformer_weights': {'time': 3, 'user': 1, 'age': 3, 'text': 1}},
 mean: 0.40258, std: 0.03143, params: {'features__transformer_weights': {'time': 3, 'user': 3, 'age': 1, 'text': 1}},
 mean: 0.40390, std: 0.03071, params: {'features__transformer_weights': {'time': 3, 'user': 3, 'age': 3, 'text': 1}},
 mean: 0.40459, std: 0.03272, params: {'features__transformer_weights': {'time': 3, 'user': 1, 'age': 0, 'text': 1}},
 mean: 0.40667, std: 0.03932, params: {'features__transformer_weights': {'time': 1, 'user': 3, 'age': 3, 'text': 1}},
 mean: 0.40864, std: 0.03596, params: {'features__transf