In [1]:
# data manipulation
import utils
import pandas as pd
import numpy as np

# modeling
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Load the persisted training features and labels in a single assignment.
X_train, y_train = utils.load('X_train'), utils.load('y_train')

In [3]:
def _stateless(func, **kw_args):
    """Wrap ``func`` in a FunctionTransformer with input validation switched off.

    Any keyword arguments are forwarded to ``func`` through ``kw_args``;
    when none are given ``kw_args`` stays at its default of ``None``.
    """
    return FunctionTransformer(func, kw_args=kw_args or None, validate=False)


def _pair_and_single_features():
    """Build a FeatureUnion over the pair-question and single-question pipes."""
    return FeatureUnion([
        ('pair', pair_question_pipe),
        ('single', single_question_pipe),
    ])


# --- text transformation pipes ---------------------------------------------
clean_text = Pipeline(steps=[
    ('stack', _stateless(utils.stack_questions)),
    ('clean', _stateless(utils.clean_questions)),
])

lemma_text = Pipeline(steps=[
    ('lemma', _stateless(utils.apply_lemma)),
])

# --- feature engineering pipes ---------------------------------------------
single_question_pipe = Pipeline(steps=[
    ('dist', _stateless(utils.add_min_max_avg_distance_features)),
    ('unstack', _stateless(utils.unstack_questions)),
])

pair_question_pipe = Pipeline(steps=[
    ('ngram_sim', _stateless(utils.calc_ngram_similarity, n_grams=[1, 2, 3])),
])

# Features computed on the cleaned text only.
clean_text_features = Pipeline(steps=[
    ('clean', clean_text),
    ('feats', _pair_and_single_features()),
])

# Features computed on the cleaned and lemmatized text.
lemma_text_features = Pipeline(steps=[
    ('clean', clean_text),
    ('lemma', lemma_text),
    ('feats', _pair_and_single_features()),
])

# Pre-processing pipe: both feature sets side by side.
feature_transformation = Pipeline(steps=[
    ('feats', FeatureUnion([
        ('clean_text_features', clean_text_features),
        ('lemma_text_features', lemma_text_features),
    ])),
])


In [5]:
%%time
# Run the 'stack' step of clean_text by hand so it can be timed on its own.
X_temp = utils.stack_questions(X_train)

CPU times: user 1.72 s, sys: 80 ms, total: 1.8 s
Wall time: 481 ms


In [6]:
%%time
# Time the 'clean' step of clean_text on the stacked output.
# NOTE(review): X_temp is overwritten with its own transform, so re-running only
# this cell passes already-cleaned data back through clean_questions.
X_temp = utils.clean_questions(X_temp)

CPU times: user 7.98 s, sys: 48 ms, total: 8.03 s
Wall time: 8.03 s


In [7]:
%%time
# Time the 'ngram_sim' step (same n_grams as pair_question_pipe) on the stacked, cleaned data.
X_temp_1 = utils.calc_ngram_similarity(X_temp, n_grams=[1,2,3])

CPU times: user 13min 58s, sys: 1min 43s, total: 15min 42s
Wall time: 5min 25s


In [8]:
%%time
# Time the 'dist' step of single_question_pipe on the same stacked, cleaned data.
X_temp_2 = utils.add_min_max_avg_distance_features(X_temp)

CPU times: user 22min 22s, sys: 1min 1s, total: 23min 24s
Wall time: 8min 35s


In [9]:
# Stack then clean X_train in one expression (the same two calls used to build X_temp).
X_clean_stack = utils.clean_questions(utils.stack_questions(X_train))

In [23]:
%%time
# Push every stacked, cleaned question through utils.nlp (parser and NER disabled)
# and collect the resulting documents in order.
master_doc = list(
    utils.nlp.pipe(X_clean_stack, disable=['parser', 'ner'], batch_size=100000)
)

CPU times: user 6min 54s, sys: 48.7 s, total: 7min 43s
Wall time: 4min 22s


In [15]:
# Persist the processed documents under the name 'master_doc'
# (presumably so the multi-minute nlp.pipe pass need not be repeated — confirm).
utils.save(master_doc, 'master_doc')

In [12]:
# Read the saved documents back under a separate name, leaving master_doc untouched.
master_doc_2 = utils.load('master_doc')

In [13]:
# Spot-check the first reloaded entry.
master_doc_2[0]

what is the step by step guide to invest in share market in india

In [None]:
import inspect

# Print the source of the distance-feature helper that this notebook times and profiles.
# The cell previously passed the undefined placeholder `foo`, which raises NameError
# on a fresh kernel; swap in a different function here if another one is of interest.
lines = inspect.getsource(utils.add_min_max_avg_distance_features)
print(lines)

In [None]:
# Scratch note of the call count 1819194 (the value of len(X_temp)*3).
# Written with underscore separators: `1,819,194` parses as the tuple (1, 819, 194).
1_819_194

In [21]:
# Three times the number of entries in X_temp (recorded result: 1819194).
3 * len(X_temp)

1819194

In [18]:
import cProfile
import pstats

# Profile one full run of the distance-feature step and dump the raw stats to 'restats'.
cProfile.run('utils.add_min_max_avg_distance_features(X_temp)', filename='restats')

# Reload the dump, order it by cumulative time and print the top 30 entries.
# sort_stats() sorts in place and print_stats() returns the Stats object,
# so the cell still displays `p` as its result.
p = pstats.Stats('restats')
p.sort_stats('cumulative')
p.print_stats(30)

Fri Nov 30 22:21:27 2018    restats

         171431340 function calls (171410702 primitive calls) in 754.282 seconds

   Ordered by: cumulative time
   List reduced from 197 to 30 due to restriction <30>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  754.282  754.282 {built-in method builtins.exec}
        1    0.568    0.568  754.282  754.282 <string>:1(<module>)
        1    8.136    8.136  753.714  753.714 /home/ubuntu/question_intent_classification/py_files/utils.py:321(add_min_max_avg_distance_features)
  1819194   14.654    0.000  300.451    0.000 /home/ubuntu/question_intent_classification/py_files/utils.py:302(calc_min_max_avg_distance)
   606399    0.679    0.000  278.963    0.000 /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/spacy/language.py:521(pipe)
      608    2.222    0.004  278.283    0.458 pipeline.pyx:430(pipe)
      607    2.220    0.004  219.754    0.362 pipeline.pyx:437(predict)
11533

<pstats.Stats at 0x7f8a6b4fe8d0>

In [24]:
from skopt import BayesSearchCV

In [None]:
# NOTE(review): unexecuted stub. skopt's BayesSearchCV expects an estimator and
# search_spaces, so this bare call should fail until they are supplied — confirm and fill in.
BayesSearchCV()

In [25]:
# Reload the training features and labels from disk before splitting.
X_train, y_train = (utils.load(name) for name in ('X_train', 'y_train'))

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out a stratified 33% of the training data for validation (fixed seed for repeatability).
X_t, X_v, y_t, y_v = train_test_split(
    X_train,
    y_train,
    test_size=0.33,
    stratify=y_train,
    random_state=42,
)