In [2]:
import os
import pickle as pkl

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion

from transformers import *
from corpus_helper import CorpusStreamer

# One-time project setup. `init_config` is not defined or explicitly imported in this
# notebook -- presumably it arrives via `from transformers import *` (TODO confirm).
# NOTE(review): the recorded cell output shows an nltk "punkt" download check, which may
# be triggered here or by the imports above -- verify.
init_config()



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
# Stages of the text feature-extraction pipeline, run in the order listed.
# TextCleaner, NamedEntityMasker and TextFeatureExtractor are project transformers that
# are not defined in this notebook (presumably star-imported from `transformers`).
_extraction_steps = [
    ("TextCleaner", TextCleaner()),
    # Only the "PERSON" entity label is passed; the mask token is "person".
    ("NamedEntityMasker", NamedEntityMasker(["PERSON"], maskwith="person")),
    ("TextFeatureExtractor", TextFeatureExtractor()),
]

feature_extraction = Pipeline(_extraction_steps)


# Encoding pipeline: a single FeatureUnion step that concatenates two branches.
#   * "_W2V"           -- ColumnSelector([0]) followed by TextVectorizer
#   * "_OtherFeatures" -- ColumnSelector([0], inverse=True), then a nested union of
#                         one-hot encoded discrete features and min-max scaled
#                         numeric features.
# ColumnSelector, TypeSelector and TextVectorizer are project transformers not defined
# in this notebook; presumably ColumnSelector([0]) picks the first (word) column and
# `inverse=True` picks every other column -- TODO confirm against their definitions.
feature_encoding = Pipeline([("_union", FeatureUnion([
        ("_W2V", Pipeline([
            ("_ColumnSelector_Words", ColumnSelector([0])),
            ("TextVectorizer", TextVectorizer())
        ])),
        ("_OtherFeatures", Pipeline([
            ("_ColumnSelector_!Words", ColumnSelector([0], inverse=True)),
            ("_union", FeatureUnion([
                ("_discrete_features", Pipeline([
                    ("TypeSelector_Discrete", TypeSelector(["object", "category"])),
                    # Step is named "LabelBinarizer" but the estimator is a OneHotEncoder;
                    # the name is kept so existing references to the step still resolve.
                    ("LabelBinarizer", OneHotEncoder())
                ])),
                ("_continous_features", Pipeline([
                    ("TypeSelector_Continuous", TypeSelector(["number"])),
                    # feature_range is documented as a tuple; scikit-learn releases with
                    # parameter validation reject a list here, while a tuple works on
                    # old and new releases alike.
                    ("MinMaxScaler", MinMaxScaler(feature_range=(-1, 1)))
                ]))
            ]))
        ])),
    ]))
])

In [None]:
# Small three-sentence sample for quick manual checks; the last sentence contains a
# person name ("Sarah").
teststring = (
    "This is a test with three sentences. "
    "Indices should be reset here. "
    "Sarah is not home."
)

In [None]:
# Open the Dickens corpus directory through the project's CorpusStreamer helper
# (imported from corpus_helper) and materialise it via toString().
# `s2` is later sliced and passed to len(), so it is presumably one large string --
# TODO confirm against CorpusStreamer.toString.
dickensdata = CorpusStreamer("../DataAcquisition/data/dickens")
s2 = dickensdata.toString()

In [71]:
def apply_pipeline(X, pipeline, chunksize):
    """Run ``pipeline.fit_transform`` over ``X`` in consecutive slices and stack the results.

    ``X`` is cut into slices ``X[start:start + chunksize]``; each slice is passed to
    ``pipeline.fit_transform`` on its own (so the pipeline is re-fitted per slice), and
    the per-slice results are concatenated row-wise with a fresh 0..n-1 index.

    Parameters
    ----------
    X : sliceable sequence supporting ``len()`` (e.g. a string)
        The full input to process.
    pipeline : object with a ``fit_transform(part)`` method
        Must return something ``pandas.concat`` accepts (e.g. a DataFrame).
    chunksize : int
        Number of elements of ``X`` per slice. Must be positive.

    Returns
    -------
    The row-wise concatenation of all per-slice results (``ignore_index=True``).

    Raises
    ------
    ValueError
        If ``chunksize`` is not positive.
    """
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer, got {!r}".format(chunksize))

    # Ceiling division gives the exact number of slices; an empty X still gets one
    # (empty) slice so the pipeline decides how to handle empty input.
    n_chunks = max(1, -(-len(X) // chunksize))

    # Collect the pieces and concatenate once: concatenating inside the loop re-copies
    # all previously accumulated rows on every iteration.
    pieces = []
    for chunk_no in range(n_chunks):
        print("Iteration {} / {}".format(chunk_no, n_chunks))
        start = chunk_no * chunksize
        pieces.append(pipeline.fit_transform(X[start:start + chunksize]))

    return pd.concat(pieces, ignore_index=True)

In [68]:
# Sanity check: length of the first 10**6-element slice of the corpus.
len(s2[:10**6])

1000000

In [None]:
%%time
# Run the feature-extraction pipeline over the whole corpus in slices of 10**6
# elements; apply_pipeline prints one "Iteration i / n" line per slice.
X = apply_pipeline(s2, feature_extraction, 10**6)

Iteration 0 / 34
Iteration 1 / 34
Iteration 2 / 34
Iteration 3 / 34
Iteration 4 / 34
Iteration 5 / 34
Iteration 6 / 34


In [63]:
# Fit the encoding pipeline on the extracted features and persist the encoded result.
# `pkl` is the standard-library pickle module (imported at the top of the notebook).
# NOTE(review): in the visible cells X is built from the "dickens" corpus, yet the
# output is written under data/wells/ -- confirm this is the intended target file.
encoded = feature_encoding.fit_transform(X)
with open("../DataAcquisition/data/wells/wells_enc.pkl", "wb") as f:
    # Explicit highest protocol: more compact, and avoids the size limits that older
    # default protocols impose on very large objects.
    pkl.dump(encoded, f, protocol=pkl.HIGHEST_PROTOCOL)