### Note, this notebook uses `nltk`, which I don't have in the environment, so you'll have to install it yourself:

e.g.;

- `pip install nltk`
- `python -m import nltk; nltk.download()`

# Write Your Own Transformers

Sometimes you want to create pipelines for things not provided by scikit-learn. For example, you may want to use your own text-processing pipeline, or embedding methods from a different library (i.e. `gensim`)

In [4]:
import re
import spacy

import pandas as pd

In [5]:
# combine all daasets
office_df = pd.read_csv('../data/dundermifflin.csv')
print('Office df shape:', office_df.shape)

overwatch_df = pd.read_csv('../data/overwatch.csv')
print('Overwatch df shape:', overwatch_df.shape)

Office df shape: (41467, 6)
Overwatch df shape: (47774, 6)


In [6]:
# combine data into single DataFrame
all_df = pd.concat([office_df, overwatch_df])
print(all_df.shape)
all_df.head()

(89241, 6)


Unnamed: 0.1,Unnamed: 0,title,id,subreddit,body,comment
0,0,Should I call you Jimothy?,ay2o5j,DunderMifflin,,I read somewhere that most people who think th...
1,1,Should I call you Jimothy?,ay2o5j,DunderMifflin,,I got Oscar Martinez... Michael am I gay?
2,2,Should I call you Jimothy?,ay2o5j,DunderMifflin,,That is correct.
3,3,Should I call you Jimothy?,ay2o5j,DunderMifflin,,Am I the only one who took slight pride in get...
4,4,Should I call you Jimothy?,ay2o5j,DunderMifflin,,You got: Creed Bratton\r\nYou're very mysterio...


In [7]:
from sklearn.model_selection import train_test_split

X = all_df['comment']
y = all_df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9)

In [8]:
# Windows
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) 

# Unix System (mac/ubuntu/etc.)
# nlp = spacy.load('en', disable=['ner', 'parser']) 

stop_words = nlp.Defaults.stop_words

## Define our own text processor transformer

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin


class TextProcessor(BaseEstimator, TransformerMixin):
    """
    Custom sklearn transformer to preprocess data.
    Data fit to should be raw document (not split by tokens).
    """

    def __init__(self, remove_stopwords=True, do_lemmatize=False, do_stem=False, stop_words=None, return_list=True):
        """
        Parameters
        ----------
        remove_stopwords: Boolean
            Whether or not to remove stopwords.
        do_lemmatize: Boolean
            Whether or not to lemmatize tokens.
        do_stem: Boolean
            Whether or not to stem tokens.
        stop_words: iterable
            List of stop words, each word of type str.
        return_list: Boolean
            Whether or not to return list or raw document.
            Some sklearn vectorizers prefer a raw document.
        Note - if neither `do_lemmatize` nor `do_stem` are specified, the transform
        method will preprocess using `fast_process`, which is much faster.
        """
        pass

   
    def clean_token(self, token):
        """
        """
        c_token = re.sub("[^A-Za-z']+", ' ', str(token))
        # lower-case and strip whitespace
        c_token = c_token.lower().strip()
        # remove stopwords
        if c_token in stop_words:
            return ''
        return c_token

    def preprocess(self, document):
        """
        Preprocessing method that stems and/or lemmatizes token
        during a single pass through data.
        """
        if not isinstance(document, str):
            return ['']
        clean_tokens = [
            self.clean_token(token) 
            for token in document.split()
        ]
        # remove empty strings
        cleaned_tokens = [com for com in clean_tokens 
                          if com != '']
        return cleaned_tokens

    def fit(self, X, y=None):
        """
        `fit` method required for inclusion in sklearn pipeline object.
        Returns data as is.
        """
        return self

    def transform(self, documents):
        """
        Preprocesses each document in `documents`.
        Parameters
        ----------
        documents: iterable
            List of documents. Each document raw, untokenized text.
        Returns
        -------
        iterable: list of lists. Each list contains the preprocessed
            tokens for a document.
        """
        document_list = []
        for document in documents:
            document_list.append(self.preprocess(document))
        return document_list

In [10]:
tp = TextProcessor()
tp.fit(X_train)
cleaned_data = tp.transform(X_train)

In [12]:
print(cleaned_data[1:5])

[['welcome', 'reddit'], ['holy', 'shit', "you're", 'right', 'thought', 'looked', 'familiar', 'edit', 'oscar', 'people', 'earth'], ['ironic'], ['wait', 'toby', 'oscar', 'hockey']]


## Custom sklearn object wrapping a gensim object

In [13]:
class GensimBOW(BaseEstimator, TransformerMixin):
    """
    Custom sklearn transformer to convert tokenized,
    preprocessed data to bag-of-words representation.
    """

    def __init__(self, use_sparse_representation=False):
        """
        Parameters
        ----------
        use_sparse_representation: Boolean (default=False)
            When True, a sparse representation of the array is returned.
                Use this when feeding into a gensim model.
            When False, the full array is returned.
                Use this if feeding into sklearn estimator.
        """
        self.id2word = None
        self.use_sparse_representation = use_sparse_representation

    def fit(self, documents, labels=None):
        """
        Creates map between words and their integer ids,
        storing it as `self.id2word`.
        Parameters
        ----------
        documents: iterable
            List of documents; each document a list of preprocessed tokens.
        labels:
            Optional list of same size as documents, specifying label for each document.
        """
        from gensim.corpora.dictionary import Dictionary
        self.id2word = Dictionary(documents)
        return self

    def transform(self, documents):
        """
        Converts a collection of words to its bag-of-words representation.
        Parameters
        ----------
        documents: iterable
            List of documents. Each document must be a list of tokens.
        Returns
        -------
            generator: yields vectorized representation of each document.
        """
        from gensim.matutils import sparse2full
        if self.id2word is None:
            raise AttributeError('Must have a fit id2word in order'
                                 ' to call transform.')

        def generator():
            """
            Closure to mutate return type depending on value of `use_sparse_representation`.
            """
            for document in documents:
                docbow = self.id2word.doc2bow(document)
                yield sparse2full(docbow, len(self.id2word))
        return list(generator())

In [14]:
gbow = GensimBOW()

In [15]:
gbow.fit(cleaned_data)

GensimBOW(use_sparse_representation=False)

In [16]:
vectorized_data = gbow.transform(cleaned_data)

In [17]:
vectorized_data[1:5]

[array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)]

## Feeding our user-defined classes into a sklearn pipeline

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

mod = Pipeline([
    ('preprocessor', TextProcessor()),
    ('vectorizer', GensimBOW()),
    ('classifier', SGDClassifier())
])

In [19]:
mod.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('preprocessor', TextProcessor(do_lemmatize=None, do_stem=None, remove_stopwords=None,
       return_list=None, stop_words=None)), ('vectorizer', GensimBOW(use_sparse_representation=False)), ('classifier', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=Fal...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [None]:
y_preds = mod.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_preds, y_test))