# Tutorial
* https://www.youtube.com/watch?v=SG6jdlBx_vQ
    
* https://github.com/ZWMiller/nlp_pipe_manager/tree/master/nlp_pipeline_manager

* https://github.com/ZWMiller/nlp_pipe_manager/blob/master/nlp_pipeline_manager/pipeline_demo.ipynb

# Building a class to manage our NLP pipelines

Because it's such a pain to manage all the permutations of NLP cleaners/tokenizers/vectorizers/stemmers/etc, we're going to build a class that takes all of those pieces in and manages the pipelines for us.

In [145]:
s = 'resulting result results resulted resulting run UPPER CASE @you running ran No #results  😺 😺 😺@results FOUND. View all teams. MAD Prod Fundraistrick. 350 10th Ave, Suite 1100. San Diego, CA 92101 US. Back to top. Donor Support braistrick@stayclassy.org'
s1 = 'run bunda No results results found. View all teams. Prod Fundraistrick. 350 10th Ave, Suite 1100. San Diego, CA 92101 US. Back to top. Donor Support braistrick@stayclassy.org. http://localhost:8888/notebooks/nlp/cleaning_sandbox.ipynb https://www.w3schools.com/python/python_regex.asp'
s2 = 'https://www.w3schools.com/python/python_regex.asp No results results found. View all teams. Prod Fundraistrick. 350 10th Ave, Suite 1100. San Diego, CA 92101 US. Back to top. Donor Support braistrick@stayclassy.org. http://localhost:8888/notebooks/nlp/cleaning_sandbox.ipynb'
s3 = 'No results results found. View all teams. Prod Fundraistrick. 350 10th Ave, Suite 1100. San Diego, CA 92101 US. Back to top. Donor Support braistrick@stayclassy.org. http://localhost:8888/notebooks/nlp/cleaning_sandbox.ipynb'
s4 = 'bunda results results found view teams prod fundraistrick ave suite san diego back top donor support'
list_of_strings = [s, s1, s2, s3, s4]

import re
import string

dict_regex = {
    'hashtags': r'#(\w+)',
    # returns not only mentions, but
    # part of the email after the @
    'mentions': r'@(\w+)',
    'emails': r'',
    'links': r'https?:\/\/.*[\r\n]*',
    'remove_RT': '^RT[\s]+',
    'numbers': r'\d+',
    'symbols': r'',
    'punctionation2': '[^\w\s]',
    'punctionation': '[%s]' % re.escape(string.punctuation),
    'periods': '\.',
    'exclamation points': '\!',
    'question marks': '\?',
    'upper case words': '[A-Z][A-Z\d]+',
    # https://stackoverflow.com/questions/39536390/match-unicode-emoji-in-python-regex
    'emojis': '\d+(.*?)[\u263a-\U0001f645]',
    'emojis_work': "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF']",
    'upper case': '[A-Z][A-Z\d]+'
}

regex_emojis = "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF']"
list_of_regex_values = list(dict_regex.values())
list_of_regex_keys = list(dict_regex.keys())

sw = ['😺', '😺 😺', '😺 😺 😺', 'prod', 'suite', ' ']

# Class

In [1]:
import numpy as np
import sklearn
import matplotlib
import pandas as pd
import sklearn
import sys
import nltk

libraries = (('Matplotlib', matplotlib), ('Numpy', np), ('Pandas', pd), ('NLTK', nltk), ('sklearn',sklearn))

print("Python Version:", sys.version, '\n')
for lib in libraries:
    print('{0} Version: {1}'.format(lib[0], lib[1].__version__))

Python Version: 3.7.6 (default, Jan  7 2020, 16:28:00) 
[Clang 11.0.0 (clang-1100.0.33.8)] 

Matplotlib Version: 3.2.0
Numpy Version: 1.18.1
Pandas Version: 0.25.3
NLTK Version: 3.5
sklearn Version: 0.22.2.post1


In [2]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import scipy
%matplotlib inline
plt.style.use('seaborn')

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

class nlp_preprocessor:
   
    def __init__(self, vectorizer=CountVectorizer(), tokenizer=None, cleaning_function=None, 
                 stemmer=None, model=None):
        """
        A class for pipelining our data in NLP problems. The user provides a series of 
        tools, and this class manages all of the training, transforming, and modification
        of the text data.
        ---
        Inputs:
        vectorizer: the model to use for vectorization of text data
        tokenizer: The tokenizer to use, if none defaults to split on spaces
        cleaning_function: how to clean the data, if None, defaults to the in built class
        """
        if not tokenizer:
            tokenizer = self.splitter
        if not cleaning_function:
            cleaning_function = self.clean_text
        self.stemmer = stemmer
        self.tokenizer = tokenizer
        self.model = model
        self.cleaning_function = cleaning_function
        self.vectorizer = vectorizer
        self._is_fit = False
        
    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')
        
    def clean_text(self, text, tokenizer, stemmer):
        """
        A naive function to lowercase all works can clean them quickly.
        This is the default behavior if no other cleaning function is specified
        """
        cleaned_text = []
        for post in text:
            cleaned_words = []
            for word in tokenizer(post):
                low_word = word.lower()
                if stemmer:
                    low_word = stemmer(low_word)
                cleaned_words.append(low_word)
            cleaned_text.append(' '.join(cleaned_words))
        return cleaned_text
    
    def fit(self, text):
        """
        Cleans the data and then fits the vectorizer with
        the user provided text
        """
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        self.vectorizer.fit(clean_text)
        self._is_fit = True
        
    def transform(self, text):
        """
        Cleans any provided data and then transforms the data into
        a vectorized format based on the fit function. Returns the
        vectorized form of the data.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        return self.vectorizer.transform(clean_text)
    
    def save_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        pickle.dump(self.__dict__, open(filename+".mdl",'wb'))
        
    def load_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        if filename[-4:] != '.mdl':
            filename += '.mdl'
        self.__dict__ = pickle.load(open(filename,'rb'))

In [20]:
# example 1
corpus = ['BOB the builder', 'is a strange', 'caRtoon type thing']
nlp = nlp_preprocessor()

In [21]:
print(nlp.clean_text(corpus, tokenizer=None, stemmer=None))

TypeError: 'NoneType' object is not callable

In [22]:
nlp.fit(corpus)

In [23]:
pd.DataFrame(nlp.transform(corpus).toarray(), columns=nlp.vectorizer.get_feature_names())

Unnamed: 0,bob,builder,cartoon,is,strange,the,thing,type
0,1,1,0,0,0,1,0,0
1,0,0,0,1,1,0,0,0
2,0,0,1,0,0,0,1,1


## What if we want to swap pieces in?

In [24]:
def new_clean_text(text, tokenizer, stemmer):
    """
    A naive function to lowercase all works can clean them quickly.
    This is the default behavior if no other cleaning function is specified
    """
    cleaned_text = []
    for post in text:
        cleaned_words = []
        for word in tokenizer(post):
            low_word = word.lower()
            if low_word in ['builder']: # remove the word builder
                continue
            if stemmer:
                low_word = stemmer.stem(low_word)
            cleaned_words.append(low_word)
        cleaned_text.append(' '.join(cleaned_words))
    return cleaned_text

In [26]:
nlp2 = nlp_preprocessor()

In [27]:
from nltk.stem import PorterStemmer

nlp2 = nlp_preprocessor(cleaning_function=new_clean_text, vectorizer=CountVectorizer(lowercase=False), 
                        stemmer=PorterStemmer())

In [28]:
nlp2.fit(corpus)
nlp2.vectorizer.get_feature_names()

['bob', 'cartoon', 'is', 'strang', 'the', 'thing', 'type']

In [29]:
pd.DataFrame(nlp2.transform(corpus).toarray(), columns=nlp2.vectorizer.get_feature_names())

Unnamed: 0,bob,cartoon,is,strang,the,thing,type
0,1,0,0,0,1,0,0
1,0,0,1,1,0,0,0
2,0,1,0,0,0,1,1


## What about using TF-IDF instead?

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

nlp3 = nlp_preprocessor(cleaning_function=new_clean_text, vectorizer=TfidfVectorizer(lowercase=False))

In [31]:
nlp3.fit(corpus)
nlp3.vectorizer.get_feature_names()
pd.DataFrame(nlp3.transform(corpus).toarray(), columns=nlp3.vectorizer.get_feature_names())

Unnamed: 0,bob,cartoon,is,strange,the,thing,type
0,0.707107,0.0,0.0,0.0,0.707107,0.0,0.0
1,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0
2,0.0,0.57735,0.0,0.0,0.0,0.57735,0.57735


# So what? Let's use some real data to try some different modeling approaches

In [32]:
from sklearn import datasets

categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball']
ng_train = datasets.fetch_20newsgroups(subset='train', 
                                       categories=categories, 
                                       remove=('headers', 
                                               'footers', 'quotes'))
ng_train_data = ng_train.data
ng_train_targets = ng_train.target

ng_test = datasets.fetch_20newsgroups(subset='test', 
                                       categories=categories, 
                                       remove=('headers', 
                                               'footers', 'quotes'))

ng_test_data = ng_test.data
ng_test_targets = ng_test.target

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [33]:
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import PorterStemmer

nlp = nlp_preprocessor(stemmer=PorterStemmer())
nlp2 = nlp_preprocessor(vectorizer=CountVectorizer(lowercase=False))
nlp3 = nlp_preprocessor(cleaning_function=new_clean_text, vectorizer=TfidfVectorizer(lowercase=False))
nlp_chains = [nlp, nlp2, nlp3]

for ix, chain in enumerate(nlp_chains):
    nb = MultinomialNB()
    chain.fit(ng_train_data)
    train_data = chain.transform(ng_train_data)
    test_data = chain.transform(ng_test_data)
    nb.fit(train_data, ng_train_targets)
    accuracy = nb.score(test_data, ng_test_targets)
    print("Chain {}: {}".format(ix, accuracy))

TypeError: 'PorterStemmer' object is not callable

## Summary

This allows us to sweep all of the preprocessing into a class where we can control the pieces and parts that go in, and can see what comes out. If we wanted to, we could even add a model into the class as well and put the whole pipe into a single class that manages all of our challenges. In this case, we've left it outside for demo purposes. This also saves all of the pieces together, so we can just pickle a class object and that will keep the whole structure of our models together - such as the vectorizer and the stemmer we used, as well as the cleaning routine, so we don't lose any of the pieces if we want to run it on new data later.

# Adding a model to the mix

Depending on the type of model we want to build, we'll need to wrap the preprocessing class a little bit differently for the specific case. For example, if we're doing supervised learning, we'll want a `predict` method. If we're doing topic modeling, we'll want a `transform` method. To make that happen, I'll show a few examples below that wrap around the preprocessing class to make the most of it. 

#### Supervised: Classification

Here we'll write a class to predict a class given the text of the document. 

In [None]:
class supervised_nlp:
    
    def __init__(self, model, preprocessing_pipeline=None):
        """
        A pipeline for doing supervised nlp. Expects a model and creates
        a preprocessing pipeline if one isn't provided.
        """
        self.model = model
        self._is_fit = False
        if not preprocessing_pipeline:
            self.preprocessor = nlp_preprocessor()
        else:
            self.preprocessor = preprocessing_pipeline
        
    def fit(self, X, y):
        """
        Trains the vectorizer and model together using the 
        users input training data.
        """
        self.preprocessor.fit(X)
        train_data = self.preprocessor.transform(X)
        self.model.fit(train_data, y)
        self._is_fit = True
    
    def predict(self, X):
        """
        Makes a prediction on the data provided by the users using the 
        preprocessing pipeline and provided model.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        test_data = self.preprocessor.transform(X)
        preds = self.model.predict(test_data)
        return preds
    
    def score(self, X, y):
        """
        Returns the accuracy for the model after using the trained
        preprocessing pipeline to prepare the data.
        """
        test_data = self.preprocessor.transform(X)
        return self.model.score(test_data, y)
    
    def save_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        pickle.dump(self.__dict__, open(filename+".mdl",'wb'))
        
    def load_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        if filename[-4:] != '.mdl':
            filename += '.mdl'
        self.__dict__ = pickle.load(open(filename,'rb'))

In [None]:
nlp_pipe = supervised_nlp(MultinomialNB(), nlp)
nlp_pipe.fit(ng_train_data, ng_train_targets)
nlp_pipe.score(ng_test_data, ng_test_targets)

Swap out the model for something different.

In [None]:
from sklearn.svm import LinearSVC

nlp_pipe = supervised_nlp(LinearSVC(), nlp)
nlp_pipe.fit(ng_train_data, ng_train_targets)
nlp_pipe.score(ng_test_data, ng_test_targets)

#### Unsupervised: Topic Modeling

We don't want to make a prediction with this example, simply to find topics and have the ability to cast our data into the "topic space" from the "word space." With this in mind, we'll add a transform feature and also the ability to print out the topics.

In [None]:
class topic_modeling_nlp:
    
    def __init__(self, model, preprocessing_pipeline=None):
        """
        A pipeline for doing supervised nlp. Expects a model and creates
        a preprocessing pipeline if one isn't provided.
        """
        self.model = model
        self._is_fit = False
        if not preprocessing_pipeline:
            self.preprocessor = nlp_preprocessor()
        else:
            self.preprocessor = preprocessing_pipeline
        
    def fit(self, X):
        """
        Trains the vectorizer and model together using the 
        users input training data.
        """
        self.preprocessor.fit(X)
        train_data = self.preprocessor.transform(X)
        self.model.fit(train_data)
        self._is_fit = True
    
    def transform(self, X):
        """
        Makes a prediction on the data provided by the users using the 
        preprocessing pipeline and provided model.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        test_data = self.preprocessor.transform(X)
        preds = self.model.transform(test_data)
        return preds
    
    def print_topics(self, num_words=10):
        """
        A function to print out the top words for each topic
        """
        feat_names = self.preprocessor.vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(self.model.components_):
            message = "Topic #%d: " % topic_idx
            message += " ".join([feat_names[i]
                                 for i in topic.argsort()[:-num_words - 1:-1]])
            print(message)
            
    def save_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        pickle.dump(self.__dict__, open(filename+".mdl",'wb'))
        
    def load_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        if filename[-4:] != '.mdl':
            filename += '.mdl'
        self.__dict__ = pickle.load(open(filename,'rb'))

In [None]:
from sklearn.decomposition import TruncatedSVD

cv = CountVectorizer(stop_words='english', token_pattern='\\b[a-z][a-z]+\\b')
cleaning_pipe = nlp_preprocessor(vectorizer=cv)
topic_chain = topic_modeling_nlp(TruncatedSVD(n_components=15), preprocessing_pipeline=cleaning_pipe)

In [None]:
topic_chain.fit(ng_train_data)
topic_chain.transform(ng_train_data).shape

In [None]:
topic_chain.print_topics()

Swap out the model for something different.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
topic_chain = topic_modeling_nlp(LatentDirichletAllocation(n_components=15), preprocessing_pipeline=cleaning_pipe)

In [None]:
topic_chain.fit(ng_train_data)
topic_chain.transform(ng_train_data).shape
topic_chain.print_topics()