# Implementing Feature Union
Using the given custom transformer, `StartingVerbExtractor`, add a feature union to your pipeline to incorporate a feature that indicates with a boolean value whether the starting token of a post is identified as a verb.

In [46]:
%load_ext pycodestyle_magic
%pycodestyle_on

The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


In [47]:
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jinchaochen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jinchaochen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jinchaochen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [48]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [49]:
from custom_transformer import StartingVerbExtractor

In [50]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

1:56: W605 invalid escape sequence '\('
1:58: W605 invalid escape sequence '\)'
1:56: W605 invalid escape sequence '\('
1:58: W605 invalid escape sequence '\)'
1:80: E501 line too long (92 > 79 characters)
1:56: W605 invalid escape sequence '\('
1:58: W605 invalid escape sequence '\)'
1:80: E501 line too long (92 > 79 characters)


### Build your pipeline to have this structure:
- Pipeline
    - feature union
        - text pipeline
            - count vectorizer
            - TFIDF transformer
        - starting verb extractor
    - classifier

In [51]:
Pipeline([
    ('features', FeatureUnion([
        ("txt_pipeline", Pipeline([
            ("vect", CountVectorizer(tokenizer=tokenize)),
            ("tfidf", TfidfTransformer())
        ])),
        ('verb_extrctor', StartingVerbExtractor())
    ])),
    ('clf', RandomForestClassifier())
])

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('txt_pipeline',
                                                 Pipeline(memory=None,
                                                          steps=[('vect',
                                                                  CountVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.int64'>,
                                                                                  encoding='utf-8',
                                                                                  input='content',
                                                                                  lowe

In [52]:
from sklearn.pipeline import FeatureUnion

In [56]:
def model_pipeline():
    
    # Pipeline for text vectorization and tfidf
    text_pipeline = Pipeline([("vect", CountVectorizer(tokenizer=tokenize)),
                              ("tfidf", TfidfTransformer())])
    
    # Include feature extra estimator through featureunion 
    pipeline = Pipeline([('features', FeatureUnion([
                                    ('text_pipeline', text_pipeline),
                                    ('verb_extrctor', StartingVerbExtractor())])),
                         ('clf', RandomForestClassifier())])
    return pipeline

2:1: W293 blank line contains whitespace
6:1: W293 blank line contains whitespace
7:38: W291 trailing whitespace
8:5: E265 block comment should start with '# '
8:26: W291 trailing whitespace
11:80: E501 line too long (82 > 79 characters)
2:1: W293 blank line contains whitespace
6:1: W293 blank line contains whitespace
7:38: W291 trailing whitespace
8:5: E265 block comment should start with '# '
8:26: W291 trailing whitespace
11:80: E501 line too long (82 > 79 characters)
2:1: W293 blank line contains whitespace
6:1: W293 blank line contains whitespace
7:38: W291 trailing whitespace
8:5: E265 block comment should start with '# '
8:26: W291 trailing whitespace
11:80: E501 line too long (82 > 79 characters)


In [54]:
# Solution from Udacity


def model_pipeline():
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', RandomForestClassifier())
    ])

    return pipeline

### Run program to test

In [1]:
def load_data():
    df = pd.read_csv('corporate_messaging.csv',
                     encoding='latin-1')
    df = df[(df["category:confidence"] == 1) &
            (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y


def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url,"urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


def display_results(y_test, y_pred):
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = model_pipeline()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    display_results(y_test, y_pred)


main()

NameError: name 'pd' is not defined

In [None]:
b = 1
a = b + 1