# Implementing Feature Union
Using the provided custom transformer, `StartingVerbExtractor`, add a feature union to your pipeline so that it also includes a boolean feature indicating whether the first token of a post is identified as a verb.

In [1]:
import nltk
# Download the NLTK resources named in the list below.
# NOTE(review): presumably these back word_tokenize, WordNetLemmatizer and the
# part-of-speech tagging done by the custom verb extractor - confirm.
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mahmo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mahmo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mahmo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [3]:
from custom_transformer import StartingVerbExtractor

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mahmo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mahmo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mahmo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Build your pipeline to have this structure:
- Pipeline
    - feature union
        - text pipeline
            - count vectorizer
            - TFIDF transformer
        - starting verb extractor
    - classifier

In [None]:
# Pipeline with the structure listed above: a feature union of the text
# pipeline (token counts -> TF-IDF) and the starting-verb extractor, followed
# by a random forest classifier.
# NOTE(review): `tokenize` is only defined in a later cell, so running this
# cell on a fresh kernel raises NameError, and the cell has no execution count.
# model_pipeline() in the following cell builds the same pipeline.
pipeline = Pipeline([
                ("Features", FeatureUnion([
                    ("text_pipeline", Pipeline([('count', CountVectorizer(tokenizer=tokenize)),
                                                ('tf-idf', TfidfTransformer())])),
                    ("verb", StartingVerbExtractor())
                ])),
                ("clf", RandomForestClassifier())
            ])

In [4]:
def model_pipeline():
    """Assemble the (unfitted) message classification pipeline.

    Layout of the returned pipeline:

    - "Features": a FeatureUnion of
        - "text_pipeline": CountVectorizer (using this notebook's ``tokenize``)
          followed by TfidfTransformer
        - "verb": StartingVerbExtractor
    - "clf": RandomForestClassifier with default settings

    Returns:
        Pipeline: the assembled pipeline, ready for ``fit``.
    """
    # Bag-of-words counts re-weighted by TF-IDF.
    text_steps = [
        ('count', CountVectorizer(tokenizer=tokenize)),
        ('tf-idf', TfidfTransformer()),
    ]

    # Branches whose outputs the FeatureUnion combines.
    feature_branches = [
        ("text_pipeline", Pipeline(text_steps)),
        ("verb", StartingVerbExtractor()),
    ]

    return Pipeline([
        ("Features", FeatureUnion(feature_branches)),
        ("clf", RandomForestClassifier()),
    ])

### Run program to test

In [5]:
def load_data():
    """Read the corporate messaging CSV and return texts and labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the ``text`` column values and
        ``y`` holds the ``category`` column values of the kept rows.
    """
    messages = pd.read_csv('corporate_messaging.csv', encoding='latin-1')

    # Build the two row filters separately, then combine them.
    fully_confident = messages["category:confidence"] == 1
    not_excluded = messages['category'] != 'Exclude'
    kept = messages[fully_confident & not_excluded]

    return kept["text"].values, kept["category"].values

def tokenize(text):
    """Turn a raw message into a list of lemmatized, lower-cased tokens.

    Args:
        text (str): the message to tokenize.

    Returns:
        list: one lemmatized token per token produced by ``word_tokenize``.
    """
    # Lower-case first, then swap anything starting with "http" for a placeholder.
    lowered = text.lower()
    cleaned = re.sub(r'http\S+', 'url_placeholder', lowered).strip()

    words = word_tokenize(cleaned)
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, words))

def display_results(y_test, y_pred):
    """Print the label set, confusion matrix and accuracy of predictions.

    Args:
        y_test: array-like of true labels.
        y_pred: array-like of predicted labels, same length as ``y_test``.

    Returns:
        None. Results are printed.
    """
    # Coerce to arrays so the element-wise comparison below also works when
    # plain Python lists are passed in.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Use every label seen in either the truth or the predictions. Taking the
    # labels from y_pred alone would drop the row/column of any class the
    # model never predicts and hide those errors from the matrix.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

def main(random_state=None):
    """Load the data, train the pipeline and print test-set results.

    Args:
        random_state: optional seed. When given, it is passed to
            ``train_test_split`` and to the pipeline's ``"clf"`` step so the
            printed results can be reproduced across runs. The default
            ``None`` keeps the original unseeded behavior.

    Returns:
        None. Results are printed by ``display_results``.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    model = model_pipeline()
    if random_state is not None:
        # The classifier is the "clf" step of the pipeline built by
        # model_pipeline(); seed it too so the whole run is deterministic.
        model.set_params(clf__random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    display_results(y_test, y_pred)

# Run the whole workflow: load, split, fit, predict, report.
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 85   1  29]
 [  0  27   1]
 [  2   1 455]]
Accuracy: 0.9434276206322796
