In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Heschmat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Heschmat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Heschmat\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

In [4]:
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix

In [5]:
df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
df.head(3)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,category,category:confidence,category_gold,id,screenname,text
0,662822308,False,finalized,3,2/18/15 4:31,Information,1.0,,4.36528e+17,Barclays,Barclays CEO stresses the importance of regula...
1,662822309,False,finalized,3,2/18/15 13:55,Information,1.0,,3.86013e+17,Barclays,Barclays announces result of Rights Issue http...
2,662822310,False,finalized,3,2/18/15 8:43,Information,1.0,,3.7958e+17,Barclays,Barclays publishes its prospectus for its å£5....


In [6]:
df['category'].value_counts()

Information    2129
Action          724
Dialogue        226
Exclude          39
Name: category, dtype: int64

In [7]:
df['category:confidence'].value_counts(dropna= False)

1.0000    2430
0.6614      35
0.6643      33
0.6747      32
0.6775      29
          ... 
0.7202       1
0.8860       1
0.8570       1
0.6568       1
0.9041       1
Name: category:confidence, Length: 194, dtype: int64

Only keep the rows with `confidence` of 100%. Also, remove the `category` of `Exclude`, as there are very few of them anyway.

## 0. Load the dataset

In [8]:
def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

In [9]:
X, y = load_data()

In [10]:
X[0]

'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG'

### 1. Clean & Tokenize

Replace the urls in the messages with `urlplaceholder`:

In [11]:
# RegEx for url pattern
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [12]:
msg = X[0]
msg

'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG'

In [13]:
re.findall(url_regex, msg)

['http://t.co/Ge9Lp7hpyG']

In [14]:
url = 'http://t.co/Ge9Lp7hpyG'
msg.replace(url, 'urlplaceholder')

'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  urlplaceholder'

In [15]:
def tokenize(txt):
    # get list of all urls using regex
    detected_urls = re.findall(url_regex, txt)
    
    # replace each url in text string with placeholder
    for url in detected_urls:
        txt = txt.replace(url, 'urlplaceholder')

    # tokenize text
    tokens = word_tokenize(txt.lower())
    
    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer
    # Lemmatize and remove the white spaces - leading & trailing
    tokens_clean = [lemmatizer().lemmatize(token.strip()) for token in tokens]

    return tokens_clean

In [16]:
# test out function
X, y = load_data()
for msg in X[:5]:
    tokens = tokenize(msg)
    print(msg)
    print('=======> ', tokens, '\n')
    print('=' * 72)

Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG

Barclays announces result of Rights Issue http://t.co/LbIqqh3wwG

Barclays publishes its prospectus for its å£5.8bn Rights Issue: http://t.co/YZk24iE8G6

Barclays Group Finance Director Chris Lucas is to step down at the end of the week due to ill health http://t.co/nkuHoAfnSD

Barclays announces that Irene McDermott Brown has been appointed as Group Human Resources Director http://t.co/c3fNGY6NMT



## 2. Machine Learning Workflow

### Step 1: Load data and perform a train test split

In [17]:
# load data
X, y = load_data()

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 0)

### Step 2: Train classifier

In [18]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer= tokenize)
tfidf = TfidfTransformer()
clf = RandomForestClassifier(random_state= 0)

# Fit and/or transform each to the data
X_train_counts = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_counts)
clf.fit(X_train_tfidf, y_train)

RandomForestClassifier(random_state=0)

### Step 3: Predict on test data

In [19]:
# Transform test data
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

# Predict test labels
y_test_pred = clf.predict(X_test_tfidf)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [20]:
labels = np.unique(y_test_pred)
confusion_mat = confusion_matrix(y_test, y_test_pred, labels= labels)
accuracy = (y_test == y_test_pred).mean().round(3)

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 85   0  34]
 [  2  31   6]
 [  3   2 438]]
Accuracy: 0.922


# Final Step: Refactor
Organize these steps into the following functions.

In [21]:
def display_results(y_test, y_pred):
    labels = np.unique(y_test_pred)
    confusion_mat = confusion_matrix(y_test, y_test_pred, labels= labels)
    accuracy = (y_test == y_test_pred).mean().round(3)

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [22]:
def main_naive():
    # load data
    X, y = load_data()
    # Split the data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 6)
    
    # Instantiate transformers and classifier
    vect = CountVectorizer(tokenizer= tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier(random_state= 0)

    # Fit and/or transform each to the data
    X_train_counts = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)
    clf.fit(X_train_tfidf, y_train)
    
    # Transform test data
    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)
    # Predict test labels
    y_test_pred = clf.predict(X_test_tfidf)
    
    display_results(y_test, y_test_pred)

In [23]:
main_naive()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 18   5  84]
 [  5   0  23]
 [ 67  28 371]]
Accuracy: 0.647


### Advantages of Using Pipeline:   
1. Simplicity and Convencience   
2. Optimizing Entire Workflow, including data transformation and modeling steps. 
3. Preventing Data leakage   


## 3. Implementing Pipeline

In [24]:
def main_pipe():
    # Load the data
    X, y = load_data()
    # Split data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 6)

    # Build pipeline
    pipe = Pipeline(steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(random_state= 0))
    ])
        
    # Train classifier
    pipe.fit(X_train, y_train)

    # Predict on test data
    y_test_pred = pipe.predict(X_test)

    # Display results
    display_results(y_test, y_test_pred)

In [25]:
main_pipe()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 18   5  84]
 [  5   0  23]
 [ 67  28 371]]
Accuracy: 0.647


## 4. Pipelines And Feature Unions
- A `pipeline` performs a list of steps in a _linear sequence_, while a `feature union` performs a list of steps in _parallel_ and then combines their results.
- In more complex workflows, multiple feature unions are often used within pipelines, and multiple pipelines are used within feature unions.

### Creating Custom Transformer

Let's build a `case normalizer`, which simply converts all text to lowercase. Remember, all estimators have a fit method, and since this is a transformer, it also has a transform method.   

- __fit()__: This takes in a 2D array X for the feature data and a 1d array y for the target labels. Inside the fit method, we simply return self. This allows us to chain methods together, since the result on calling fit on the transformer is still the transformer object. This method is required to be compatible with scikit-learn.   

- __transform()__: The transform function is where we include the code that, well, transforms the data.

In [26]:
class CaseNormalizer(BaseEstimator, TransformerMixin):
    """Lowercase the text."""
    
    def fit(self, X, y= None):
        return self
    
    def transform(self, X):
        return pd.Series(X).apply(lambda x: x.lower()).values

In [27]:
case_normalizer = CaseNormalizer()

X = np.array(['Implementing', 'a', 'Custom', 'Transformer', 'from', 'SCIKIT-LEARN'])

case_normalizer.transform(X)

array(['implementing', 'a', 'custom', 'transformer', 'from',
       'scikit-learn'], dtype=object)

In [28]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Determine if a text starts with a verb."""

    def starting_verb(self, text):
        sentence_list = sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [29]:
def pipeline_parallel():
    pipeline = Pipeline(steps = [
        ('gen_features', FeatureUnion(transformer_list= [
            ('text_pipeline', Pipeline(steps= [
                ('vect', CountVectorizer(tokenizer= tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            
            ('has_lead_verb', StartingVerbExtractor())
        ])),
        
        ('rfclf', RandomForestClassifier())
    ])
    
    return pipeline

In [30]:
def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = pipeline_parallel()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    display_results(y_test, y_pred)

In [31]:
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 18   4  95]
 [  2   3  25]
 [ 70  26 358]]
Accuracy: 0.631


## 5. GridSearch

In [32]:
def build_model():
    pipeline = Pipeline([
        ('features', FeatureUnion([
            
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),
    
        ('clf', RandomForestClassifier())
    ])
    
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
        'features__text_pipeline__vect__max_features': (None, 5000),
        #'features__text_pipeline__tfidf__use_idf': (True, False),
        'clf__n_estimators': [50, 200],
        'clf__min_samples_split': [2, 3, 4]
    }

    # create grid search object
    cv = GridSearchCV(pipeline, parameters)
    
    return cv

In [33]:
def display_results(cv, y_test, y_pred):
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    print("\nBest Parameters:", cv.best_params_)

In [34]:
def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = build_model()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    display_results(model, y_test, y_pred)

main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[107   0  25]
 [  1  30   1]
 [  1   1 435]]
Accuracy: 0.9517470881863561

Best Parameters: {'clf__min_samples_split': 2, 'clf__n_estimators': 200, 'features__text_pipeline__vect__max_df': 0.75, 'features__text_pipeline__vect__max_features': 5000, 'features__text_pipeline__vect__ngram_range': (1, 2)}
