In [122]:
import re

import numpy as np
import pandas as pd

In [123]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Heschmat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Heschmat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [124]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [125]:
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix

In [126]:
df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
df.head(3)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,category,category:confidence,category_gold,id,screenname,text
0,662822308,False,finalized,3,2/18/15 4:31,Information,1.0,,4.36528e+17,Barclays,Barclays CEO stresses the importance of regula...
1,662822309,False,finalized,3,2/18/15 13:55,Information,1.0,,3.86013e+17,Barclays,Barclays announces result of Rights Issue http...
2,662822310,False,finalized,3,2/18/15 8:43,Information,1.0,,3.7958e+17,Barclays,Barclays publishes its prospectus for its å£5....


In [127]:
df['category'].value_counts()

Information    2129
Action          724
Dialogue        226
Exclude          39
Name: category, dtype: int64

In [128]:
df['category:confidence'].value_counts(dropna= False)

1.0000    2430
0.6614      35
0.6643      33
0.6747      32
0.6775      29
          ... 
0.7202       1
0.8860       1
0.8570       1
0.6568       1
0.9041       1
Name: category:confidence, Length: 194, dtype: int64

Only keep the rows with `confidence` of 100%. Also, remove the `category` of `Exclude`, as there are very few of them anyway.

## 0. Load the dataset

In [129]:
def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

In [130]:
X, y = load_data()

In [131]:
X[0]

'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG'

### 1. Clean & Tokenize

Replace the urls in the messages with `urlplaceholder`:

In [132]:
# RegEx for url pattern
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [133]:
msg = X[0]
msg

'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG'

In [134]:
re.findall(url_regex, msg)

['http://t.co/Ge9Lp7hpyG']

In [135]:
url = 'http://t.co/Ge9Lp7hpyG'
msg.replace(url, 'urlplaceholder')

'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  urlplaceholder'

In [136]:
def tokenize(txt):
    # get list of all urls using regex
    detected_urls = re.findall(url_regex, txt)
    
    # replace each url in text string with placeholder
    for url in detected_urls:
        txt = txt.replace(url, 'urlplaceholder')

    # tokenize text
    tokens = word_tokenize(txt.lower())
    
    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer
    # Lemmatize and remove the white spaces - leading & trailing
    tokens_clean = [lemmatizer().lemmatize(token.strip()) for token in tokens]

    return tokens_clean

In [137]:
# test out function
X, y = load_data()
for msg in X[:5]:
    tokens = tokenize(msg)
    print(msg)
    print('=======> ', tokens, '\n')
    print('=' * 72)

Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG

Barclays announces result of Rights Issue http://t.co/LbIqqh3wwG

Barclays publishes its prospectus for its å£5.8bn Rights Issue: http://t.co/YZk24iE8G6

Barclays Group Finance Director Chris Lucas is to step down at the end of the week due to ill health http://t.co/nkuHoAfnSD

Barclays announces that Irene McDermott Brown has been appointed as Group Human Resources Director http://t.co/c3fNGY6NMT



## 2. Machine Learning Workflow

### Step 1: Load data and perform a train test split

In [138]:
# load data
X, y = load_data()

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 0)

### Step 2: Train classifier

In [139]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer= tokenize)
tfidf = TfidfTransformer()
clf = RandomForestClassifier(random_state= 0)

# Fit and/or transform each to the data
X_train_counts = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_counts)
clf.fit(X_train_tfidf, y_train)

RandomForestClassifier(random_state=0)

### Step 3: Predict on test data

In [140]:
# Transform test data
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

# Predict test labels
y_test_pred = clf.predict(X_test_tfidf)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [141]:
labels = np.unique(y_test_pred)
confusion_mat = confusion_matrix(y_test, y_test_pred, labels= labels)
accuracy = (y_test == y_test_pred).mean().round(3)

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 85   0  34]
 [  2  31   6]
 [  3   2 438]]
Accuracy: 0.922


# Final Step: Refactor
Organize these steps into the following functions.

In [142]:
def display_results(y_test, y_pred):
    labels = np.unique(y_test_pred)
    confusion_mat = confusion_matrix(y_test, y_test_pred, labels= labels)
    accuracy = (y_test == y_test_pred).mean().round(3)

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [143]:
def main_naive():
    # load data
    X, y = load_data()
    # Split the data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 6)
    
    # Instantiate transformers and classifier
    vect = CountVectorizer(tokenizer= tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier(random_state= 0)

    # Fit and/or transform each to the data
    X_train_counts = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)
    clf.fit(X_train_tfidf, y_train)
    
    # Transform test data
    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)
    # Predict test labels
    y_test_pred = clf.predict(X_test_tfidf)
    
    display_results(y_test, y_test_pred)

In [144]:
main_naive()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 18   5  84]
 [  5   0  23]
 [ 67  28 371]]
Accuracy: 0.647


### Advantages of Using Pipeline:   
1. Simplicity and Convencience   
2. Optimizing Entire Workflow, including data transformation and modeling steps. 
3. Preventing Data leakage   


## 3. Implementing Pipeline

In [145]:
def main_pipe():
    # Load the data
    X, y = load_data()
    # Split data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 6)

    # Build pipeline
    pipe = Pipeline(steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(random_state= 0))
    ])
        
    # Train classifier
    pipe.fit(X_train, y_train)

    # Predict on test data
    y_test_pred = pipe.predict(X_test)

    # Display results
    display_results(y_test, y_test_pred)

In [146]:
main_pipe()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 18   5  84]
 [  5   0  23]
 [ 67  28 371]]
Accuracy: 0.647


## 4. Pipelines And Feature Unions
- A `pipeline` performs a list of steps in a _linear sequence_, while a `feature union` performs a list of steps in _parallel_ and then combines their results.
- In more complex workflows, multiple feature unions are often used within pipelines, and multiple pipelines are used within feature unions.