# Machine Learning Workflow
Following the machine learning workflow to build a classifier for corporate messages.

Starting with the review of works on 01_clean_tokenize.ipynb.

In [36]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /Users/jsuk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jsuk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [37]:
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
# from sklearn.svm import LinearSVC
# from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [38]:
# Build load_data and tokenize function


def load_data() :
    ''' File path: ./corporate_messaging.csv -- latin-1 encoding
        Open the file, clean the data and return the variables 
        Variable X: text, y: category) '''

    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    
    # 1. Exlcude unmeaningful category 'Exclude'
    # 2. Use the data with confidence of 100%
    df = df[(df['category'] != 'Exclude') & (df['category:confidence'] == 1)]
    
    X = df['text']
    y = df['category']
    
    return X, y


def tokenizer(text) : 
    '''To be applied to one text at a time inside iteration 
    or be included in CounterVectorizer method as a parameter
    '''
    
    # Normalization : lowercase, remove punctuation and whitespace
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower().strip())
    
    # Replace url with 'urlplaceholder' string
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    
    # Tokenize, lemmatize the text
    tokens = []
    lemmatizer = WordNetLemmatizer()
    
    for token in word_tokenize(text) : 
        if token not in stopwords.words('english') :
            tokens.append(lemmatizer.lemmatize(token))
    
    return tokens
    

### Step 1: Load data and perform a train test split

In [4]:
# load data
X, y = load_data()

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [5]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenizer)
tfidf = TfidfTransformer()
clf = LogisticRegression() # fit logistic regression first

# Fit and/or transform each to the data
X_train_counts = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_counts)
clf.fit(X_train_tfidf, y_train)

LogisticRegression()

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [6]:
# Transform test data
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

# Predict test labels
y_pred = clf.predict(X_test_tfidf)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [16]:
labels = np.unique(y_pred)
confusion_mat = confusion_matrix(y_pred, y_test)
accuracy = accuracy_score(y_pred, y_test)

print("Labels:", labels, end='\n\n')
print("Confusion Matrix:\n", confusion_mat, end='\n\n')
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']

Confusion Matrix:
 [[ 80   2   1]
 [  0  22   0]
 [ 57  28 603]]

Classification Report:
               precision    recall  f1-score   support

      Action       0.58      0.96      0.73        83
    Dialogue       0.42      1.00      0.59        22
 Information       1.00      0.88      0.93       688

    accuracy                           0.89       793
   macro avg       0.67      0.95      0.75       793
weighted avg       0.94      0.89      0.90       793


Accuracy: 0.8890290037831021


Or print out all at once with one line code

In [18]:
clf_report = classification_report(y_test, y_pred)
print("Classification Report:\n", clf_report, end='\n\n')

Classification Report:
               precision    recall  f1-score   support

      Action       0.96      0.58      0.73       137
    Dialogue       1.00      0.42      0.59        52
 Information       0.88      1.00      0.93       604

    accuracy                           0.89       793
   macro avg       0.95      0.67      0.75       793
weighted avg       0.90      0.89      0.88       793




# Final Step: Refactor
Organize these steps into the following functions.

In [51]:
def display_results(y_test, y_pred) :
    clf_report = classification_report(y_test, y_pred)
    print("Classification Report:\n", clf_report, end='\n\n')
    
def main() : 
    
    # Load the data and split train, test set
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    # Instantiate estimators
    # TfidfVectorizer combines CountVectorizer and TfidfTransformer
    tfidf = TfidfVectorizer(tokenizer=tokenizer)
    clf = LogisticRegression() 
    
    # Train the data
    X_train_tfidf = tfidf.fit_transform(X_train)
    clf.fit(X_train_tfidf, y_train)
    
    # Transform X_test and run prediction
    # Be careful not to fit again
    X_test_tfidf = tfidf.transform(X_test)
    y_pred = clf.predict(X_test_tfidf)
    
    display_results(y_test, y_pred)
    

In [52]:
# run program
main()

Classification Report:
               precision    recall  f1-score   support

      Action       0.93      0.62      0.74       115
    Dialogue       1.00      0.40      0.57        40
 Information       0.87      0.99      0.93       446

    accuracy                           0.88       601
   macro avg       0.93      0.67      0.75       601
weighted avg       0.89      0.88      0.87       601


