# Intent discovery in the Banking77 dataset  
Intent analysis

* Build an NLU component
    * CKY parser (dynamic programming, CFG algo)
        * TO DO: search for production-grade library

* Impact
    * automatic dataset labelling --> reduces labor-intensive annotation

* Intent analysis (3):
    * intention: "want to understand" 
    * intended object: "the human mind"
    * intendee: "Neuroscientists"

* Exploit linguistic knowledge
    * grammar rules

* **Workload**
  * prep to clustering (1 hour)

## Setup  
### Dependencies

In [17]:
import os
import pandas as pd

# ml.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.cluster import KMeans

# text prep.
import nltk
import re
import numpy as np
nltk.download('punkt') # 13 MB zip containing pretrained punkt sentence tokenizer (Kiss and Strunk, 2006)
import time

# data struct. utils
from collections import defaultdict

# EDA
from ipywidgets import interact

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/steeve_laquitaine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Paths

In [2]:
# Root of the local project checkout; both dataset paths are built from it.
proj_path = "/Users/steeve_laquitaine/desktop/CodeHub/intent/intent/"
train_data_path = f"{proj_path}data/01_raw/banking77/train.csv"
test_data_path = f"{proj_path}data/01_raw/banking77/test.csv"

### Parameters

In [66]:
# Pipeline hyper-parameters, grouped by stage.
# The containers stay factory-less defaultdicts (they behave like plain dicts)
# so the type seen by the later cells is unchanged.
params = defaultdict()

# document-frequency bounds forwarded to the vectorizer as min_df / max_df
params["tfidf"] = defaultdict()
params["tfidf"]["MIN_DF"] = 10
params["tfidf"]["MAX_DF"] = 0.8

# stop words
# Loaded here rather than read from the `stop_words` global: that name is only
# assigned in the "Prep. calls" cell further down, so running the notebook
# top-to-bottom raised NameError on this line. The value is the same list.
params["stop_words"] = nltk.corpus.stopwords.words('english')

# kmeans
params["kmeans"] = defaultdict()
params["kmeans"]["NUM_CLUSTERS"] = 6
params["kmeans"]["max_iter"] = 1000
params["kmeans"]["n_init"] = 50
params["kmeans"]["random_state"] = 42

## Load query dataset

In [3]:
# Read the train and test splits from their CSV files (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [45]:
# show the first 5 rows of the training split
train_data.head(5)

Unnamed: 0,text,intent
0,I am still waiting on my card?,card_arrival
1,What can I do if my card still hasn't arrived ...,card_arrival
2,I have been waiting over a week. Is the card s...,card_arrival
3,Can I track my card while it is in the process...,card_arrival
4,"How do I know if I will get my card, or if it ...",card_arrival


In [46]:
# preview the first 5 rows of the test split
test_data.head(5)

Unnamed: 0,text,intent
0,How do I locate my card?,card_arrival
1,"I still have not received my new card, I order...",card_arrival
2,I ordered a card but it has not arrived. Help ...,card_arrival
3,Is there a way to know when my card will arrive?,card_arrival
4,My card has not arrived yet.,card_arrival


### Normalize headers

In [6]:
def standardize_col_names(data:pd.DataFrame):
    """
    Rename dataset headers to the names used in this notebook

    parameters:
    ---------
    data
        dataframe whose columns are renamed

    return
    ------
    result of `data.rename`: "category" becomes "intent", "text" keeps its
    name, any other column is left as is

    """
    header_map = {"text":"text","category":"intent"}
    renamed = data.rename(columns=header_map)
    return renamed

In [7]:
# Apply the same header normalization to both splits.
train_data, test_data = map(standardize_col_names, (train_data, test_data))

### Describe 

In [8]:
# Per-column count() and nunique() of the training split, each under its title.
print("\nValue count:\n", train_data.count(), sep="\n")
print("\nUnique values:\n", train_data.nunique(), sep="\n")


Value count:

text      10003
intent    10003
dtype: int64

Unique values:

text      10003
intent       77
dtype: int64


In [9]:
# show the first row of the training split
train_data.head(1)

Unnamed: 0,text,intent
0,I am still waiting on my card?,card_arrival


# QUERY CLUSTERING

## Prep. calls

In [28]:
# prep
# English stop-word list from NLTK; read as a global by normalize_document().
# NOTE(review): the Dependencies cell only downloads 'punkt'; this lookup
# presumably needs the NLTK 'stopwords' corpus to be installed already — confirm.
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc:str):
    """
    Normalize document

    Removes every character that is not an ASCII letter, a digit or
    whitespace, lower-cases and strips the text, tokenizes it with
    nltk.word_tokenize and drops the tokens found in the global `stop_words`.

    parameters:
    ---------
    doc
        raw text of a single document

    return
    ------
    doc
        the remaining tokens joined by single spaces

    """
    # drop special characters
    # `flags` has to be passed by keyword: the 4th positional argument of
    # re.sub is `count`, so the former call capped the number of removals at
    # int(re.I|re.A) and never applied the flags.
    # With re.A, \s only matches ASCII whitespace, so any other whitespace
    # character is removed along with the special characters.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A)

    # lower case and drop leading/trailing whitespace
    doc = doc.lower().strip()

    # tokenize
    tokens = nltk.word_tokenize(doc)

    # drop stop words
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # re-create doc from filtered tokens
    return ' '.join(filtered_tokens)

# time
# start of the wall-clock measurement reported by the print below
tic = time.time()

# vectorize doc
# wrap normalize_document so one call processes a whole sequence of documents
normalize_corpus = np.vectorize(normalize_document)

# normalize doc
# normalize every training query
norm_corpus = normalize_corpus(list(train_data['text']))
# NOTE: this bare expression is not the last one of the cell, so its value is
# neither displayed nor stored
len(norm_corpus)
print(f"(normalize_document) took: {round(time.time()-tic,2)} secs")

# show
print("\nPreview:")

# last expression of the cell: the notebook displays the normalized corpus
norm_corpus

(normalize_document) took: 2.0 secs

Preview:


array(['still waiting card', 'card still hasnt arrived 2 weeks',
       'waiting week card still coming', ..., 'countries getting support',
       'cards available eu', 'countries represented'], dtype='<U309')

## Vectorize queries as B-O-W

In [58]:
# Bag-of-words vectorization of the normalized queries over unigrams and bigrams.
# NOTE: the "tfidf" params key and the "tf-idf" tag in the log line are labels
# only; the vectorizer built here is a CountVectorizer.
tic = time.time()
cv = CountVectorizer(
    ngram_range=(1, 2), 
    min_df=params["tfidf"]["MIN_DF"], 
    max_df=params["tfidf"]["MAX_DF"], 
    stop_words=params["stop_words"]
    )
# learn the vocabulary and build the document-term matrix in one step
cv_matrix = cv.fit_transform(norm_corpus)
print(f"(vectorization:tf-idf) shape:{cv_matrix.shape}, took {round(time.time()-tic,2)} secs")

(vectorization:tf-idf) shape:(10003, 1205), took 0.24 secs


## Cluster queries w/ K-Means

In [70]:
# Fit K-Means on the bag-of-words matrix and report the elapsed wall-clock time.
tic = time.time()
km = KMeans(
    random_state=params["kmeans"]["random_state"],
    n_init=params["kmeans"]["n_init"],
    max_iter=params["kmeans"]["max_iter"],
    n_clusters=params["kmeans"]["NUM_CLUSTERS"],
)
km.fit(cv_matrix)
print(f"(clustering:kmeans) model:{km}, took {round(time.time()-tic,2)} secs")

(clustering:kmeans) model:KMeans(max_iter=1000, n_clusters=6, n_init=50, random_state=42), took 3.72 secs


In [88]:
# Describe
# count queries ("docs") assigned to each K-Means cluster label
from collections import Counter
Counter(km.labels_)

Counter({1: 2431, 0: 3972, 3: 987, 5: 538, 4: 1227, 2: 848})

In [91]:
# New dataframe = training split + one column holding each query's cluster id
# (train_data itself is left untouched); then preview the first rows.
kmeans_labelled_train_data = train_data.assign(kmeans_label=km.labels_)
kmeans_labelled_train_data.head()

Unnamed: 0,text,intent,kmeans_label
0,I am still waiting on my card?,card_arrival,1
1,What can I do if my card still hasn't arrived ...,card_arrival,1
2,I have been waiting over a week. Is the card s...,card_arrival,1
3,Can I track my card while it is in the process...,card_arrival,1
4,"How do I know if I will get my card, or if it ...",card_arrival,1


# REVERSE ENGINEERING HUMAN ANNOTATIONS

## Explore CLUSTERS

### Explore clusters' requests

In [190]:
# interactive
def show_requests(LABEL, text_ix):
    """
    Show a window of the requests assigned to one K-Means cluster

    parameters:
    ---------
    LABEL
        cluster label whose requests are shown
    text_ix
        position, among that cluster's requests, of the first one shown

    return
    ------
    at most 10 consecutive request texts starting at `text_ix`

    """
    VIEW_WINDOW = 10
    in_cluster = kmeans_labelled_train_data.kmeans_label.eq(LABEL)
    cluster_texts = kmeans_labelled_train_data[in_cluster].text
    window_end = text_ix + VIEW_WINDOW
    return cluster_texts.iloc[text_ix:window_end]

# Build the widget around show_requests: LABEL is chosen among the distinct
# cluster labels, text_ix among the integers from 0 to the dataset size.
# NOTE(review): the text_ix upper bound is the size of the whole dataset, not of
# the selected cluster, so positions past the cluster's size presumably show an
# empty result — confirm this is acceptable.
interact(
    show_requests, 
    LABEL=np.unique(km.labels_), 
    text_ix=(0, len(kmeans_labelled_train_data))
    );

interactive(children=(Dropdown(description='LABEL', options=(0, 1, 2, 3, 4, 5), value=0), IntSlider(value=5001…

### Reduce requests to 3D statements (intendee, intent, intended object)

* Reduce request's dimensionality and plot
    * transform to triple: (intendee, intent, intended object)
* Grammatical knowledge
    * Interrogative
        * What?
        * Why?
        * How? : know_how
        * Where? : get_location
        * Who?
    * declarative 
        * passive
        * active
    * exclamative
    * imperative

# References

(1) https://www.nltk.org/_modules/nltk/ccg/chart.html  
(2) https://github.com/dipanjanS/text-analytics-with-python/blob/master/New-Second-Edition/Ch07 - Text Similarity and Clustering/Ch07c - Document Clustering.ipynb   
(3) https://www.martechvibe.com/insights/staff-articles/how-intent-analysis-can-help-programmatic-advertising/#:~:text=Intent analysis is a step,user's intention behind the message.  