# INTENT CHARACTERIZATION

TO DO:  
* track pipeline with Airflow  
* **workload**: 
    * 4 hours

In [None]:
import pandas as pd
import time
import numpy as np

# visualization
from matplotlib import pyplot as plt

# preprocessing
import spacy
from spacy import tokenizer
from spacy.lang.en import English

# exploration
import re
from ipywidgets import interact
from ipywidgets import widgets

In [None]:
# display: cap the printed width of DataFrame columns at 10 characters
pd.options.display.max_colwidth = 10

In [None]:
# project root on the author's machine; both CSV paths are built from it
proj_path = "/Users/steeve_laquitaine/desktop/CodeHub/intent/intent/"
train_data_path = f"{proj_path}data/01_raw/banking77/train.csv"
test_data_path = f"{proj_path}data/01_raw/banking77/test.csv"

In [None]:
# load the training split into a DataFrame
train_data = pd.read_csv(filepath_or_buffer=train_data_path)

In [None]:
# preview the first five rows
train_data.head(n=5)

In [None]:
# how many distinct labels there are, followed by the labels themselves
print(f'{train_data["category"].nunique()} unique labels')
print(train_data["category"].unique())

In [None]:
# vertical bar chart of the number of rows per label, timed with a wall clock
tic = time.time()
train_data["category"].value_counts().plot.bar(figsize=(15, 2));
print(f'Took {np.round(time.time() - tic, 2)} secs')

In [None]:
# horizontal plot of the same per-label counts, timed the same way
tic = time.time()
train_data["category"].value_counts().plot(kind='barh', figsize=(15, 2));
print(f'Took {np.round(time.time() - tic, 2)} secs')

In [None]:
# labels ordered by how often they occur (value_counts sorts most frequent first)
intents_by_popularity = train_data["category"].value_counts().index.to_list()
intents_by_popularity[0:10]

In [None]:
# requests whose label is the most frequent one; show the first ten
top_label_data = train_data.loc[
    train_data["category"] == intents_by_popularity[0], "text"
]
top_label_data.head(10).tolist()

# Explore vocabulary

In [None]:
# join every request of the top label into a single space-separated string
data_string = " ".join(top_label_data)

In [None]:
# tokenize text
def tokenize_txt(data_string, tokenizer, English):
    """
    Split a string into tokens and report how many were produced

    parameters
    ----------
    data_string: str
        text to tokenize
    tokenizer:
        object exposing ``Tokenizer(vocab)``; the notebook passes the
        ``spacy.tokenizer`` module
    English:
        callable taking no argument whose result has a ``vocab`` attribute;
        the notebook passes ``spacy.lang.en.English``

    returns
    -------
    the object returned by the built tokenizer when called on ``data_string``
    (its length is printed as "<n> tokens")
    """
    vocab = English().vocab
    split_into_tokens = tokenizer.Tokenizer(vocab)
    doc = split_into_tokens(data_string)
    print(f"{len(doc)} tokens")
    return doc
tokens = tokenize_txt(data_string=data_string, tokenizer=tokenizer, English=English)

In [None]:
# show the first 100 tokens
print([token for token in tokens[:100]])

# Describe requests

* Clause features:

  1. **Type**  
    
        * interrogative (1)
            * closed
            * open
        * declarative
        * imperative (2)
            * wishes  
            * orders  
        * exclamative
        
  2. **Length**

  3. **Structural complexity (3)**  
        * simple  
        * compound    
        * complex  
        * compound-complex  

In [None]:
# requests of the most frequent intent, as a plain list; show the first ten
top_intent = intents_by_popularity[0]
top_intent_text = train_data.loc[train_data["category"] == top_intent, "text"].tolist()
top_intent_text[0:10]

### Sentence types

In [None]:
# either ? or ! or .
SENT_TYPE_PATTN = re.compile(r"[\?\!\.]")

def classify_mood(sentences):
    """
    Label every '?', '!' and '.' found in each sentence

    parameters
    ----------
    sentences: iterable of str
        texts to scan

    returns
    -------
    list of list of str
        one inner list per sentence, holding one label per matched mark in
        order of appearance: 'ask' for '?', 'wish-or-excl' for '!' and
        'state' for '.'; the inner list is empty when no mark is found
    """
    # the pattern only ever yields '?', '!' or '.', so anything that is not
    # '?' or '!' is a '.'
    label_of = {'?': 'ask', '!': 'wish-or-excl'}
    return [
        [label_of.get(mark, 'state') for mark in SENT_TYPE_PATTN.findall(sentence)]
        for sentence in sentences
    ]


def detect_sentence_type(df, sent_type:str) -> bool:
    """
    Tell whether a sentence type is present in a container

    parameters
    ----------
    df:
        container tested with ``in``; the callers in this notebook pass one
        row's ``sentence_type`` value, i.e. the list of labels built by
        ``classify_mood`` (not a DataFrame, despite the name)
    sent_type: str
        'state', 'ask', 'wish-or-excl'

    returns
    -------
    bool
        result of ``sent_type in df``
    """
    return sent_type in df

# label every request, then attach the labels to a copy of the raw data
sentence_type = classify_mood(train_data.text.tolist())
train_data_feat = train_data.assign(sentence_type=sentence_type)

In [None]:
# keep the rows whose sentence_type list contains the chosen "type"
TYPE = 'ask'
filtered = train_data_feat.loc[
    train_data_feat.apply(
        lambda row: detect_sentence_type(row.sentence_type, TYPE),
        axis=1,
    )
]
filtered

In [None]:
# globals read by the side-by-side viewer defined below
data = train_data_feat
sent_types = ['state', 'ask', 'wish-or-excl']

# button shown above the viewer
button = widgets.Button(description="Click Me!")
display(button)

@interact(WINDOW_START=(0, 100))
def show_text_SideBySide(WINDOW_START):
    """
    Show texts in dataframe side by side, one panel per sentence type

    Reads two notebook globals:
        sent_types: list of sentence-type labels, one panel each
        data: pd.DataFrame with a "text" and a "sentence_type" column

    INPUT:
    -----
    WINDOW_START: int
        position of the first row shown in every panel; 10 rows are shown

    RETURN:
    ------
    ipywidget object to display
        display(show_text_SideBySide( ... ))

    Side effect: sets the pandas option "display.max_colwidth" to None for
    the rest of the session.
    """
    # show entire text: None means "no limit"; the former -1 is deprecated
    # since pandas 1.0
    pd.set_option("display.max_colwidth", None)
    panels = []
    for sent_type in sent_types:
        panel = widgets.Output()
        # rows whose list of labels contains this sentence type
        has_type = data["sentence_type"].map(
            lambda labels: detect_sentence_type(labels, sent_type)
        )
        selected_df = data.loc[has_type, ["text"]].iloc[WINDOW_START:WINDOW_START + 10]
        # use the Output widget's own clear_output method: the bare
        # clear_output function is never imported in this notebook
        panel.clear_output(wait=True)
        with panel:
            # `display` is not imported here; presumably the IPython kernel
            # provides it as a builtin
            display(selected_df)
        panels.append(panel)
    return widgets.HBox(panels)

# on_click calls its callback with the clicked Button, which would otherwise be
# received as WINDOW_START. `interact` attaches its container widget to the
# function as `.widget`; that widget's `update` ignores positional arguments
# and re-runs the viewer with the current slider value.
button.on_click(show_text_SideBySide.widget.update)

In [None]:
# from IPython.display import clear_output
# button = widgets.Button(description="Click Me!")
# out = widgets.Output()
# display(button)
# display(out)

# def on_button_clicked(b):
#     with out:
#         clear_output(True)
#         print("Button clicked.",round(random.uniform(0, 1),3))

# button.on_click(on_button_clicked)


# References

(1) https://allthingslinguistic.com/post/160783915317/being-a-declarative-or-interrogative-or  
(2) https://oxford.universitypressscholarship.com/view/10.1093/acprof:oso/9780199283613.001.0001/acprof-9780199283613-chapter-6   
(3) Fareh, S., & Moussa, M. B. (2008). Pragmatic Functions of Interrogative Sentences in English: A Corpus-based Study. International Journal of Arabic-English Studies, 9(1), 145-164.   