In [1]:
# READ DATA
import spacy
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,intent,text
0,greet,hi
1,greet,hello
2,greet,morning!
3,goodbye,bye!
4,goodbye,see you


In [9]:
import numpy as np

# PRE-PROCESSING
X_as_text = df['text']
y_as_text = df['intent']

# Tokenize the words for TRAIN
nlp = spacy.load('en_core_web_lg')

n_queries=len(X_as_text)
dim_embedding = nlp.vocab.vectors_length
X = np.zeros((n_queries, dim_embedding))

for idx, sentence in enumerate(X_as_text):
    doc = nlp(str(sentence))
    X[idx, :] = doc.vector
    
# Encode labels
le = LabelEncoder()
y = le.fit_transform(y_as_text)

print(le.classes_)

[1 1 1 0 0 2 2 2] ['goodbye' 'greet' 'restaurant_search']


In [12]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0).fit(X, y)

classifier.predict(np.array(nlp('hi there').vector).reshape(1, -1))
classifier.predict_proba(np.array(nlp('hi there').vector).reshape(1, -1))




array([[0.14679797, 0.58958513, 0.2636169 ]])

In [57]:
# Search for the best model
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

defaults = {                              
    "C": [1, 2, 5, 10, 20, 100],
    "kernels": ["linear"], "max_cross_validation_folds": 5
}

C = defaults["C"]                       
kernels = defaults["kernels"]

tuned_parameters = [{"C": C, "kernel": [str(k) for k in kernels]}]                                                                 
folds = defaults["max_cross_validation_folds"]
cv_splits = max(2, min(folds, np.min(np.bincount(y)) // 5))

svc = svm.SVC(C=1, probability=True, class_weight='balanced')
clf = GridSearchCV( 
    svc,
    param_grid=tuned_parameters, n_jobs=1, cv=cv_splits,         
    scoring='f1_weighted', verbose=1)
clf.fit(X, y)

print(sorted(clf.cv_results_.keys()))
print("Accuracy on Test Set: ",np.count_nonzero(clf.predict(X_test)==y_test)/len(y_test))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_C', 'param_kernel', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
Accuracy on Test Set:  1.0


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished


In [64]:
# Intent classification

X_test = np.array(nlp('hi there').vector).reshape(1, -1)                   
pred_result = clf.predict(X_test)
print(pred_result)

le.inverse_transform(pred_result)

[1]


array(['greet'], dtype='<U17')

In [72]:
# Extract Entities

# Option 1 - Use spaCy, it should be sufficient to what we want to do
# but for sure we are going to need to train it better for improved outcome --> https://spacy.io/usage/training#training-data
# In the example below you can see that it did not recognize restaurant as a NOUN
doc = nlp('Suggest me some mexican restaurant')

for ent in doc.ents:
    print(ent, ent.label_)


# Option 2 - A specialized-recognizer by using sklearn https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html

mexican NORP
