In [1]:
import pandas as pd
from random import sample
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Load the labelled intent utterances from the CSV file.
data = pd.read_csv("intent1.csv")

# Print six randomly drawn rows as a quick look at the two columns.
preview_rows = data.sample(n=6)
print(preview_rows)

    intent_label                                        Description
12             1  substantially infirmary for centre operating_t...
19             1                trump hospital for spunk operation 
11             1        substantially hospital for spunk operation 
13             1      substantially infirmary for centre operation 
21             1              trump infirmary for centre operation 
9              1       substantially hospital for centre operation 


In [55]:
# Split the utterances and their intent labels into train and test sets.
# An integer test_size is an absolute number of rows, so 3 rows are held out.
# random_state pins the shuffle so the scores reported further down can be
# reproduced after a kernel restart instead of changing on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    data["Description"],
    data["intent_label"],
    test_size=3,
    random_state=42,
)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(29,) (3,) (29,) (3,)


TF-IDF

In [4]:
# Turn the raw sentences into TF-IDF feature vectors.
# The vocabulary and IDF weights are learned from the training sentences only;
# the test sentences are then projected into that same feature space.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [5]:
# Encode the intent labels as consecutive integers.
# The encoder is fitted on every label in the dataset rather than on the
# training labels alone: with such a small random hold-out, an intent can end
# up only in the test rows, and transforming a label the encoder has never
# seen raises a ValueError.
le = LabelEncoder().fit(data["intent_label"])
Y_train = le.transform(Y_train)
Y_test = le.transform(Y_test)

In [6]:
# Fit a support-vector classifier with default settings on the TF-IDF features
# (other models such as GBM or Random Forest may be used instead),
# then predict the intent of each held-out sentence.
model = SVC().fit(X_train, Y_train)
p = model.predict(X_test)



In [10]:
# Score the predictions. average="micro" is used because this is a multiclass
# problem: every instance contributes equally to the score, as opposed to the
# macro average where every class contributes equally.
micro_f1 = f1_score(Y_test, p, average="micro")
accuracy = accuracy_score(Y_test, p)
print("f1_score:", micro_f1)
print("accuracy_score:", accuracy)

f1_score: 0.6666666666666666
accuracy_score: 0.6666666666666666


Word2Vec

In [46]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [47]:
# read data
data = pd.read_csv("intent1.csv")

# Split data into train and test. An integer test_size is an absolute number
# of rows, so 6 rows are held out. random_state pins the shuffle so the scores
# reported at the end of this section are reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    data["Description"],
    data["intent_label"],
    test_size=6,
    random_state=42,
)

# Label encoding for the different categories of intents.
# Fitted on all labels (not only the training ones) so that an intent that
# happens to appear only in the held-out rows does not make transform() raise.
le = LabelEncoder().fit(data["intent_label"])
Y_train = le.transform(Y_train)
Y_test = le.transform(Y_test)

In [48]:
# Materialise both splits as plain Python lists of sentence strings; the Keras
# tokenizer in the next cell consumes them in this form.
X_train = list(X_train)
X_test = list(X_test)

# Train word vectors on the training sentences.
# Word2Vec expects an iterable of token lists. Handing it raw strings makes it
# iterate each sentence character by character and learn vectors for single
# characters instead of words, so every sentence is split into words first.
# text_to_word_sequence applies the same lower-casing and punctuation
# filtering as the default Keras Tokenizer, which keeps the Word2Vec
# vocabulary aligned with the tokenizer's word index used for the lookups.
tokenized_train = [text_to_word_sequence(sent) for sent in X_train]
word_vecs = Word2Vec(tokenized_train)
print("Word vectors trained")

# prune each sentence to maximum of 20 words.
max_sent_len = 20

Word vectors trained


In [49]:
# Build the word -> integer index from the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
vocab_size = len(word_index)

# Convert every sentence into its sequence of word indices.
sequences = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

# Bring all sequences to a common length of max_sent_len (20):
# shorter sentences are padded with zeroes, longer ones are pruned.
x = pad_sequences(sequences, maxlen=max_sent_len)
X_test = pad_sequences(sequences_test, maxlen=max_sent_len)

# Zero-initialised embedding table with vocab_size + 1 rows (one per word
# index, plus row 0) and 100 columns, 100 being the size of a word vector.
embedding_matrix = np.zeros(shape=(vocab_size + 1, 100))

In [50]:
# Fill the embedding matrix used by the CNN model: row i holds the trained
# word vector of the word whose tokenizer index is i, so there is one row for
# each word in the training set.
for word, i in word_index.items():
    try:
        # Look the vector up through the model's keyed vectors (.wv).
        embedding_matrix[i] = word_vecs.wv[word]
    except KeyError:
        # The word has no trained vector (not in the Word2Vec vocabulary);
        # its row keeps the all-zero initialisation.
        continue
print("Embeddings done")
vocab_size = len(embedding_matrix)

# The CNN model needs the multiclass labels as a one-hot encoding:
# one column per label, set to one in the column of the sample's label.
y = to_categorical(np.asarray(Y_train))

# Embedding lookup initialised from the word-vector matrix; it stays trainable
# so the vectors can be adjusted while the classifier is fitted.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=max_sent_len,
    trainable=True,
)
sequence_input = Input(shape=(max_sent_len,), dtype="int32")

# Stack the word vectors of a sentence into a matrix, so each matrix
# represents a sentence and each of its rows is one word (word vector).
embedded_sequences = embedding_layer(sequence_input)

  


Embeddings done


In [51]:
# Build the convolutional model: one Conv1D + max-pooling stage over the
# embedded sentence, flattened into a dense hidden layer, followed by a
# softmax layer with one unit per one-hot label column.
l_cov1 = Conv1D(128, 4, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(4)(l_cov1)
l_flat = Flatten()(l_pool1)
hidden = Dense(100, activation='relu')(l_flat)
preds = Dense(y.shape[1], activation='softmax')(hidden)
model = Model(sequence_input, preds)

# The output is a softmax over mutually exclusive intents trained against
# one-hot targets, so the matching loss is categorical cross-entropy.
# binary_crossentropy would score every output unit as an independent yes/no
# decision, which does not fit a softmax multiclass head.
model.compile(loss='categorical_crossentropy', optimizer='Adam')

print("model fitting - simplified convolutional neural network")
model.summary()

model fitting - simplified convolutional neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 20)                0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 20, 100)           2800      
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 17, 128)           51328     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 4, 128)            0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 512)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 100)               51300     
_________________________________________________________________
dense_14 (Dense)    

In [54]:
# Train the CNN on the padded training sequences and their one-hot labels.
model.fit(x, y, epochs=20, batch_size=128)

# Predict the held-out sentences: the model returns one score per label for
# each sentence, and the index of the largest score is the predicted label.
class_probs = model.predict(X_test)
p = [np.argmax(row) for row in class_probs]

# Score the predictions against the encoded test labels.
score_cnn = f1_score(Y_test, p, average="micro")
cnn_accuracy = accuracy_score(Y_test, p)
print("accuracy_score:", cnn_accuracy)
print("f1_score:", score_cnn)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
accuracy_score: 0.6666666666666666
f1_score: 0.6666666666666666
