## Travel Domain Question Classification
Following program classifies questions related to travel domain using LSTM.

Imports the required libraries and the data file

In [1]:
import pandas as pd
import re
import numpy as np
import nltk
import spacy
from nltk.corpus import stopwords
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

nlp = spacy.load('en')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
!pip install fasttext

print('----- Importing dataset -----')
d_file = open('5000TravelQuestionsDataset.csv', encoding="latin-1")

df = pd.read_csv(d_file, header=None)
df.columns = ['text', 'class1', 'class2']

print ('Training Data : Imported Rows, Columns - ', df.shape)
print ('Data Preview :')
df.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████████████████████████████████| 71kB 3.3MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3043136 sha256=b6a1191eacd146bc9d221002a4920d3cd6f753d8e1ecd75b78c3fe79cd24018e
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154b75231136cc3a3321ab0e30f592
Successfully built fasttext
Installing collected packages:

Unnamed: 0,text,class1,class2
0,What are the special things we (husband and me...,TTD,TTDSIG
1,What are the companies which organize shark fe...,TTD,TTDOTH
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA
3,What are the best places around Cape Town for ...,TTD,TTDSIG
4,What are the best places to stay for a family ...,ACM,ACMOTH


The following section performs the text normalizing steps by converting text to lower case, removing leading and trailing whitespaces, removing stop words and also lematizing.

In [None]:
# Text normalization steps
import string

# Function to remove stop words
def remove_stopwords(text):
  text_tokens = nltk.word_tokenize(text) 
  filtered_sentence = [word for word in text_tokens if not word in stopwords.words()] 
  return " ".join(filtered_sentence)

# Converting to lower case 
df['processed_text'] = df['text'].str.lower()
# Removing punctuations
translator = str.maketrans('', '', string.punctuation)
df['processed_text'] = df['processed_text'].str.translate(translator)
# Removing leading ending white spaces
df['processed_text'] = df['processed_text'].str.strip()
# Remove stop words
df['processed_text'] = df.processed_text.apply(remove_stopwords)

lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
  lemmatized = [lemmatizer.lemmatize(word, pos="v") for word in nltk.word_tokenize(text)]
  return ' '.join(lemmatized)

df['processed_text'] = df.processed_text.apply(lemmatize_text)



In [8]:
# Removing leading ending white spaces
df['class1'] = df['class1'].str.strip()
df['class2'] = df['class2'].str.strip()

df.head()

Unnamed: 0,text,class1,class2,processed_text
0,What are the special things we (husband and me...,TTD,TTDSIG,special things husband 5 day stay cape town
1,What are the companies which organize shark fe...,TTD,TTDOTH,company organize shark feed events scuba divers
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA,safe female traveller go alone cape town
3,What are the best places around Cape Town for ...,TTD,TTDSIG,best place around cape town safari
4,What are the best places to stay for a family ...,ACM,ACMOTH,best place stay family stay away nightlife


# Data Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y_encoded = le.fit_transform(df['class1'])
y_encoded_1 = le.fit_transform(df['class2'])

In [11]:
max_words = 5000
max_len = 25
tok = Tokenizer(num_words=max_words, split=' ')
tok.fit_on_texts(df.processed_text.values)
seqs = tok.texts_to_sequences(df.processed_text.values)
seqs_mat = sequence.pad_sequences(seqs,maxlen=max_len)

In [32]:
seqs_mat.shape

(5000, 25)

# Model

In [17]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import SpatialDropout1D
def MODEL_LSTM():
    model = Sequential()
    model.add(Embedding(5000, 160, input_length=seqs_mat.shape[1]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(7, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [14]:

y_mat = pd.get_dummies(df['class1']).values
y_mat.shape

(5000, 7)

In [33]:
y_mat_1 = pd.get_dummies(df['class2']).values
y_mat_1.shape

(5000, 63)

# K-Fold classification and Accuracy Report

In [71]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def accuracy_report(y_test, y_pred):
    
    print(classification_report(y_test,y_pred))

    acc = accuracy_score(y_test, y_pred)*100
    print('Accuracy : %.3f' % acc)

    f1 =  f1_score(y_test, y_pred, average='weighted')
    print('F1 Score: %.3f' % f1)

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: \n{}".format(cm))
    
    return f1
    


In [23]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def accuracy_summary(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred)*100
    print('Accuracy score: %.3f' % acc)

    f1 =  f1_score(y_test, y_pred, average='weighted')
    print('F1 Score: %.3f' % f1)

    return f1

In [72]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(seqs):
  fold += 1
  X_train, X_test = seqs_mat[train_index], seqs_mat[test_index]
  y_train, y_test = y_mat[train_index], y_mat[test_index]

  model = MODEL_LSTM()
  model.fit(X_train, y_train, epochs=5, batch_size=64,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

  predictions = model.predict(X_test)

  fine_pred = [np.argmax(p) for p in predictions]
  fine_gt = [np.argmax(p) for p in y_test]
  f1 = accuracy_report(fine_pred, fine_gt)

  accuracies.append(f1)
  
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.77      0.81      0.78        77
           1       0.60      0.65      0.63        23
           2       0.80      0.87      0.83        52
           3       0.82      0.74      0.78       118
           4       0.93      0.86      0.89       104
           5       0.75      0.80      0.78       111
           6       0.72      0.87      0.79        15

    accuracy                           0.80       500
   macro avg       0.77      0.80      0.78       500
weighted avg       0.80      0.80      0.80       500

Accuracy : 80.000
F1 Score: 0.801
Confusion Matrix: 
[[62  0  2  7  1  5  0]
 [ 2 15  2  0  0  3  1]
 [ 4  3 45  0  0  0  0]
 [ 5  1  5 87  3 15  2]
 [ 3  2  0  5 89  5  0]
 [ 5  4  2  6  3 89  2]
 [ 0  0  0  1  0  1 13]]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.88   

New model LSTM

In [69]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import SpatialDropout1D
def NEW_MODEL_LSTM():
    model = Sequential()
    model.add(Embedding(5000, 160, input_length=seqs_mat_1.shape[1]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(63, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [73]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(seqs_1):
  fold += 1
  X_train, X_test = seqs_mat[train_index], seqs_mat[test_index]
  y_train, y_test = y_mat_1[train_index], y_mat_1[test_index]

  model = NEW_MODEL_LSTM()
  model.fit(X_train, y_train, epochs=5, batch_size=128,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

  predictions = model.predict(X_test)

  fine_pred = [np.argmax(p) for p in predictions]
  fine_gt = [np.argmax(p) for p in y_test]
  f1 = accuracy_report(fine_pred, fine_gt)

  accuracies.append(f1)
  
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.47      0.73      0.58        26
           5       0.77      0.59      0.67        29
           6       0.71      0.62      0.67         8
           7       0.00      0.00      0.00         0
           8       0.38      0.20      0.26        15
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.84      0.54      0.66        39
          15       0.50      0.29      0.36         7
          16       0.00      0.00      0.00         0
          17       0.00    