# Travel Domain Question Classification
The following program classifies travel-domain questions using fastText word embeddings.

Import the required libraries and load the data file.

In [1]:
import pandas as pd
import re
import numpy as np
import nltk
import spacy
from nltk.corpus import stopwords

nlp = spacy.load('en')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
!pip install fasttext

print('----- Importing dataset -----')
# Read the CSV inside a context manager so the file handle is closed as soon as
# pandas has consumed it (the original left the handle open for the whole session).
with open('5000TravelQuestionsDataset.csv', encoding="latin-1") as d_file:
    df = pd.read_csv(d_file, header=None)

# The file has no header row: question text, coarse class, fine class.
df.columns = ['text', 'class1', 'class2']

print ('Training Data : Imported Rows, Columns - ', df.shape)
print ('Data Preview :')
df.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████████████████████████████████| 71kB 3.5MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3043117 sha256=38587789c4c9547f2ae45784ec729fcd081f77fa7e8b9bea68df635c35766338
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154b75231136cc3a3321ab0e30f592
Successfully built fasttext
Installing collected packages:

Unnamed: 0,text,class1,class2
0,What are the special things we (husband and me...,TTD,TTDSIG
1,What are the companies which organize shark fe...,TTD,TTDOTH
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA
3,What are the best places around Cape Town for ...,TTD,TTDSIG
4,What are the best places to stay for a family ...,ACM,ACMOTH


The following section normalizes the text: it converts the text to lower case, removes punctuation and leading/trailing whitespace, removes stop words, and lemmatizes the remaining words.

In [2]:
# Text normalization steps
import string

# Function to remove stop words
def remove_stopwords(text, stop_words=None):
  """Return ``text`` with stop words removed.

  The text is split with ``nltk.word_tokenize`` and the surviving tokens are
  re-joined with single spaces.

  Args:
    text: the string to filter.
    stop_words: optional iterable of words to drop. When omitted, the list
      returned by ``stopwords.words()`` is used. That call passes no language,
      so NLTK returns the words of every language file in the corpus; pass
      e.g. ``stopwords.words('english')`` to restrict the filter.

  Returns:
    The filtered text as a single space-separated string.
  """
  if stop_words is None:
    # Load the default list once and keep it on the function object: reading
    # the corpus for every token made this step needlessly slow.
    cached = getattr(remove_stopwords, "_default_stop_words", None)
    if cached is None:
      cached = frozenset(stopwords.words())
      remove_stopwords._default_stop_words = cached
    stop_words = cached
  elif not isinstance(stop_words, (set, frozenset)):
    # Hash-based lookup instead of a linear scan per token.
    stop_words = frozenset(stop_words)

  text_tokens = nltk.word_tokenize(text)
  return " ".join(word for word in text_tokens if word not in stop_words)

# Translation table that deletes every ASCII punctuation character
translator = str.maketrans('', '', string.punctuation)

# Build the cleaned column in one chain:
# lower-case -> drop punctuation -> trim surrounding whitespace -> drop stop words
df['processed_text'] = (
    df['text']
    .str.lower()
    .str.translate(translator)
    .str.strip()
    .apply(remove_stopwords)
)

# Shared WordNet lemmatizer instance used by lemmatize_text
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
  """Lemmatize every token of ``text`` as a verb and re-join with spaces."""
  words = nltk.word_tokenize(text)
  lemmas = []
  for word in words:
    # pos="v" asks the WordNet lemmatizer to treat each token as a verb
    lemmas.append(lemmatizer.lemmatize(word, pos="v"))
  return ' '.join(lemmas)

# Replace the cleaned text with its lemmatized form
df['processed_text'] = df['processed_text'].apply(lemmatize_text)

# Trim leading/trailing whitespace from both label columns
for label_column in ('class1', 'class2'):
    df[label_column] = df[label_column].str.strip()

df.head()


Unnamed: 0,text,class1,class2,processed_text
0,What are the special things we (husband and me...,TTD,TTDSIG,special things husband 5 day stay cape town
1,What are the companies which organize shark fe...,TTD,TTDOTH,company organize shark feed events scuba divers
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA,safe female traveller go alone cape town
3,What are the best places around Cape Town for ...,TTD,TTDSIG,best place around cape town safari
4,What are the best places to stay for a family ...,ACM,ACMOTH,best place stay family stay away nightlife


In [3]:
def tokenize(text):
    """Run ``text`` through the spaCy pipeline and return its non-stop tokens as strings."""
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(str(token))
    return kept

In [4]:
# One list of spaCy tokens per question, in DataFrame row order
tokenized = list(map(tokenize, df['processed_text']))

# FastText

In [6]:
import fasttext.util

# Fetch the pre-trained English fastText vectors (the recorded output shows
# cc.en.300.bin.gz being downloaded).
# NOTE(review): if_exists='ignore' presumably skips the download when the file is
# already present — confirm against the fasttext.util documentation.
fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the downloaded binary model from the working directory
ft = fasttext.load_model('cc.en.300.bin')

# Display the embedding dimensionality (300 in the recorded output)
ft.get_dimension()

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz





300

In [7]:
def get_sentence_embedding(wordlist):
  """Return the fastText sentence vector for ``wordlist``.

  Thin wrapper around ``ft.get_sentence_vector``; the argument is passed
  through unchanged.
  """
  return ft.get_sentence_vector(wordlist)

In [8]:
# One vector per question: the mean of the fastText vectors of its tokens.
# A question whose tokens were all filtered out would otherwise yield np.mean of
# an empty array (a NaN scalar plus a warning) and break the rectangular shape of
# the feature matrix, so it falls back to a zero vector of the model's dimension.
embeddings = [
    np.mean(np.array([get_sentence_embedding(word) for word in words]), axis=0)
    if len(words) > 0
    else np.zeros(ft.get_dimension(), dtype=np.float32)
    for words in tokenized
]


In [10]:
# Stack the per-question mean vectors into a single feature matrix
# (5000 rows x 300 columns in the recorded output).
x_encoded = np.array(embeddings)
x_encoded.shape

(5000, 300)

In [12]:
from sklearn.preprocessing import LabelEncoder
# Use one encoder per label column. Re-fitting a single encoder on class2
# discards the class1 mapping, so the integer labels in the class1 report could
# no longer be translated back with inverse_transform.
le_class1 = LabelEncoder()
le_class2 = LabelEncoder()

y_encoded = le_class1.fit_transform(df['class1'])
y_encoded_1 = le_class2.fit_transform(df['class2'])

# Backward-compatible alias: `le` previously ended up fitted on class2.
le = le_class2

# K-Fold classification and Accuracy Report

In [36]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def calculate_accuracy(encoded_x, encoded_y, n_splits=10, random_state=1, return_all_folds=False):
  """Evaluate a linear-kernel SVM with shuffled K-fold cross-validation.

  Prints the accuracy (in percent) of every fold, followed by the mean and
  standard deviation over all folds.

  Args:
    encoded_x: array of feature rows, indexable with an integer index array.
    encoded_y: array of encoded labels aligned with ``encoded_x``.
    n_splits: number of folds (default 10).
    random_state: seed used for shuffling before splitting (default 1).
    return_all_folds: when False (default) only the labels and predictions of
      the final fold are returned, as before. When True the held-out labels
      and predictions of every fold are concatenated, so each sample appears
      exactly once and a report built from them covers the whole dataset.

  Returns:
    Tuple ``(y_test, predictions_y)`` of true and predicted labels.
  """
  cv = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
  accuracies = []
  held_out_true, held_out_pred = [], []
  for fold, (train_index, test_index) in enumerate(cv.split(encoded_x), start=1):
    X_train, X_test = encoded_x[train_index], encoded_x[test_index]
    y_train, y_test = encoded_y[train_index], encoded_y[test_index]
    # degree and gamma are kept for parity with the original call; the linear
    # kernel does not use them.
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train, y_train)
    predictions_y = SVM.predict(X_test)
    # accuracy_score expects (y_true, y_pred)
    acc = accuracy_score(y_test, predictions_y)*100
    accuracies.append(acc)
    held_out_true.append(y_test)
    held_out_pred.append(predictions_y)
    print("K-Fold: {} - {} - {:.2f}".format(fold, "Accuracy: ",acc))

  print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))
  if return_all_folds:
    return np.concatenate(held_out_true), np.concatenate(held_out_pred)
  # Default: labels and predictions of the last fold only.
  return y_test, predictions_y

In [14]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def accuracy_report(y_test, y_pred):
    """Print a classification report, accuracy (%), weighted F1 and confusion matrix.

    Args:
        y_test: true labels.
        y_pred: predicted labels, aligned with ``y_test``.
    """
    # Per-class precision / recall / F1 table
    per_class_table = classification_report(y_test, y_pred)
    print(per_class_table)

    # Overall accuracy, expressed as a percentage
    accuracy_pct = accuracy_score(y_test, y_pred)*100
    print('Accuracy : %.3f' % accuracy_pct)

    # F1 averaged over classes, weighted by class support
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    print('F1 Score: %.3f' % weighted_f1)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: \n{}".format(matrix))

In [37]:
# Cross-validate on the class1 labels. y_return holds (y_test, predictions_y)
# as returned by calculate_accuracy, i.e. the values of the last fold.
y_return = calculate_accuracy(x_encoded, y_encoded)

K-Fold: 1 - Accuracy:  - 75.80
K-Fold: 2 - Accuracy:  - 77.00
K-Fold: 3 - Accuracy:  - 76.40
K-Fold: 4 - Accuracy:  - 77.60
K-Fold: 5 - Accuracy:  - 77.60
K-Fold: 6 - Accuracy:  - 77.40
K-Fold: 7 - Accuracy:  - 75.20
K-Fold: 8 - Accuracy:  - 78.00
K-Fold: 9 - Accuracy:  - 78.40
K-Fold: 10 - Accuracy:  - 79.00
Mean 77.24 Std 1.11


In [38]:
# Detailed report for class1. It is computed from y_return, so it covers only the
# last fold's test split (support 500 in the recorded output), not all folds.
accuracy_report(y_return[0], y_return[1])

              precision    recall  f1-score   support

           1       0.86      0.79      0.82        76
           2       0.80      0.35      0.48        23
           3       0.94      0.82      0.87        60
           4       0.78      0.76      0.77       112
           6       0.86      0.88      0.87        98
           7       0.64      0.82      0.72       114
           9       1.00      0.82      0.90        17

    accuracy                           0.79       500
   macro avg       0.84      0.75      0.78       500
weighted avg       0.80      0.79      0.79       500

Accuracy : 79.000
F1 Score: 0.789
Confusion Matrix: 
[[60  0  2  6  0  8  0]
 [ 1  8  0  3  1 10  0]
 [ 1  1 49  2  0  7  0]
 [ 3  0  0 85  6 18  0]
 [ 0  1  0  3 86  8  0]
 [ 4  0  1  9  7 93  0]
 [ 1  0  0  1  0  1 14]]


In [39]:
# Repeat the cross-validation on the class2 labels; y_return is overwritten with
# the (y_test, predictions_y) pair of the last fold of this run.
y_return = calculate_accuracy(x_encoded, y_encoded_1)

K-Fold: 1 - Accuracy:  - 53.80
K-Fold: 2 - Accuracy:  - 52.20
K-Fold: 3 - Accuracy:  - 52.80
K-Fold: 4 - Accuracy:  - 52.40
K-Fold: 5 - Accuracy:  - 52.20
K-Fold: 6 - Accuracy:  - 52.00
K-Fold: 7 - Accuracy:  - 50.20
K-Fold: 8 - Accuracy:  - 53.40
K-Fold: 9 - Accuracy:  - 53.80
K-Fold: 10 - Accuracy:  - 52.20
Mean 52.50 Std 1.00


In [40]:
# Detailed report for class2, again based only on the last fold's test split.
accuracy_report(y_return[0], y_return[1])

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         8
           7       0.48      0.70      0.57        40
           8       0.47      0.38      0.42        21
          10       1.00      0.20      0.33         5
          11       0.00      0.00      0.00         1
          12       1.00      0.38      0.55         8
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         7
          16       0.00      0.00      0.00         3
          18       0.50      0.82      0.62        22
          20       1.00      0.56      0.72        16
          21       0.00      0.00      0.00         3
          22       0.00      0.00      0.00         2
          23       0.50      1.00      0.67         1
          24       0.00      0.00      0.00         4
          25       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
