Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import confusion_matrix, f1_score
from tensorflow.keras import models, layers, optimizers, losses, callbacks

Import Dataset

In [2]:
data_dir = "/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/"

dataset = data_dir + "Question_Classification_Dataset.csv"

Create dataframe

In [3]:
df = pd.read_csv(dataset)
print(df.head)
print(df.columns)

<bound method NDFrame.head of       Unnamed: 0                                          Questions  \
0              0  How did serfdom develop in and then leave Russ...   
1              1   What films featured the character Popeye Doyle ?   
2              2  How can I find a list of celebrities ' real na...   
3              3  What fowl grabs the spotlight after the Chines...   
4              4                    What is the full form of .com ?   
...          ...                                                ...   
5447        5447            What 's the shape of a camel 's spine ?   
5448        5448           What type of currency is used in China ?   
5449        5449                    What is the temperature today ?   
5450        5450              What is the temperature for cooking ?   
5451        5451               What currency is used in Australia ?   

         Category0 Category1 Category2  
0      DESCRIPTION      DESC    manner  
1           ENTITY      ENTY    cre

In [4]:
df = df.drop(columns = ['Unnamed: 0', 'Category1', 'Category2'])
df.head

<bound method NDFrame.head of                                               Questions     Category0
0     How did serfdom develop in and then leave Russ...   DESCRIPTION
1      What films featured the character Popeye Doyle ?        ENTITY
2     How can I find a list of celebrities ' real na...   DESCRIPTION
3     What fowl grabs the spotlight after the Chines...        ENTITY
4                       What is the full form of .com ?  ABBREVIATION
...                                                 ...           ...
5447            What 's the shape of a camel 's spine ?        ENTITY
5448           What type of currency is used in China ?        ENTITY
5449                    What is the temperature today ?       NUMERIC
5450              What is the temperature for cooking ?       NUMERIC
5451               What currency is used in Australia ?        ENTITY

[5452 rows x 2 columns]>

One Hot Encoding

In [5]:
y = pd.get_dummies(df['Category0'])
print(y)

      ABBREVIATION  DESCRIPTION  ENTITY  HUMAN  LOCATION  NUMERIC
0            False         True   False  False     False    False
1            False        False    True  False     False    False
2            False         True   False  False     False    False
3            False        False    True  False     False    False
4             True        False   False  False     False    False
...            ...          ...     ...    ...       ...      ...
5447         False        False    True  False     False    False
5448         False        False    True  False     False    False
5449         False        False   False  False     False     True
5450         False        False   False  False     False     True
5451         False        False    True  False     False    False

[5452 rows x 6 columns]


In [6]:
class_names = list(y.columns)
class_names

['ABBREVIATION', 'DESCRIPTION', 'ENTITY', 'HUMAN', 'LOCATION', 'NUMERIC']

In [7]:
# Remove html tags
def removeHTML(sentence):
    regex = re.compile('<.*?>')
    return re.sub(regex, ' ', sentence)

# Remove URLs
def removeURL(sentence):
    regex = re.compile('http[s]?://\S+')
    return re.sub(regex, ' ', sentence)

# remove numbers, punctuation and any special characters (keep only alphabets)
def onlyAlphabets(sentence):
    regex = re.compile('[^a-zA-Z]')
    return re.sub(regex, ' ', sentence)

In [8]:
sno = nltk.stem.SnowballStemmer('english')    # Initializing stemmer
wordcloud = [[], [], [], [], [], [], []]
all_sentences = []    # All cleaned sentences


for x in range(len(df['Questions'].values)):
    question = df['Questions'].values[x]
    classname = df['Category0'].values[x]

    cleaned_sentence = []
    sentence = removeURL(question) 
    sentence = removeHTML(sentence)
    sentence = onlyAlphabets(sentence)
    sentence = sentence.lower()   

    for word in sentence.split():
        #if word not in stop:
            stemmed = sno.stem(word)
            cleaned_sentence.append(stemmed)
            
            wordcloud[class_names.index(classname)].append(word)
            

    all_sentences.append(' '.join(cleaned_sentence))

# add as column in dataframe
X = all_sentences

Split dataset

In [9]:
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size = 0.2)

In [10]:
vocab = 1500
mlen = 200
 
tokenizer = Tokenizer(num_words = vocab, oov_token = '<UNK>')
tokenizer.fit_on_texts(Xtrain)
 
Xtrain = tokenizer.texts_to_sequences(Xtrain)
Xtrain = pad_sequences(Xtrain, maxlen=mlen)

Xval = tokenizer.texts_to_sequences(Xval)
Xval = pad_sequences(Xval, maxlen=mlen)

In [12]:
# Build and train neural network
embedding_dim = 128
 
model = models.Sequential([
    layers.Embedding(vocab, embedding_dim, input_length = mlen),
    layers.LSTM(128, activation='tanh'),
    layers.Dense(64, activation = 'relu'),
    layers.Dense(32, activation = 'relu'),
    layers.Dense(len(class_names), activation = 'softmax')
])
 
cb = [callbacks.EarlyStopping(patience = 5, restore_best_weights = True)]

model.compile(optimizer = optimizers.Adam(0.01), loss = losses.CategoricalCrossentropy(), metrics = ['accuracy'])
history = model.fit(Xtrain, ytrain, batch_size=64, epochs = 256, validation_data=(Xval, yval), callbacks = cb)

Epoch 1/256




[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 167ms/step - accuracy: 0.3773 - loss: 1.4590 - val_accuracy: 0.7754 - val_loss: 0.7339
Epoch 2/256
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 164ms/step - accuracy: 0.8446 - loss: 0.5135 - val_accuracy: 0.8213 - val_loss: 0.5634
Epoch 3/256
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 166ms/step - accuracy: 0.9069 - loss: 0.3184 - val_accuracy: 0.8249 - val_loss: 0.6148
Epoch 4/256
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 169ms/step - accuracy: 0.9397 - loss: 0.2086 - val_accuracy: 0.8579 - val_loss: 0.5665
Epoch 5/256
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 169ms/step - accuracy: 0.9565 - loss: 0.1570 - val_accuracy: 0.8570 - val_loss: 0.5364
Epoch 6/256
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 166ms/step - accuracy: 0.9653 - loss: 0.1214 - val_accuracy: 0.8478 - val_loss: 0.6215
Epoch 7/256
[1m69/69[0m [

In [13]:
model.evaluate(Xval, yval)

print("F1 score: ", f1_score(np.argmax(yval.to_numpy(), axis = 1), np.argmax(model.predict(Xval), axis = 1), average = 'weighted'))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 0.8619 - loss: 0.5147
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step
F1 score:  0.8568663544140677
