# Question Classification Notebook
@ author: Hatem Trigui


## Import libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\la7tim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\la7tim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\la7tim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Loading and Parsing

In [2]:
def load_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            # Split label and question
            label, question = line.strip().split(' ', 1)
            data.append({"question": question, "label": label})
    return pd.DataFrame(data)

train_set = load_data(".\data\\train_set5.txt")
test_set = load_data(".\data\\test_set.txt")

print(train_set.head())


                                            question        label
0  How did serfdom develop in and then leave Russ...  DESC:manner
1   What films featured the character Popeye Doyle ?  ENTY:cremat
2  How can I find a list of celebrities ' real na...  DESC:manner
3  What fowl grabs the spotlight after the Chines...  ENTY:animal
4                    What is the full form of .com ?     ABBR:exp


## Text Preprocessing

1. Lowercased text.
2. Removed special characters and numbers using regex.
3. Tokenized text into words using nltk.
4. Removed stopwords 
5. Applied lemmatization for text normalization.

In [3]:
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

train_set['processed_question'] = train_set['question'].apply(preprocess_text)
test_set['processed_question'] = test_set['question'].apply(preprocess_text)


In [4]:
train_set['processed_question'].head()

0               serfdom develop leave russia
1       film featured character popeye doyle
2              find list celebrity real name
3    fowl grab spotlight chinese year monkey
4                              full form com
Name: processed_question, dtype: object

## Label Splitting:

Split labels (e.g., DESC:manner) into category (DESC) and specific_type (manner), making the structure of your labels more granular.

In [5]:
train_set[['category', 'specific_type']] = train_set['label'].str.split(':', expand=True)
test_set[['category', 'specific_type']] = test_set['label'].str.split(':', expand=True)

print(train_set[['label', 'category', 'specific_type']].head())


         label category specific_type
0  DESC:manner     DESC        manner
1  ENTY:cremat     ENTY        cremat
2  DESC:manner     DESC        manner
3  ENTY:animal     ENTY        animal
4     ABBR:exp     ABBR           exp


## Encoding:

Used LabelEncoder to convert textual labels (category, specific_type, and combined labels)

In [6]:
category_encoder = LabelEncoder()
specific_type_encoder = LabelEncoder()

train_set['category_encoded'] = category_encoder.fit_transform(train_set['category'])
train_set['specific_type_encoded'] = specific_type_encoder.fit_transform(train_set['specific_type'])

test_set['category_encoded'] = category_encoder.transform(test_set['category'])
test_set['specific_type_encoded'] = specific_type_encoder.transform(test_set['specific_type'])

print(train_set[['category', 'specific_type', 'category_encoded', 'specific_type_encoded']].head())


  category specific_type  category_encoded  specific_type_encoded
0     DESC        manner                 1                     23
1     ENTY        cremat                 2                      8
2     DESC        manner                 1                     23
3     ENTY        animal                 2                      1
4     ABBR           exp                 0                     16


In [7]:
train_set['combined_label'] = train_set['category'] + "_" + train_set['specific_type']
test_set['combined_label'] = test_set['category'] + "_" + test_set['specific_type']

combined_label_encoder = LabelEncoder()
train_set['combined_label_encoded'] = combined_label_encoder.fit_transform(train_set['combined_label'])
test_set['combined_label_encoded'] = combined_label_encoder.transform(test_set['combined_label'])

print(train_set[['combined_label', 'combined_label_encoded']].head())


  combined_label  combined_label_encoded
0    DESC_manner                       4
1    ENTY_cremat                       9
2    DESC_manner                       4
3    ENTY_animal                       6
4       ABBR_exp                       1


In [10]:
train_set.head()

Unnamed: 0,question,label,processed_question,category,specific_type,category_encoded,specific_type_encoded,combined_label,combined_label_encoded
0,How did serfdom develop in and then leave Russ...,DESC:manner,serfdom develop leave russia,DESC,manner,1,23,DESC_manner,4
1,What films featured the character Popeye Doyle ?,ENTY:cremat,film featured character popeye doyle,ENTY,cremat,2,8,ENTY_cremat,9
2,How can I find a list of celebrities ' real na...,DESC:manner,find list celebrity real name,DESC,manner,1,23,DESC_manner,4
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,fowl grab spotlight chinese year monkey,ENTY,animal,2,1,ENTY_animal,6
4,What is the full form of .com ?,ABBR:exp,full form com,ABBR,exp,0,16,ABBR_exp,1


In [11]:
print(train_set['category'].value_counts())
print(train_set['specific_type'].value_counts())


category
ENTY    1250
HUM     1223
DESC    1162
NUM      896
LOC      835
ABBR      86
Name: count, dtype: int64
specific_type
ind          962
other        733
def          421
count        363
desc         321
manner       276
date         218
cremat       207
reason       191
gr           189
country      155
city         129
animal       112
food         103
dismed       103
termeq        93
period        75
money         71
exp           70
state         66
sport         62
event         56
product       42
substance     41
color         40
techmeth      38
dist          34
veh           27
perc          27
word          26
title         25
mount         21
body          16
abb           16
lang          16
plant         13
volsize       13
weight        11
symbol        11
instru        10
letter         9
code           9
speed          9
temp           8
ord            6
religion       4
currency       4
Name: count, dtype: int64


## Observations
1. **Categories**
- The ENTY (Entity), HUM (Human), and DESC (Description) categories dominate the dataset.
- ABBR (Abbreviation) is severely underrepresented with only 86 instances, making it a minority class.

2. **Specific Types**
The specific_type distribution is quite imbalanced:
- ind, other, and def are the most common types.
- Some specific types, like currency, religion, ord, and temp, have fewer than 10 instances.

## Challenges
1. **Class Imbalance:**
Both category and specific_type have significant imbalances.
Minority classes may lead to poor model performance for those classes.

2. **Granularity:**
Some specific types, such as techmeth and volsize, are too granular, which may increase the complexity of classification.

In [36]:
print(f"Category classes: {category_encoder.classes_}")
print(f"Specific type classes: {specific_type_encoder.classes_}")
print(f"Combined label classes: {combined_label_encoder.classes_}")

Category classes: ['ABBR' 'DESC' 'ENTY' 'HUM' 'LOC' 'NUM']
Specific type classes: ['abb' 'animal' 'body' 'city' 'code' 'color' 'count' 'country' 'cremat'
 'currency' 'date' 'def' 'desc' 'dismed' 'dist' 'event' 'exp' 'food' 'gr'
 'ind' 'instru' 'lang' 'letter' 'manner' 'money' 'mount' 'ord' 'other'
 'perc' 'period' 'plant' 'product' 'reason' 'religion' 'speed' 'sport'
 'state' 'substance' 'symbol' 'techmeth' 'temp' 'termeq' 'title' 'veh'
 'volsize' 'weight' 'word']
Combined label classes: ['ABBR_abb' 'ABBR_exp' 'DESC_def' 'DESC_desc' 'DESC_manner' 'DESC_reason'
 'ENTY_animal' 'ENTY_body' 'ENTY_color' 'ENTY_cremat' 'ENTY_currency'
 'ENTY_dismed' 'ENTY_event' 'ENTY_food' 'ENTY_instru' 'ENTY_lang'
 'ENTY_letter' 'ENTY_other' 'ENTY_plant' 'ENTY_product' 'ENTY_religion'
 'ENTY_sport' 'ENTY_substance' 'ENTY_symbol' 'ENTY_techmeth' 'ENTY_termeq'
 'ENTY_veh' 'ENTY_word' 'HUM_desc' 'HUM_gr' 'HUM_ind' 'HUM_title'
 'LOC_city' 'LOC_country' 'LOC_mount' 'LOC_other' 'LOC_state' 'NUM_code'
 'NUM_count

In [37]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the Tokenizer and fit it on your training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_set['processed_question'])  

# Convert the text data into sequences
X_train_sequences = tokenizer.texts_to_sequences(train_set['processed_question'])
X_test_sequences = tokenizer.texts_to_sequences(test_set['processed_question'])

# Get the maximum sequence length (for padding purposes)
max_sequence_length = max([len(seq) for seq in X_train_sequences])

# Pad the sequences to make them all the same length
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')


In [38]:
# Extract the labels
y_train = train_set['combined_label_encoded']
y_test = test_set['combined_label_encoded']


In [39]:
print(X_train_padded.shape)  # Should print (num_train_samples, max_sequence_length)
print(X_test_padded.shape)   # Should print (num_test_samples, max_sequence_length)
print(y_train.shape)         # Should print (num_train_samples,)
print(y_test.shape)          # Should print (num_test_samples,)


(5452, 18)
(500, 18)
(5452,)
(500,)


In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import numpy as np
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=X_train_padded.shape[1]))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(np.unique(y_train)), activation='softmax'))  # Multi-class classification




In [41]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [43]:
history = model.fit(
    X_train_padded, 
    y_train, 
    epochs=20, 
    batch_size=32, 
    validation_data=(X_test_padded, y_test)
)


Epoch 1/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.6659 - loss: 1.1199 - val_accuracy: 0.4220 - val_loss: 3.0255
Epoch 2/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.7251 - loss: 0.9628 - val_accuracy: 0.4000 - val_loss: 3.0658
Epoch 3/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.7532 - loss: 0.8297 - val_accuracy: 0.4320 - val_loss: 3.0505
Epoch 4/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8040 - loss: 0.7116 - val_accuracy: 0.4360 - val_loss: 3.1212
Epoch 5/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8367 - loss: 0.6110 - val_accuracy: 0.4260 - val_loss: 3.2029
Epoch 6/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8619 - loss: 0.5025 - val_accuracy: 0.4560 - val_loss: 3.0820
Epoch 7/20
[1m171/171

In [44]:
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {test_accuracy}")


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4549 - loss: 4.0337
Test Accuracy: 0.4880000054836273
