# Question Classification Notebook
@ author: Hatem Trigui


In [4]:
import pandas as pd

# Function to load the data from a .txt file
def load_data(file_path):
    """Read a labelled question file into a DataFrame.

    Every non-blank line is expected to have the form ``<label> <question>``:
    the text before the first space is the label, everything after it is the
    question. Blank or whitespace-only lines are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file. It is opened in text mode with the
        platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, with the columns ``question`` and ``label``
        (in that order). Both columns exist even when no line was parsed.

    Raises
    ------
    ValueError
        If a non-blank line contains no space separating label and question.
    """
    records = []
    with open(file_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (typically a trailing newline) carry no sample.
                continue
            # Split on the first space only; the question itself contains spaces.
            label, separator, question = stripped.partition(' ')
            if not separator:
                raise ValueError(
                    f"{file_path}: line {line_number} has no space between "
                    f"label and question: {stripped!r}"
                )
            records.append({"question": question, "label": label})
    # Naming the columns keeps the schema stable for an empty file.
    return pd.DataFrame(records, columns=["question", "label"])

# Load your training and test set.
# The paths are relative to the notebook's working directory. Forward slashes
# are accepted on Windows as well as Linux/macOS, and they avoid the invalid
# "\d" escape sequence that a backslash-separated literal would contain.
train_set = load_data("data/train_set5.txt")
test_set = load_data("data/test_set.txt")

# Sample loaded data
print(train_set.head())


                                            question        label
0  How did serfdom develop in and then leave Russ...  DESC:manner
1   What films featured the character Popeye Doyle ?  ENTY:cremat
2  How can I find a list of celebrities ' real na...  DESC:manner
3  What fowl grabs the spotlight after the Chines...  ENTY:animal
4                    What is the full form of .com ?     ABBR:exp


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK datasets.
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource, while
# older releases load 'punkt'; requesting both keeps tokenization working on
# either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stemmer and lemmatizer
# NOTE(review): `stemmer` is not referenced by the cells shown here; only the
# lemmatizer is used by preprocess_text.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define stop words (a set, so membership tests are constant-time lookups)
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a question string and return it as space-joined tokens.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is removed, the remainder is tokenized with ``word_tokenize``,
    tokens found in ``stop_words`` are dropped, and the survivors are
    lemmatized with ``lemmatizer`` before being joined with single spaces.
    """
    # Lower-case first so the a-z character class also covers former capitals.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # Tokenize, then filter stop words lazily.
    content_tokens = (
        token for token in word_tokenize(letters_only) if token not in stop_words
    )

    # Lemmatize each remaining token and rebuild a single string.
    return ' '.join(lemmatizer.lemmatize(token) for token in content_tokens)

# Add a 'processed_question' column to both data sets by running every
# question through preprocess_text.
for frame in (train_set, test_set):
    frame['processed_question'] = frame['question'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\la7tim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\la7tim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\la7tim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Break each "<category>:<specific_type>" label into its two parts and store
# them as separate columns on both data sets.
for frame in (train_set, test_set):
    frame[['category', 'specific_type']] = frame['label'].str.split(':', expand=True)

# Display a sample of the data
label_columns = ['label', 'category', 'specific_type']
print(train_set[label_columns].head())


         label category specific_type
0  DESC:manner     DESC        manner
1  ENTY:cremat     ENTY        cremat
2  DESC:manner     DESC        manner
3  ENTY:animal     ENTY        animal
4     ABBR:exp     ABBR           exp


In [9]:
from sklearn.preprocessing import LabelEncoder

# One encoder per label granularity; each is fitted on the training labels
# only and then reused, unchanged, to encode the test labels.
category_encoder = LabelEncoder()
specific_type_encoder = LabelEncoder()

for column, encoder in (
    ('category', category_encoder),
    ('specific_type', specific_type_encoder),
):
    encoded_column = f'{column}_encoded'
    train_set[encoded_column] = encoder.fit_transform(train_set[column])
    test_set[encoded_column] = encoder.transform(test_set[column])

# Check the encoding
encoding_columns = ['category', 'specific_type', 'category_encoded', 'specific_type_encoded']
print(train_set[encoding_columns].head())


  category specific_type  category_encoded  specific_type_encoded
0     DESC        manner                 1                     23
1     ENTY        cremat                 2                      8
2     DESC        manner                 1                     23
3     ENTY        animal                 2                      1
4     ABBR           exp                 0                     16


In [10]:
# Build a single "<category>_<specific_type>" string per row on both sets.
for frame in (train_set, test_set):
    frame['combined_label'] = frame['category'] + "_" + frame['specific_type']

# Encode the combined label: fitted on the training labels, reused for test.
combined_label_encoder = LabelEncoder()
train_set['combined_label_encoded'] = combined_label_encoder.fit_transform(
    train_set['combined_label']
)
test_set['combined_label_encoded'] = combined_label_encoder.transform(
    test_set['combined_label']
)

# Check the encoded combined labels
print(train_set[['combined_label', 'combined_label_encoded']].head())


  combined_label  combined_label_encoded
0    DESC_manner                       4
1    ENTY_cremat                       9
2    DESC_manner                       4
3    ENTY_animal                       6
4       ABBR_exp                       1


In [11]:
# Features (X): the question text column of each data set.
# NOTE(review): this selects the raw 'question' column; the
# 'processed_question' column created earlier is not used here - confirm
# that is intended.
X_train, X_test = train_set['question'], test_set['question']

# Labels (y): the integer-encoded category and specific type.
y_train_category, y_train_type = (
    train_set['category_encoded'],
    train_set['specific_type_encoded'],
)
y_test_category, y_test_type = (
    test_set['category_encoded'],
    test_set['specific_type_encoded'],
)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that lower-cases the text and drops scikit-learn's
# built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

# Learn the vocabulary/IDF weights from the training questions and encode
# them; the test questions are encoded with that same fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Rows = training questions, columns = vocabulary terms.
print(f"X_train_tfidf shape: {X_train_tfidf.shape}")


X_train_tfidf shape: (5452, 8182)


In [13]:
from sklearn.model_selection import train_test_split

# Split the training data into training and validation sets (80% / 20%).
# A single call splits the feature matrix and both label vectors with the same
# shuffled row indices, so the category and type labels are guaranteed to line
# up with X_train_split / X_val_split. This replaces two separate calls that
# each overwrote the feature splits and only agreed because they repeated the
# same random_state.
(
    X_train_split, X_val_split,
    y_train_split_category, y_val_split_category,
    y_train_split_type, y_val_split_type,
) = train_test_split(
    X_train_tfidf, y_train_category, y_train_type,
    test_size=0.2, random_state=42)


In [22]:
# Show the dimensions of the training and validation feature splits, one per
# line.
# NOTE(review): the output recorded for this cell shows (5452,) for the first
# shape, which does not match a split of the 2-D TF-IDF matrix; it looks like
# stale kernel state - re-run from a fresh kernel to confirm.
print(X_train_split.shape, X_val_split.shape, sep='\n')


(5452,)
(1091, 8182)


In [23]:
# Apply the TF-IDF transformation to the full training set before splitting.
# NOTE(review): because the vectorizer is fitted before the split, its
# vocabulary and IDF weights are learned from rows that later become the
# validation set.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Now split the transformed data into training and validation sets (80% / 20%).
# One call splits the features and both label vectors with the same shuffled
# row indices, instead of two calls that each overwrite the feature splits and
# rely on a repeated random_state to stay aligned.
(
    X_train_split, X_val_split,
    y_train_split_category, y_val_split_category,
    y_train_split_type, y_val_split_type,
) = train_test_split(
    X_train_tfidf, y_train_category, y_train_type,
    test_size=0.2, random_state=42)


In [24]:
# Show (number_of_samples, number_of_features) for the training and test
# TF-IDF matrices, one per line.
print(X_train_tfidf.shape, X_test_tfidf.shape, sep='\n')


(5452, 8182)
(500, 8182)


In [25]:
from sklearn.model_selection import train_test_split

# Split the training data into training and validation sets (80% / 20%) for
# both the category and the type prediction.
# Passing the feature matrix and both label vectors to one call applies the
# same shuffled row indices to all three, so the labels are guaranteed to match
# X_train_split / X_val_split without depending on two calls sharing a
# random_state.
(
    X_train_split, X_val_split,
    y_train_split_category, y_val_split_category,
    y_train_split_type, y_val_split_type,
) = train_test_split(
    X_train_tfidf, y_train_category, y_train_type,
    test_size=0.2, random_state=42)


In [26]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier for the category label on the
# training split (max_iter raised to 1000 iterations).
category_model = LogisticRegression(max_iter=1000).fit(
    X_train_split, y_train_split_category
)

# Predict categories for the held-out validation split.
y_val_pred_category = category_model.predict(X_val_split)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
print(
    "Category Prediction - Accuracy:",
    accuracy_score(y_val_split_category, y_val_pred_category),
)
print(
    "Category Prediction - Classification Report:",
    classification_report(y_val_split_category, y_val_pred_category),
    sep='\n',
)


Category Prediction - Accuracy: 0.6599450045829515
Category Prediction - Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.52      0.69        23
           1       0.57      0.67      0.62       227
           2       0.61      0.64      0.62       246
           3       0.65      0.72      0.68       240
           4       0.80      0.69      0.74       166
           5       0.76      0.60      0.67       189

    accuracy                           0.66      1091
   macro avg       0.73      0.64      0.67      1091
weighted avg       0.67      0.66      0.66      1091

