In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords , wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def load_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            # Split label and question
            label, question = line.strip().split(' ', 1)
            data.append({"question": question, "label": label})
    return pd.DataFrame(data)

train_set = load_data(".\data\\train_set5.txt")
test_set = load_data(".\data\\test_set.txt")

print(train_set.head())

                                            question        label
0  How did serfdom develop in and then leave Russ...  DESC:manner
1   What films featured the character Popeye Doyle ?  ENTY:cremat
2  How can I find a list of celebrities ' real na...  DESC:manner
3  What fowl grabs the spotlight after the Chines...  ENTY:animal
4                    What is the full form of .com ?     ABBR:exp


In [8]:
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

train_set['processed_question'] = train_set['question'].apply(preprocess_text)
test_set['processed_question'] = test_set['question'].apply(preprocess_text)

In [9]:
train_set['processed_question'].head()

0               serfdom develop leave russia
1       film featured character popeye doyle
2              find list celebrity real name
3    fowl grab spotlight chinese year monkey
4                              full form com
Name: processed_question, dtype: object

In [10]:
train_set[['category', 'specific_type']] = train_set['label'].str.split(':', expand=True)
test_set[['category', 'specific_type']] = test_set['label'].str.split(':', expand=True)

print(train_set[['label', 'category', 'specific_type']].head())

         label category specific_type
0  DESC:manner     DESC        manner
1  ENTY:cremat     ENTY        cremat
2  DESC:manner     DESC        manner
3  ENTY:animal     ENTY        animal
4     ABBR:exp     ABBR           exp


In [11]:
category_encoder = LabelEncoder()
specific_type_encoder = LabelEncoder()

train_set['category_encoded'] = category_encoder.fit_transform(train_set['category'])
train_set['specific_type_encoded'] = specific_type_encoder.fit_transform(train_set['specific_type'])

test_set['category_encoded'] = category_encoder.transform(test_set['category'])
test_set['specific_type_encoded'] = specific_type_encoder.transform(test_set['specific_type'])

print(train_set[['category', 'specific_type', 'category_encoded', 'specific_type_encoded']].head())

  category specific_type  category_encoded  specific_type_encoded
0     DESC        manner                 1                     23
1     ENTY        cremat                 2                      8
2     DESC        manner                 1                     23
3     ENTY        animal                 2                      1
4     ABBR           exp                 0                     16


In [12]:
train_set['combined_label'] = train_set['category'] + "_" + train_set['specific_type']
test_set['combined_label'] = test_set['category'] + "_" + test_set['specific_type']

combined_label_encoder = LabelEncoder()
train_set['combined_label_encoded'] = combined_label_encoder.fit_transform(train_set['combined_label'])
test_set['combined_label_encoded'] = combined_label_encoder.transform(test_set['combined_label'])

print(train_set[['combined_label', 'combined_label_encoded']].head())

  combined_label  combined_label_encoded
0    DESC_manner                       4
1    ENTY_cremat                       9
2    DESC_manner                       4
3    ENTY_animal                       6
4       ABBR_exp                       1


In [13]:
train_set.head()

Unnamed: 0,question,label,processed_question,category,specific_type,category_encoded,specific_type_encoded,combined_label,combined_label_encoded
0,How did serfdom develop in and then leave Russ...,DESC:manner,serfdom develop leave russia,DESC,manner,1,23,DESC_manner,4
1,What films featured the character Popeye Doyle ?,ENTY:cremat,film featured character popeye doyle,ENTY,cremat,2,8,ENTY_cremat,9
2,How can I find a list of celebrities ' real na...,DESC:manner,find list celebrity real name,DESC,manner,1,23,DESC_manner,4
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,fowl grab spotlight chinese year monkey,ENTY,animal,2,1,ENTY_animal,6
4,What is the full form of .com ?,ABBR:exp,full form com,ABBR,exp,0,16,ABBR_exp,1


In [14]:
print(train_set['category'].value_counts())
print(train_set['specific_type'].value_counts())

category
ENTY    1250
HUM     1223
DESC    1162
NUM      896
LOC      835
ABBR      86
Name: count, dtype: int64
specific_type
ind          962
other        733
def          421
count        363
desc         321
manner       276
date         218
cremat       207
reason       191
gr           189
country      155
city         129
animal       112
food         103
dismed       103
termeq        93
period        75
money         71
exp           70
state         66
sport         62
event         56
product       42
substance     41
color         40
techmeth      38
dist          34
veh           27
perc          27
word          26
title         25
mount         21
body          16
abb           16
lang          16
plant         13
volsize       13
weight        11
symbol        11
instru        10
letter         9
code           9
speed          9
temp           8
ord            6
religion       4
currency       4
Name: count, dtype: int64


In [25]:
# 2. Feature Extraction
# Bag-of-Words representation
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(train_set.question)
X_test_bow = vectorizer.transform(test_set.question)

# Test Bag-of-Words
print("BOW Train Feature Shape:", X_train_bow.shape)
print("BOW Test Feature Shape:", X_test_bow.shape)

# WordNet Synsets as Features
def wordnet_features(questions):
    features = []
    for question in questions:
        tokens = nltk.word_tokenize(question)
        synset_features = []
        for token in tokens:
            synsets = wordnet.synsets(token)
            # Use the first lemma name of the first synset if available, else use the token
            synset_features.append(synsets[0].lemma_names()[0] if synsets else token)
        features.append(" ".join(synset_features))
    return features

# Test WordNet Synsets
X_train_synsets = vectorizer.fit_transform(wordnet_features(train_set.question))
X_test_synsets = vectorizer.transform(wordnet_features(test_set.question))
print("WordNet Synsets Train Feature Shape:", X_train_synsets.shape)
print("WordNet Synsets Test Feature Shape:", X_test_synsets.shape)

BOW Train Feature Shape: (5452, 8411)
BOW Test Feature Shape: (500, 8411)
WordNet Synsets Train Feature Shape: (5452, 7074)
WordNet Synsets Test Feature Shape: (500, 7074)


In [26]:
# Combine Features (Bag-of-Words + Synsets)
X_train_combined = np.hstack([X_train_bow.toarray(), X_train_synsets.toarray()])
X_test_combined = np.hstack([X_test_bow.toarray(), X_test_synsets.toarray()])

# Test combined features
print("Combined Train Feature Shape:", X_train_combined.shape)
print("Combined Test Feature Shape:", X_test_combined.shape)


Combined Train Feature Shape: (5452, 15485)
Combined Test Feature Shape: (500, 15485)


In [19]:
# Modeling with SVM
y_train = train_set['combined_label_encoded']
y_test = test_set['combined_label_encoded']
# Train the model using the combined labels
svm_model = SVC(kernel='linear',random_state=42)
svm_model.fit(X_train_combined,y_train)


In [27]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score,classification_report
# Predictions
y_pred = svm_model.predict(X_test_combined)
# Generate the classification report
print("Test Set Classification Report:")
print(classification_report(y_test, y_pred))


Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.86      0.75      0.80         8
           2       0.77      1.00      0.87       123
           3       0.56      0.71      0.63         7
           4       0.67      1.00      0.80         2
           5       1.00      1.00      1.00         6
           6       1.00      0.69      0.81        16
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00        10
           9       0.00      0.00      0.00         0
          10       1.00      0.50      0.67         6
          11       1.00      0.50      0.67         2
          12       0.00      0.00      0.00         2
          13       1.00      0.25      0.40         4
          14       1.00      1.00      1.00         1
          15       1.00      1.00      1.00         2
          17       0.33      0.42      0.37      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
print(len(y_pred))
print(len(y_test))

5452
500


In [31]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Training Word2Vec model
tokenized_text = train_set['processed_question'].tolist()
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Function to get the average Word2Vec embedding for a question
def get_word2vec_embedding(sentence, model):
    embeddings = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

# Apply the function to both train and test sets
train_set['word2vec'] = train_set['processed_question'].apply(lambda x: get_word2vec_embedding(x, word2vec_model))
test_set['word2vec'] = test_set['processed_question'].apply(lambda x: get_word2vec_embedding(x, word2vec_model))

# Prepare the features and target variable
X_train = np.vstack(train_set['word2vec'])
X_test = np.vstack(test_set['word2vec'])
y_train = train_set['combined_label_encoded']
y_test = test_set['combined_label_encoded']

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5452, 100)
(500, 100)
(5452,)
(500,)


In [32]:
# Train the model using the combined labels
svm_model = SVC(kernel='linear',random_state=42)
svm_model.fit(X_train,y_train)


In [33]:
# Predictions
y_pred = svm_model.predict(X_test)
# Generate the classification report
print("Test Set Classification Report:")
print(classification_report(y_test, y_pred))

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         8
           2       0.77      0.66      0.71       123
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         6
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00        10
          10       0.00      0.00      0.00         6
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         4
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          17       0.00      0.00      0.00        12
          18       0.00      0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
from gensim.models import KeyedVectors
# Load pretrained Word2Vec model
print("Loading pretrained Word2Vec model...")
w2v_model = KeyedVectors.load_word2vec_format(r"C:\Users\MSI\Downloads\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin", binary=True)
print("Word2Vec model loaded!")


Loading pretrained Word2Vec model...
Word2Vec model loaded!


In [37]:
# Function to compute Word2Vec embeddings for a text
def compute_w2v_embedding(words, model, vector_size=300):
    embeddings = [model[word] for word in words if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)  # Average embedding
    else:
        return np.zeros(vector_size)  # Return zero vector if no words match

In [38]:
# Generate embeddings for train and test sets
print("Generating Word2Vec embeddings...")
train_embeddings = np.array([compute_w2v_embedding(words, w2v_model) for words in train_set['processed_question']])
test_embeddings = np.array([compute_w2v_embedding(words, w2v_model) for words in test_set['processed_question']])
print("Embeddings generated!")

Generating Word2Vec embeddings...
Embeddings generated!


In [39]:
# Train SVM classifier
print("Training SVM model...")
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(train_embeddings, train_set['combined_label_encoded'])

Training SVM model...


In [40]:
# Predict on the test set
print("Making predictions...")
test_predictions = svm_model.predict(test_embeddings)

# Decode predictions
test_set['predicted_label'] = combined_label_encoder.inverse_transform(test_predictions)

# Evaluate the model
accuracy = np.mean(test_predictions == test_set['combined_label_encoded'])
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Display predictions
print(test_set[['question', 'label', 'predicted_label']].head())

Making predictions...
Test Accuracy: 15.00%
                                   question     label predicted_label
0      How far is it from Denver to Aspen ?  NUM:dist         HUM_ind
1  What county is Modesto , California in ?  LOC:city         HUM_ind
2                         Who was Galileo ?  HUM:desc         HUM_ind
3                         What is an atom ?  DESC:def        DESC_def
4          When did Hawaii become a state ?  NUM:date         HUM_ind
