In [1]:
import nltk
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.sparse import hstack
import numpy as np

In [2]:
# Preprocess the dataset
def dataPreprocessing(path):
    questions = []
    labels = []
    with open(path,'r',encoding='Latin_1') as data:
        for line in data:
            label, question = line.split(" ", 1)
            labels.append(label.split(":")[0])
            questions.append(question)
    return questions, labels

trainingSet, trainingLabels = dataPreprocessing('train_5500')
testingSet, testingLabels = dataPreprocessing('TREC_10')

In [4]:
# Combine training and test questions to ensure consistent TF-IDF features
fullDataset = trainingSet + testingSet

# Feature extraction: Length of the question, TF-IDF for word unigrams, and POS tag unigrams
unigramRanges = [(1, 500), (2, 300), (3, 200)]

def featureExtraction(questions, ngram_range, use_pos=False):
    # Length of the features
    featureLength = np.array([len(q) for q in questions])
    
    tfidfVectorizer = TfidfVectorizer(
        ngram_range=ngram_range,
        max_features=ngram_range[1],
        stop_words='english' if ngram_range[0] == 1 else None
    )
    tfidf_features = tfidfVectorizer.fit_transform(questions)

    if use_pos:
        pos_tags = [nltk.pos_tag(word_tokenize(question)) for question in questions]
        pos_tag_features = [" ".join(tag[1] for tag in tags) for tags in pos_tags]
        tfidf_pos_vectorizer = TfidfVectorizer(
            ngram_range=(1, 1),
            max_features=500
        )
        pos_features = tfidf_pos_vectorizer.fit_transform(pos_tag_features)
        
        # Combine all features
        return np.hstack((featureLength.reshape(-1, 1), tfidf_features.toarray(), pos_features.toarray()))
    else:
        return np.hstack((featureLength.reshape(-1, 1), tfidf_features.toarray()))

# Create feature vectors for training and test data
all_features = featureExtraction(fullDataset, (1, 500), use_pos=True)
train_features = all_features[:len(trainingSet)]
test_features = all_features[len(trainingSet):]

In [5]:
# Train a Decision Tree classifier
classifier = DecisionTreeClassifier()
classifier.fit(train_features, trainingLabels)

# Predict the labels for the test set
test_predictions = classifier.predict(test_features)

# Evaluate the classifier
accuracy = accuracy_score(testingLabels, test_predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 73.00%


In [6]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, f1_score
from sklearn.tree import DecisionTreeClassifier


# Create a Decision Tree classifier using the Gini index criterion
gini_classifier = DecisionTreeClassifier(criterion='gini')
# Train the classifier on the training data
gini_classifier.fit(train_features, trainingLabels)

# Create a Decision Tree classifier using the misclassification error criterion
error_classifier = DecisionTreeClassifier(criterion='entropy', splitter='random')
# Train the classifier on the training data
error_classifier.fit(train_features, trainingLabels)

# Create a Decision Tree classifier using the cross-entropy criterion
entropy_classifier = DecisionTreeClassifier(criterion='entropy')
# Train the classifier on the training data
entropy_classifier.fit(train_features, trainingLabels)

# # Perform 10-fold cross-validation and generate predictions
# gini_predictions = cross_val_predict(gini_classifier, train_features, trainingLabels, cv=10)
# error_predictions = cross_val_predict(error_classifier, train_features, trainingLabels, cv=10)
# entropy_predictions = cross_val_predict(entropy_classifier, train_features, trainingLabels, cv=10)
gini_predictions = gini_classifier.predict(test_features)
error_predictions = error_classifier.predict(test_features)
entropy_predictions = entropy_classifier.predict(test_features)

# Evaluate the models using classification reports
gini_report = classification_report(testingLabels, gini_predictions, target_names=np.unique(testingLabels))
error_report = classification_report(testingLabels, error_predictions, target_names=np.unique(testingLabels))
entropy_report = classification_report(testingLabels, entropy_predictions, target_names=np.unique(testingLabels))

# Calculate F1-scores
gini_f1 = f1_score(testingLabels, gini_predictions, average='weighted')
error_f1 = f1_score(testingLabels, error_predictions, average='weighted')
entropy_f1 = f1_score(testingLabels, entropy_predictions, average='weighted')

# Print the classification reports and F1-scores
print("Gini Index Criterion:")
print(gini_report)
print(f"F1-Score: {gini_f1:.2f}")

print("Misclassification Error Criterion:")
print(error_report)
print(f"F1-Score: {error_f1:.2f}")

print("Cross-Entropy Criterion:")
print(entropy_report)
print(f"F1-Score: {entropy_f1:.2f}")


Gini Index Criterion:
              precision    recall  f1-score   support

        ABBR       1.00      0.78      0.88         9
        DESC       0.75      0.94      0.84       138
        ENTY       0.58      0.59      0.58        94
         HUM       0.65      0.71      0.68        65
         LOC       0.72      0.69      0.70        81
         NUM       0.89      0.60      0.72       113

    accuracy                           0.72       500
   macro avg       0.77      0.72      0.73       500
weighted avg       0.74      0.72      0.72       500

F1-Score: 0.72
Misclassification Error Criterion:
              precision    recall  f1-score   support

        ABBR       0.88      0.78      0.82         9
        DESC       0.73      0.91      0.81       138
        ENTY       0.58      0.56      0.57        94
         HUM       0.64      0.72      0.68        65
         LOC       0.65      0.68      0.67        81
         NUM       0.87      0.55      0.67       113

    a

In [29]:
# Feature Ablation Study
features = ["Length", "TF-IDF Word Unigrams", "POS Tag Unigrams"]
original_f1 = gini_f1  # F1-score with all features

for i, feature_name in enumerate(features):
    # Create a copy of all features with the current feature removed
    ablated_features = np.delete(train_features, i, axis=1)
    
    # Perform 10-fold cross-validation using the ablated features
    ablated_predictions = cross_val_predict(gini_classifier, ablated_features, trainingLabels, cv=10)
    
    # Calculate F1-score for the ablated feature set
    ablated_f1 = f1_score(trainingLabels, ablated_predictions, average='weighted')
    
    # Calculate the change in F1-score
    f1_change = original_f1 - ablated_f1
    
    print(f"Feature Ablation Study - Feature: {feature_name}")
    print(f"F1-Score (without {feature_name}): {ablated_f1:.2f}")
    print(f"F1-Score Change: {f1_change:.2f}")

Feature Ablation Study - Feature: Length
F1-Score (without Length): 0.60
F1-Score Change: 0.01
Feature Ablation Study - Feature: TF-IDF Word Unigrams
F1-Score (without TF-IDF Word Unigrams): 0.61
F1-Score Change: 0.00
Feature Ablation Study - Feature: POS Tag Unigrams
F1-Score (without POS Tag Unigrams): 0.60
F1-Score Change: 0.01


In [42]:
# Error Propagation Analysis
def error_propagation_analysis(original_predictions, corrected_predictions):
    # Calculate the number of samples misclassified by the original model but correctly classified by the corrected model
    misclassified_original_corrected = sum((original_predictions != trainingLabels) & (corrected_predictions == trainingLabels))
    total_misclassified_original = sum(original_predictions != trainingLabels)
    if total_misclassified_original == 0:
        return 0.0  # Avoid division by zero
    return (misclassified_original_corrected / total_misclassified_original) * 100

# Analyze error propagation between models
gini_error_propagation = error_propagation_analysis(gini_predictions, error_predictions)
error_gini_propagation = error_propagation_analysis(error_predictions, gini_predictions)
entropy_error_propagation = error_propagation_analysis(entropy_predictions, error_predictions)
error_entropy_propagation = error_propagation_analysis(error_predictions, entropy_predictions)

print("Error Propagation Analysis:")
print(f"Gini to Error Propagation: {gini_error_propagation:.2f}%")
print(f"Error to Gini Propagation: {error_gini_propagation:.2f}%")
print(f"Entropy to Error Propagation: {entropy_error_propagation:.2f}%")
print(f"Error to Entropy Propagation: {error_entropy_propagation:.2f}%")

Error Propagation Analysis:
Gini to Error Propagation: 30.63%
Error to Gini Propagation: 31.23%
Entropy to Error Propagation: 33.53%
Error to Entropy Propagation: 28.98%


In [28]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Define the number of folds for cross-validation
num_folds = 10

# Create an instance of the DecisionTreeClassifier (or any other classifier you prefer)
classifier = DecisionTreeClassifier()

# Initialize lists to store evaluation metrics
precision_scores = []
recall_scores = []
f1_scores = []

# Create StratifiedKFold object to ensure class balance in each fold
stratified_kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform 10-fold cross-validation
for train_index, val_index in stratified_kfold.split(train_features, trainingLabels):
    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = train_features[train_index], train_features[val_index]
    y_train_fold, y_val_fold = np.array(trainingLabels)[train_index], np.array(trainingLabels)[val_index]

    # Train the classifier on the training data for this fold
    classifier.fit(X_train_fold, y_train_fold)

    # Make predictions on the validation data for this fold
    y_pred = classifier.predict(X_val_fold)

    # Calculate precision, recall, and F-score for this fold
    precision = precision_score(y_val_fold, y_pred, average='weighted')
    recall = recall_score(y_val_fold, y_pred, average='weighted')
    f1 = f1_score(y_val_fold, y_pred, average='weighted')

    # Append the scores to the respective lists
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate the average scores over all folds
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)

# Print the average scores
print("Average Precision: {:.2f}".format(avg_precision))
print("Average Recall: {:.2f}" .format(avg_recall))
print("Average F1-score: {:.2f}" .format(avg_f1))


Average Precision: 0.61
Average Recall: 0.60
Average F1-score: 0.60


In [35]:
for p, r, f1 in zip(precision_scores, recall_scores, f1_scores):
    print(f"Precision: {p:.2f} Recall: {r:.2f} F1_score: {f1:.2f}")


Precision: 0.61 Recall: 0.61 F1_score: 0.61
Precision: 0.63 Recall: 0.62 F1_score: 0.62
Precision: 0.59 Recall: 0.59 F1_score: 0.59
Precision: 0.58 Recall: 0.58 F1_score: 0.58
Precision: 0.61 Recall: 0.60 F1_score: 0.60
Precision: 0.61 Recall: 0.60 F1_score: 0.60
Precision: 0.57 Recall: 0.57 F1_score: 0.57
Precision: 0.62 Recall: 0.61 F1_score: 0.62
Precision: 0.62 Recall: 0.62 F1_score: 0.62
Precision: 0.64 Recall: 0.63 F1_score: 0.63
