In [None]:
import nltk

nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [None]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features[f'contains({word.lower()})'] = True
    return features

def generate_binary_feature(label):
    return label in ['whQuestion', 'yAnswer','ynQuestion']

featuresets = [(dialogue_act_features(post.text), generate_binary_feature(post.get('class'))) for post in posts]

# 10% of the total data
size = int(len(featuresets) * 0.2)

# first 10% for test_set to check the accuracy, and rest 90% after the first 10% for training
train_set, test_set = featuresets[size:], featuresets[:size]

# get the classifier from the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)

# to check the accuracy
print("Accuracy:", nltk.classify.accuracy(classifier, test_set))


In [None]:
# Now, classify a new sentence
sentences = [
    "Can I ask what you meant at 5:30",
    "Do you know the answer to that",
    "Shouldn't it be the case we have answer for this",
    "Does it make sense that it comes to this",
    "How long does it take to get the code set up",
    "Where can I find the rest of the series",
    "I dont understand why this should work",
    "I am a little confused at the end what the creator was saying",
    
    "Thank you",
    "Thank you for creating this video",
    "By far this is the best video on yt",
    "This video sucks",
    "What a joke",
    "I never liked this",
    "Super helpful",
    "I have been teaching all my life and this video does better than what I have done",
]
for sentence in sentences:
    features = dialogue_act_features(sentence)
    predicted_label = classifier.classify(features)
    print("Sentence: ", sentence)
    print("Predicted label:", predicted_label)

In [3]:
from collections import Counter

def get_word_frequencies(posts):
    """Count word frequencies across all posts."""
    word_freq = Counter()
    for post in posts:
        words = nltk.word_tokenize(post.text.lower())
        word_freq.update(words)
    return word_freq

def dialogue_act_features(post, word_freq, min_freq):
    """Extract features from post, excluding rare words."""
    features = {}
    for word in nltk.word_tokenize(post.text):
        word_lower = word.lower()
        if word_freq[word_lower] >= min_freq:
            features[f'contains({word_lower})'] = True
    return features

def generate_binary_feature(label):
    """Generate binary classification for questions."""
    return label in ['whQuestion', 'yAnswer', 'ynQuestion']

def train_question_classifier(min_freq=1):
    # Calculate word frequencies across all posts
    word_freq = get_word_frequencies(posts)
    
    # Generate feature sets with frequency filtering
    featuresets = [
        (dialogue_act_features(post, word_freq, min_freq),
         generate_binary_feature(post.get('class')))
        for post in posts
    ]
    
    # Split into train and test sets
    size = int(len(featuresets) * 0.2)
    train_set, test_set = featuresets[size:], featuresets[:size]
    
    # Train the classifier
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    # Calculate and return accuracy
    accuracy = nltk.classify.accuracy(classifier, test_set)
    
    return classifier, accuracy, word_freq

# Example usage
min_freq = 4  # Remove words that appear less than 5 times
classifier, accuracy, word_freq = train_question_classifier(min_freq)
print(f"Accuracy with minimum frequency {min_freq}:", accuracy)

# Print vocabulary size before and after filtering
total_vocab = len(word_freq)
filtered_vocab = len([word for word, freq in word_freq.items() if freq >= min_freq])
print(f"Vocabulary size before filtering: {total_vocab}")
print(f"Vocabulary size after filtering (freq >= {min_freq}): {filtered_vocab}")

Accuracy with minimum frequency 4: 0.862
Vocabulary size before filtering: 5644
Vocabulary size after filtering (freq >= 4): 1331


In [4]:
# Now, classify a new sentence

# Need this to keep original feature set up when testing
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features[f'contains({word.lower()})'] = True
    return features

sentences = [
    "Can I ask what you meant at 5:30",
    "Do you know the answer to that",
    "Shouldn't it be the case we have answer for this",
    "Does it make sense that it comes to this",
    "How long does it take to get the code set up",
    "Where can I find the rest of the series",
    "I dont understand why this should work",
    "I am a little confused at the end what the creator was saying",
    
    "Thank you",
    "Thank you for creating this video",
    "By far this is the best video on yt",
    "This video sucks",
    "What a joke",
    "I never liked this",
    "Super helpful",
    "I have been teaching all my life and this video does better than what I have done",
]
for sentence in sentences:
    features = dialogue_act_features(sentence)
    predicted_label = classifier.classify(features)
    print("Sentence: ", sentence)
    print("Predicted label:", predicted_label)

Sentence:  Can I ask what you meant at 5:30
Predicted label: True
Sentence:  Do you know the answer to that
Predicted label: True
Sentence:  Shouldn't it be the case we have answer for this
Predicted label: True
Sentence:  Does it make sense that it comes to this
Predicted label: True
Sentence:  How long does it take to get the code set up
Predicted label: True
Sentence:  Where can I find the rest of the series
Predicted label: True
Sentence:  I dont understand why this should work
Predicted label: True
Sentence:  I am a little confused at the end what the creator was saying
Predicted label: True
Sentence:  Thank you
Predicted label: False
Sentence:  Thank you for creating this video
Predicted label: False
Sentence:  By far this is the best video on yt
Predicted label: False
Sentence:  This video sucks
Predicted label: False
Sentence:  What a joke
Predicted label: True
Sentence:  I never liked this
Predicted label: False
Sentence:  Super helpful
Predicted label: False
Sentence:  I have

In [5]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np

# Download required data
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

class TfidfQuestionClassifier:
    def __init__(self, max_features=1000, min_df=2):
        """
        Initialize the TF-IDF based question classifier
        
        Parameters:
        - max_features: Maximum number of features to use (top words by tf-idf)
        - min_df: Minimum document frequency for a word to be included
        """
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,
                min_df=min_df,
                lowercase=True,
                strip_accents='unicode',
                analyzer='word'
            )),
            ('classifier', MultinomialNB())
        ])
        
        # Prepare training data
        self.train_classifier()
        
    def train_classifier(self):
        """Train the classifier on the NPS chat corpus"""
        # Extract texts and labels
        texts = [post.text for post in posts]
        labels = [self.is_question(post.get('class')) for post in posts]
        
        # Split into train and test sets
        split_idx = int(len(texts) * 0.2)
        train_texts = texts[split_idx:]
        train_labels = labels[split_idx:]
        test_texts = texts[:split_idx]
        test_labels = labels[:split_idx]
        
        # Train the pipeline
        self.pipeline.fit(train_texts, train_labels)
        
        # Calculate accuracy
        self.accuracy = self.pipeline.score(test_texts, test_labels)
        print(f"Classifier trained with accuracy: {self.accuracy:.2f}")
        
        # Store vocabulary for feature inspection
        self.vocabulary = self.pipeline.named_steps['tfidf'].vocabulary_
        
    @staticmethod
    def is_question(label):
        """Convert NPS chat labels to binary question/non-question"""
        return label in ['whQuestion', 'yAnswer', 'ynQuestion']
    
    def classify_sentence(self, sentence):
        """
        Classify a single sentence
        
        Returns a dictionary with classification results and feature importance
        """
        # Get prediction and probability
        is_question = self.pipeline.predict([sentence])[0]
        proba = self.pipeline.predict_proba([sentence])[0]
        
        # Get feature importance
        tfidf_matrix = self.pipeline.named_steps['tfidf'].transform([sentence])
        feature_names = self.pipeline.named_steps['tfidf'].get_feature_names_out()
        
        # Get non-zero features and their values
        nonzero_indices = tfidf_matrix.nonzero()[1]
        tfidf_scores = zip(nonzero_indices, tfidf_matrix.data)
        important_features = [
            (feature_names[idx], score)
            for idx, score in sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
        ]
        
        return {
            'is_question': bool(is_question),
            'probability': float(max(proba)),  # Probability of predicted class
            'important_features': important_features
        }
    
    def get_most_important_features(self, n=10):
        """Return the n most important features based on their IDF scores"""
        idf = self.pipeline.named_steps['tfidf'].idf_
        feature_names = self.pipeline.named_steps['tfidf'].get_feature_names_out()
        
        # Sort features by IDF score
        important_features = sorted(zip(feature_names, idf), key=lambda x: x[1], reverse=True)
        return important_features[:n]

# Example usage
if __name__ == "__main__":
    # Initialize classifier
    classifier = TfidfQuestionClassifier(max_features=1000, min_df=2)
    
    # Print most important features overall
    print("\nMost important features by IDF score:")
    for feature, score in classifier.get_most_important_features(10):
        print(f"{feature}: {score:.3f}")
    
    # Test some example sentences
    test_sentences = [
        "What time is it?",
        "How does this work?",
        "This is a statement.",
        "Can you help me?",
        "The weather is nice today.",
    ]
    
    print("\nClassifying example sentences:")
    for sentence in test_sentences:
        result = classifier.classify_sentence(sentence)
        print(f"\nSentence: {sentence}")
        print(f"Is question: {result['is_question']}")
        print(f"Confidence: {result['probability']:.2f}")
        print("Top features:")
        for feature, score in result['important_features'][:3]:
            print(f"  - {feature}: {score:.3f}")

[nltk_data] Downloading package nps_chat to
[nltk_data]     /Users/seungwonlim/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


Classifier trained with accuracy: 0.90

Most important features by IDF score:
20suser196: 8.889
20suser219: 8.889
oi: 8.889
rolling: 8.889
sissy_76: 8.889
teensuser7: 8.889
40suser0: 8.601
blah: 8.601
die: 8.601
dum: 8.601

Classifying example sentences:

Sentence: What time is it?
Is question: False
Confidence: 0.67
Top features:
  - time: 0.614
  - what: 0.492
  - it: 0.441

Sentence: How does this work?
Is question: True
Confidence: 0.53
Top features:
  - does: 0.545
  - work: 0.542
  - this: 0.473

Sentence: This is a statement.
Is question: False
Confidence: 0.85
Top features:
  - this: 0.799
  - is: 0.602

Sentence: Can you help me?
Is question: False
Confidence: 0.72
Top features:
  - help: 0.698
  - can: 0.490
  - me: 0.392

Sentence: The weather is nice today.
Is question: False
Confidence: 0.83
Top features:
  - weather: 0.587
  - today: 0.500
  - nice: 0.471


In [6]:
# Initialize
classifier = TfidfQuestionClassifier(max_features=1000, min_df=2)

# Classify a single sentence
result = classifier.classify_sentence("Is this a question?")
print(result['is_question'])  # True/False
print(result['probability'])  # Confidence score
print(result['important_features'])  # Most important words

# See globally important features
important_features = classifier.get_most_important_features(n=10)

Classifier trained with accuracy: 0.90
False
0.8059074244288816
[('question', np.float64(0.7049749674369392)), ('this', np.float64(0.5665088511428753)), ('is', np.float64(0.42670600753219534))]


# Checking content of data

In [None]:
import nltk

# Ensure required NLTK data is downloaded
nltk.download('nps_chat')
nltk.download('punkt')

# Load the NPS chat dataset
posts = nltk.corpus.nps_chat.xml_posts()[:100000]

# Filter out the posts that are classified as questions
question_posts = [post for post in posts if post.get('class') in ['whQuestion', 'ynQuestion']]

# Print the total number of question posts
print(f"Total question posts: {len(question_posts)}")

# Optionally, view some of the question posts
for post in question_posts[:100]:
    print(post.text)


In [None]:
posts[5].text

In [None]:
posts[0].get('class')