In [None]:
import nltk
# Fetch the tokenizer data only when it is missing locally, so re-running this
# cell does not need network access. 'punkt_tab' is requested as well because
# recent NLTK releases appear to load it instead of 'punkt' for word_tokenize
# (NOTE(review): confirm against the installed NLTK version).
for _tokenizer_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find('tokenizers/{}'.format(_tokenizer_resource))
    except LookupError:
        nltk.download(_tokenizer_resource)

In [None]:
import spacy
import re

# Cache of loaded spaCy pipelines, keyed by model name, so the model is read
# once per kernel session instead of once per is_question() call.
_SPACY_MODELS = {}


def _get_spacy_model(name='en_core_web_sm'):
    """Return the spaCy pipeline called `name`, loading it only on first request."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def is_question(text):
    """
    Determines if the input text contains a question.

    The check is heuristic: a trailing question mark wins immediately;
    otherwise each spaCy sentence is tested against a handful of lexical and
    syntactic patterns (leading question word, leading root verb/auxiliary,
    "can you"/"could you", and expressions of not understanding).

    Args:
        text (str): Input text to analyze

    Returns:
        bool: True if text contains a question, False otherwise
    """
    # Preprocessing
    text = text.strip()

    # Nothing to analyse: no sentence can match, so skip loading the model.
    if not text:
        return False

    # Early return if text ends with question mark (no parsing required)
    if text.endswith('?'):
        return True

    # Process text with spaCy; the pipeline is loaded lazily and reused.
    doc = _get_spacy_model()(text)

    # Words that open a wh-question
    question_words = {'what', 'when', 'where', 'which', 'who', 'whom', 'whose', 'why', 'how'}

    # Verbs that, when negated, usually signal a request for clarification
    question_verbs = {'understand', 'get', 'know', 'tell', 'explain'}

    # Fixed phrases that express confusion
    confusion_phrases = ("don't understand", "don't get", "not sure")

    for sent in doc.sents:
        sent_lower = sent.text.lower()
        first_token = sent[0]

        # Pattern 1: Starts with question word
        if first_token.text.lower() in question_words:
            return True

        # Pattern 2: Starts with a verb/auxiliary that is the sentence root.
        # NOTE(review): because of the ROOT requirement, a leading auxiliary
        # that is not the parse root does not match here - confirm this is
        # the intended behaviour.
        if first_token.pos_ in ('AUX', 'VERB') and first_token.dep_ == 'ROOT':
            return True

        # Pattern 3: Contains "can you" or "could you"
        if re.search(r'\b(can|could)\s+you\b', sent_lower):
            return True

        # Pattern 4: Negative statements indicating confusion/questions
        if any(phrase in sent_lower for phrase in confusion_phrases):
            return True

        # Pattern 5: A question verb that has a negation among its children
        for token in sent:
            if token.text.lower() in question_verbs and any(
                child.dep_ == 'neg' for child in token.children
            ):
                return True

    return False


# interrogative sentence: "Can you..."



# Test cases: each sentence is paired with the result is_question() is
# expected to give, so that disagreements show up in the output instead of
# living only in comments.
labelled_sentences = [
    ('''Thank you so much man, this is awesome content''', False),
    ('''Can you make another video like this.''', True),
    ('''Can I ask you this, make another video like this!''', True),
    ('''What a beautiful video.''', False),
    ('''What were you saying at 5:13 mark''', True),
    ('''Where can I find more material like this''', True),
    ('''I don't get when you said this''', True),
    ('''What were you saying at 5:13 mark''', True),  # repeated case from above
    ('''I don't understand this part that says''', True),
    ('''I really don't get why people are saying bad things about this video''', True),
]

# Plain list of the sentences, kept under its original name for any cell that
# still refers to it.
test_sentences = [sentence for sentence, _ in labelled_sentences]

# Run tests: report the prediction next to the expectation without raising,
# so one failing heuristic does not hide the remaining results.
for i, (sentence, expected) in enumerate(labelled_sentences, 1):
    result = is_question(sentence)
    status = "ok" if result == expected else "MISMATCH"
    print(f"Sentence {i}: {result} (expected {expected}) [{status}]")

In [7]:
import re
from typing import Dict, List, Optional
import nltk
from nltk.tokenize import word_tokenize
# Make sure the tokenizer data used by word_tokenize is available, downloading
# only what is missing. 'punkt_tab' is included because recent NLTK releases
# appear to load it instead of 'punkt' (NOTE(review): confirm against the
# installed NLTK version).
for _tokenizer_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find('tokenizers/{}'.format(_tokenizer_resource))
    except LookupError:
        nltk.download(_tokenizer_resource)

class QuestionClassifier:
    """Rule-based detector for several kinds of English questions.

    Every ``is_*`` method is an independent keyword/pattern heuristic that
    works either on the raw sentence or on a list of word tokens.
    ``classify`` runs all of them and returns the results as a dict.
    Token-based checks compare case-insensitively and return False for an
    empty token list.
    """

    def __init__(self):
        # Common patterns and keywords for different question types
        self.wh_words = {'what', 'where', 'when', 'who', 'whom', 'whose', 'which', 'why', 'how'}
        self.auxiliaries = {'am', 'is', 'are', 'was', 'were', 'do', 'does', 'did', 'have', 'has', 'had', 
                          'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must'}
        self.tag_patterns = [r".*,\s*(?:isn't|ain't|aren't|wasn't|weren't|don't|doesn't|didn't|haven't|hasn't|hadn't)\s*(?:it|he|she|you|they|we|i)\??$"]
        self.permission_words = {'could', 'would', 'may', 'might', 'can'}
        self.leading_phrases = {'don\'t you think', 'wouldn\'t you say', 'surely'}
        self.confirmation_phrases = {'is that right', 'do you mean', 'so what you\'re saying'}
        self.rhetorical_patterns = {'how could anyone', 'who wouldn\'t', 'isn\'t it obvious'}
        self.embedded_phrases = {'wonder', 'tell me', 'know', 'explain'}
        # Negated auxiliaries that can open a negative question.
        self.negative_starts = {"isn't", "aren't", "wasn't", "weren't", "don't", "doesn't", "didn't",
                                "haven't", "hasn't", "hadn't", "won't", "wouldn't", "shouldn't", "can't",
                                "couldn't"}

    def preprocess(self, sentence: str) -> str:
        """Preprocess the input sentence (lower-case and trim whitespace)."""
        return sentence.lower().strip()

    def has_question_mark(self, sentence: str) -> bool:
        """Check if sentence ends with question mark (ignoring outer whitespace)."""
        return sentence.strip().endswith('?')

    def starts_with_auxiliary(self, tokens: List[str]) -> bool:
        """Check if sentence starts with auxiliary verb.

        The first token is lower-cased before the lookup so callers may pass
        tokens of the original, un-normalised sentence.
        """
        return bool(tokens) and tokens[0].lower() in self.auxiliaries

    def is_wh_question(self, tokens: List[str]) -> bool:
        """Check if sentence is a wh-question (first token is a wh-word)."""
        return bool(tokens) and tokens[0].lower() in self.wh_words

    def is_tag_question(self, sentence: str) -> bool:
        """Check if sentence is a tag question (", aren't you?" style ending)."""
        # Strip first: the patterns are anchored with `$`, so trailing
        # whitespace would otherwise defeat the match.
        text = sentence.lower().strip()
        return any(re.match(pattern, text) for pattern in self.tag_patterns)

    def is_alternative_question(self, sentence: str) -> bool:
        """Check if sentence is an alternative question (" or " plus '?')."""
        return ' or ' in sentence.lower() and self.has_question_mark(sentence)

    def is_echo_question(self, tokens: List[str]) -> bool:
        """Check if sentence is an echo question (wh-word right before the '?')."""
        return (len(tokens) >= 2 and 
                tokens[-2].lower() in self.wh_words and 
                self.has_question_mark(' '.join(tokens)))

    def is_declarative_question(self, sentence: str, tokens: List[str]) -> bool:
        """Check if sentence is a declarative question.

        That is: it ends with '?' but opens with neither an auxiliary nor a
        wh-word.
        """
        return (not self.starts_with_auxiliary(tokens) and 
                not self.is_wh_question(tokens) and 
                self.has_question_mark(sentence))

    def is_indirect_question(self, tokens: List[str]) -> bool:
        """Check if sentence contains an indirect question.

        Single-word cues are matched against individual tokens; multi-word
        cues such as 'tell me' are matched against the space-joined tokens,
        since they can never equal a single token.
        """
        lowered = [token.lower() for token in tokens]
        padded = ' ' + ' '.join(lowered) + ' '
        for phrase in self.embedded_phrases:
            if ' ' in phrase:
                # Pad with spaces so only whole-word sequences match.
                if ' ' + phrase + ' ' in padded:
                    return True
            elif phrase in lowered:
                return True
        return False

    def is_permission_request(self, tokens: List[str]) -> bool:
        """Check if sentence is a permission/request question.

        True when the first token is a permission word and 'you' is among the
        first three tokens.
        """
        if not tokens:
            return False
        opening = [token.lower() for token in tokens[:3]]
        return opening[0] in self.permission_words and 'you' in opening

    def is_leading_question(self, sentence: str) -> bool:
        """Check if sentence is a leading question."""
        return any(phrase in sentence.lower() for phrase in self.leading_phrases)

    def is_confirmation_question(self, sentence: str) -> bool:
        """Check if sentence is a confirmation question."""
        return any(phrase in sentence.lower() for phrase in self.confirmation_phrases)

    def is_rhetorical_question(self, sentence: str) -> bool:
        """Check if sentence appears to be a rhetorical question."""
        return any(pattern in sentence.lower() for pattern in self.rhetorical_patterns)

    def is_multiple_question(self, sentence: str) -> bool:
        """Check if sentence contains multiple questions.

        Requires the word 'and', at least one wh-word and a final '?'.
        Matching is done on whole words so that e.g. 'understand' does not
        count as 'and' and 'show' does not count as 'how'.
        """
        words = set(re.findall(r"[a-z']+", sentence.lower()))
        return ('and' in words and 
                bool(words & self.wh_words) and 
                self.has_question_mark(sentence))

    def is_negative_question(self, tokens: List[str]) -> bool:
        """Check if sentence is a negative question.

        Accepts both an unsplit contraction as first token ("aren't") and the
        split form produced by NLTK's word_tokenize ("are" followed by "n't").
        """
        if not tokens:
            return False
        first = tokens[0].lower()
        if first in self.negative_starts:
            return True
        # Re-join a split contraction: "ca" + "n't" -> "can't", etc.
        if len(tokens) > 1 and tokens[1].lower() == "n't":
            return first + "n't" in self.negative_starts
        return False

    def is_choice_question(self, sentence: str) -> bool:
        """Check if sentence is a choice question explicitly listing options."""
        lowered = sentence.lower()
        return ((',' in lowered or ' or ' in lowered) and 
                self.starts_with_auxiliary(word_tokenize(lowered)) and 
                self.has_question_mark(sentence))

    def is_yes_no_question(self, tokens: List[str]) -> bool:
        """Check if sentence is a yes/no question.

        That is: it opens with an auxiliary, ends with '?' and contains no
        wh-word.
        """
        return (self.starts_with_auxiliary(tokens) and 
                self.has_question_mark(' '.join(tokens)) and 
                not any(word.lower() in self.wh_words for word in tokens))

    def classify(self, sentence: str) -> Dict[str, bool]:
        """
        Classify a sentence into different question types.
        Returns a dictionary with boolean values for each type, or an empty
        dictionary when the sentence yields no tokens.
        """
        preprocessed = self.preprocess(sentence)
        tokens = word_tokenize(preprocessed)
        
        if not tokens:
            return {}

        classifications = {
            'interrogative': self.has_question_mark(sentence),
            'yes_no_question': self.is_yes_no_question(tokens),
            'wh_question': self.is_wh_question(tokens),
            'tag_question': self.is_tag_question(sentence),
            'alternative_question': self.is_alternative_question(sentence),
            'echo_question': self.is_echo_question(tokens),
            'declarative_question': self.is_declarative_question(sentence, tokens),
            'indirect_question': self.is_indirect_question(tokens),
            'permission_request': self.is_permission_request(tokens),
            'leading_question': self.is_leading_question(sentence),
            'confirmation_question': self.is_confirmation_question(sentence),
            'rhetorical_question': self.is_rhetorical_question(sentence),
            'multiple_question': self.is_multiple_question(sentence),
            'negative_question': self.is_negative_question(tokens),
            'choice_question': self.is_choice_question(sentence)
        }
        
        return classifications

def classify_question(sentence: str) -> str:
    """
    Reduce the per-type flags from QuestionClassifier.classify to one label.

    The candidate types are scanned in a fixed priority order and the first
    one whose flag is truthy is returned; 'unknown' is returned when none is.
    """
    flags = QuestionClassifier().classify(sentence)

    # Earlier entries win over later ones.
    ranked_types = (
        'wh_question',
        'yes_no_question',
        'tag_question',
        'alternative_question',
        'echo_question',
        'declarative_question',
        'indirect_question',
        'permission_request',
        'leading_question',
        'confirmation_question',
        'rhetorical_question',
        'multiple_question',
        'negative_question',
        'choice_question',
        'interrogative',
    )

    matches = (name for name in ranked_types if flags.get(name, False))
    return next(matches, 'unknown')

# Example usage
def test_classifier():
    """Print the label classify_question assigns to a fixed set of samples.

    The trailing comments name the question type each sample is meant to
    illustrate; nothing is asserted.
    """
    examples = (
        "What is your name?",                    # wh-question
        "Are you coming?",                       # yes/no question
        "You're coming, aren't you?",            # tag question
        "Would you like tea or coffee?",         # alternative question
        "He went where?",                        # echo question
        "You're leaving?",                       # declarative question
        "Do you know where he went?",            # indirect question
        "Could you pass the salt?",              # permission/request
        "Don't you think we should leave?",      # leading question
        "Is that right?",                        # confirmation question
        "How could anyone do such a thing?",     # rhetorical question
        "When and where did this happen?",       # multiple question
        "Aren't you coming?",                    # negative question
        "Should I turn left or right?",          # choice question
    )

    for text in examples:
        label = classify_question(text)
        print(f"Sentence: {text}")
        print(f"Classification: {label}\n")


In [None]:
import nltk
# Download the NPS Chat corpus only if it is not installed yet, so this cell
# also works on a fresh environment without re-downloading on every run.
try:
    nltk.data.find('corpora/nps_chat')
except LookupError:
    nltk.download('nps_chat')

# Work with the first 10,000 posts of the corpus.
posts = nltk.corpus.nps_chat.xml_posts()[:10000]


def dialogue_act_features(post):
    """Build a bag-of-words feature dict for one piece of text.

    Args:
        post (str): Text to tokenize with nltk.word_tokenize.

    Returns:
        dict: Maps 'contains(<lower-cased token>)' to True for every token
        found in the text.
    """
    return {
        'contains({})'.format(token.lower()): True
        for token in nltk.word_tokenize(post)
    }

# Pair each post's word features with its 'class' attribute (the label).
featuresets = [
    (dialogue_act_features(entry.text), entry.get('class'))
    for entry in posts
]

# The first 10% of the feature sets are held out for evaluation; the
# remainder is used for training.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
holdout_accuracy = nltk.classify.accuracy(classifier, test_set)
print(holdout_accuracy)

In [None]:
# `line` is not assigned in any visible cell of this notebook, so on a fresh
# kernel this cell raised NameError. Define the utterance to classify here;
# replace the sample text as needed.
line = "Can you make another video like this"
print(classifier.classify(dialogue_act_features(line)))