In [1]:
import re
import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saten\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample Input Text
text = """
Rahul wakes up early every day. He goes to college in the morning and comes back at 3 pm.
At present, Rahul is outside. He has to buy the snacks for all of us. John should finish the report by 5 pm.
Anita needs to schedule a meeting for tomorrow.
"""

In [4]:
# Preprocessing Function
def preprocess_text(text):
    """
    Convert text to lowercase, remove punctuation, and tokenize into sentences.
    Challenge: Some sentence tokenization may not work correctly if punctuation is missing.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]  # Use spaCy's sentence segmentation
    return sentences

In [5]:
# Extract Tasks using Heuristics
def extract_tasks(sentences):
    """
    Identify task-related sentences by checking for action words.
    Challenge: Some tasks may not explicitly contain 'to', 'should', or 'needs to', leading to false negatives.
    """
    task_sentences = []
    for sentence in sentences:
        doc = nlp(sentence)
        verbs = [token.text for token in doc if token.pos_ == "VERB"]
        if any(verb in sentence for verb in verbs):  # Check for action words
            if "to" in sentence or "should" in sentence or "needs to" in sentence:
                task_sentences.append(sentence)
    return task_sentences

In [6]:
# Extract Entities (Assignee & Deadline)
def extract_entities(task_sentences):
    """
    Extract names of people (assignee) and deadlines (time, date).
    Challenge: Named entity recognition (NER) might miss informal references to time or names not in training data.
    """
    structured_tasks = []
    for sentence in task_sentences:
        doc = nlp(sentence)
        assignee = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        deadline = [ent.text for ent in doc.ents if ent.label_ in ["TIME", "DATE"]]
        structured_tasks.append({"task": sentence, "assignee": assignee, "deadline": deadline})
    return structured_tasks

In [7]:
# Categorization using LDA (Topic Modeling)
def categorize_tasks(task_sentences):
    """
    Categorize tasks into predefined topics using LDA.
    Challenge: LDA requires a sufficient amount of text data to generate meaningful topics.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(task_sentences)
    lda = LatentDirichletAllocation(n_components=3, random_state=42)
    lda.fit(X)
    topics = lda.transform(X)
    categories = ["Work", "Personal", "Miscellaneous"]
    categorized_tasks = [{"task": task_sentences[i], "category": categories[topics[i].argmax()]} for i in range(len(task_sentences))]
    return categorized_tasks

In [8]:
# Execution
sentences = preprocess_text(text)
tasks = extract_tasks(sentences)
structured_tasks = extract_entities(tasks)
categorized_tasks = categorize_tasks(tasks)

In [9]:
# Output Results
print("Extracted Tasks:", structured_tasks)
print("Categorized Tasks:", categorized_tasks)

Extracted Tasks: [{'task': 'rahul wakes up early every day he goes to college in the morning and comes back at 3 pm\nat present rahul is outside he has to buy the snacks for all of us john should finish the report by 5 pm\nanita needs to schedule a meeting for tomorrow', 'assignee': ['anita'], 'deadline': ['early every day', 'the morning', '3 pm', '5 pm', 'tomorrow']}]
Categorized Tasks: [{'task': 'rahul wakes up early every day he goes to college in the morning and comes back at 3 pm\nat present rahul is outside he has to buy the snacks for all of us john should finish the report by 5 pm\nanita needs to schedule a meeting for tomorrow', 'category': 'Personal'}]
