## Importing Libraries

In [2]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used below; packages that are already present are
# reported as up-to-date instead of being downloaded again.
nltk.download('punkt')
# Newer NLTK releases load the word_tokenize models from 'punkt_tab' rather
# than 'punkt'; fetch both so tokenization works on either kind of install.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\reddy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\reddy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## Loading Dataset

In [3]:
# Load the task dataset; the path is relative to the current working directory.
csv_path = "./enhanced_synthetic_task_dataset.csv"
df = pd.read_csv(csv_path)

In [4]:
# Preview the first 10 rows of the loaded dataset.
df.head(10)

Unnamed: 0,task_id,task_description,priority,deadline,assigned_to,status,created_at,estimated_hours,actual_hours
0,TASK_00001,tv long impact need among difference get exper...,Low,2025-07-04,,To Do,2025-05-28,9.179174,9.174231
1,TASK_00002,everything security institution community stud...,Medium,2025-05-15,user_15,To Do,2025-05-04,14.947183,16.873465
2,TASK_00003,size through do drop everybody. please do it asap,High,2025-06-05,user_56,To Do,2025-04-07,10.874983,10.951447
3,TASK_00004,century evening medical wife wonder hit baby. ...,Medium,2025-06-24,user_44,,2025-05-02,7.166649,7.679903
4,TASK_00005,church appear score management baby.,Medium,2025-05-13,user_3,In Progress,2025-04-15,5.725363,10.407275
5,TASK_00006,trial spring human carry perhaps phone week.,Low,2025-06-10,user_23,Blocked,2025-05-27,3.601126,5.683852
6,TASK_00007,,Medium,2025-07-06,user_4,To Do,2025-05-16,4.422873,8.642398
7,TASK_00008,scene return begin region son value protect in...,Medium,2025-05-30,user_4,To Do,2025-04-10,7.818175,6.997669
8,TASK_00009,product health wish partner across represent m...,Medium,2025-04-22,user_68,To Do,2025-04-12,4.575585,6.275073
9,TASK_00010,account my free whose suggest. thanks!,Low,2025-04-09,user_63,,2025-04-07,6.935844,6.766721


## Initializing Tools

In [5]:
# Shared tools for the text-cleaning step.
# Porter stemmer instance used to reduce each kept token to its stem.
stemmer = PorterStemmer()
# English stopwords held in a set so per-token membership checks are cheap.
stop_words = {*stopwords.words('english')}

In [6]:
# Display the stopword set as a sanity check that the corpus loaded.
# NOTE(review): this renders the entire set; `len(stop_words)` or a small
# sorted sample would keep the cell output short.
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [7]:
# Display the stemmer's repr to confirm the object was created.
stemmer

<PorterStemmer>

## Preprocessing the task_description column

In [8]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of rebuilding a regex pattern on every call.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Turn a raw task description into a space-joined string of stemmed tokens.

    Steps: lowercase, delete ASCII punctuation, tokenize with NLTK, drop
    English stopwords, then Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str or scalar
        Raw description. Missing values (``None``/``NaN``/``pd.NA``) are
        treated as "no text"; any other non-string scalar is converted
        with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing or no token
        survives the filtering.
    """
    # Missing descriptions must not reach the tokenizer: a float NaN has no
    # .lower(), and casting it to str beforehand can produce the literal
    # word "nan", which would then be stemmed like real text.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # 1. Normalize: lowercase & remove punctuation
    # NOTE(review): punctuation is deleted rather than replaced by a space, so
    # "don't" becomes "dont", which is not in stop_words -- confirm intended.
    normalized = text.lower().translate(_PUNCT_TABLE)

    # 2-4. Tokenize, drop stopwords, and stem the tokens that remain
    stems = [
        stemmer.stem(token)
        for token in nltk.word_tokenize(normalized)
        if token not in stop_words
    ]

    # Return the joined, cleaned text
    return " ".join(stems)


# Apply to the raw column directly (no astype(str) cast): preprocess_text maps
# missing descriptions to "" instead of letting them turn into the text "nan".
df['cleaned_description'] = df['task_description'].apply(preprocess_text)

In [9]:
# Show the original and cleaned text side by side for the first 10 rows.
df[['task_description', 'cleaned_description']].head(10)

Unnamed: 0,task_description,cleaned_description
0,tv long impact need among difference get exper...,tv long impact need among differ get experi he...
1,everything security institution community stud...,everyth secur institut commun student everyth ...
2,size through do drop everybody. please do it asap,size drop everybodi pleas asap
3,century evening medical wife wonder hit baby. ...,centuri even medic wife wonder hit babi thank
4,church appear score management baby.,church appear score manag babi
5,trial spring human carry perhaps phone week.,trial spring human carri perhap phone week
6,,
7,scene return begin region son value protect in...,scene return begin region son valu protect insid
8,product health wish partner across represent m...,product health wish partner across repres meas...
9,account my free whose suggest. thanks!,account free whose suggest thank


## Done with NLP preprocessing on Task Description

### Previewing the Updated Dataset

In [10]:
# Preview the first 5 rows, which now include the cleaned_description column.
df.head(5)

Unnamed: 0,task_id,task_description,priority,deadline,assigned_to,status,created_at,estimated_hours,actual_hours,cleaned_description
0,TASK_00001,tv long impact need among difference get exper...,Low,2025-07-04,,To Do,2025-05-28,9.179174,9.174231,tv long impact need among differ get experi he...
1,TASK_00002,everything security institution community stud...,Medium,2025-05-15,user_15,To Do,2025-05-04,14.947183,16.873465,everyth secur institut commun student everyth ...
2,TASK_00003,size through do drop everybody. please do it asap,High,2025-06-05,user_56,To Do,2025-04-07,10.874983,10.951447,size drop everybodi pleas asap
3,TASK_00004,century evening medical wife wonder hit baby. ...,Medium,2025-06-24,user_44,,2025-05-02,7.166649,7.679903,centuri even medic wife wonder hit babi thank
4,TASK_00005,church appear score management baby.,Medium,2025-05-13,user_3,In Progress,2025-04-15,5.725363,10.407275,church appear score manag babi
