## Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Loading Dataset

In [2]:
df = pd.read_csv("./enhanced_synthetic_task_dataset.csv")

In [3]:
df.head(5)

Unnamed: 0,task_id,task_description,priority,deadline,assigned_to,status,created_at,estimated_hours,actual_hours
0,TASK_00001,tv long impact need among difference get exper...,Low,2025-07-04,,To Do,2025-05-28,9.179174,9.174231
1,TASK_00002,everything security institution community stud...,Medium,2025-05-15,user_15,To Do,2025-05-04,14.947183,16.873465
2,TASK_00003,size through do drop everybody. please do it asap,High,2025-06-05,user_56,To Do,2025-04-07,10.874983,10.951447
3,TASK_00004,century evening medical wife wonder hit baby. ...,Medium,2025-06-24,user_44,,2025-05-02,7.166649,7.679903
4,TASK_00005,church appear score management baby.,Medium,2025-05-13,user_3,In Progress,2025-04-15,5.725363,10.407275


In [4]:
df.shape

(10000, 9)

## check for Missing Values

In [5]:
df.isnull().sum()

task_id               0
task_description    500
priority            500
deadline              0
assigned_to         500
status              500
created_at            0
estimated_hours       0
actual_hours          0
dtype: int64

In [6]:
df.drop_duplicates(inplace=True)
df['task_description'].fillna("No description provided", inplace=True)
df['priority'].fillna(df['priority'].mode()[0], inplace=True)
df['assigned_to'].fillna("unassigned", inplace=True)
df['status'].fillna(df['status'].mode()[0], inplace=True)

df.isnull().sum()

task_id             0
task_description    0
priority            0
deadline            0
assigned_to         0
status              0
created_at          0
estimated_hours     0
actual_hours        0
dtype: int64

## Target Variable : Priority

#### We'll classify tasks based on a target variable. Assuming priority is the target (Low, Medium, High)

In [7]:
label_encoder = LabelEncoder()
df['priority_encoded'] = label_encoder.fit_transform(df['priority'])

In [8]:
df.head()

Unnamed: 0,task_id,task_description,priority,deadline,assigned_to,status,created_at,estimated_hours,actual_hours,priority_encoded
0,TASK_00001,tv long impact need among difference get exper...,Low,2025-07-04,unassigned,To Do,2025-05-28,9.179174,9.174231,2
1,TASK_00002,everything security institution community stud...,Medium,2025-05-15,user_15,To Do,2025-05-04,14.947183,16.873465,3
2,TASK_00003,size through do drop everybody. please do it asap,High,2025-06-05,user_56,To Do,2025-04-07,10.874983,10.951447,1
3,TASK_00004,century evening medical wife wonder hit baby. ...,Medium,2025-06-24,user_44,To Do,2025-05-02,7.166649,7.679903,3
4,TASK_00005,church appear score management baby.,Medium,2025-05-13,user_3,In Progress,2025-04-15,5.725363,10.407275,3


In [9]:
# View encoded classes
print(label_encoder.classes_)

['Critical' 'High' 'Low' 'Medium']


## Feature Extraction using TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text column to TF-IDF
tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(df['task_description'])

# Features (X) and target (y)
X = X_tfidf
y = df['priority_encoded']

In [11]:
X

<10000x975 sparse matrix of type '<class 'numpy.float64'>'
	with 99472 stored elements in Compressed Sparse Row format>

In [12]:
y

0       2
1       3
2       1
3       3
4       3
       ..
9995    3
9996    3
9997    3
9998    3
9999    3
Name: priority_encoded, Length: 10000, dtype: int32

## Train Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train

<8000x975 sparse matrix of type '<class 'numpy.float64'>'
	with 79728 stored elements in Compressed Sparse Row format>

In [15]:
X_test

<2000x975 sparse matrix of type '<class 'numpy.float64'>'
	with 19744 stored elements in Compressed Sparse Row format>

In [16]:
y_train

9254    3
1561    3
1670    3
6087    0
6669    2
       ..
5734    3
5191    3
5390    3
860     0
7270    1
Name: priority_encoded, Length: 8000, dtype: int32

In [17]:
y_test

6252    1
4684    2
1731    3
4742    3
4521    0
       ..
6412    3
8285    3
7853    3
1095    1
6929    1
Name: priority_encoded, Length: 2000, dtype: int32

## Naive Bayes Classification

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict
y_pred_nb = nb_model.predict(X_test)
priority_accuracy = accuracy_score(y_test, y_pred_nb)

# Evaluate
print("Naive Bayes Results:\n")
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_))
print(f"Accuracy: {priority_accuracy * 100:.2f}%\n")

Naive Bayes Results:

[[  0  14   0 178]
 [  0  37   0 527]
 [  0  26   0 348]
 [  0  45   2 823]]
              precision    recall  f1-score   support

    Critical       0.00      0.00      0.00       192
        High       0.30      0.07      0.11       564
         Low       0.00      0.00      0.00       374
      Medium       0.44      0.95      0.60       870

    accuracy                           0.43      2000
   macro avg       0.19      0.25      0.18      2000
weighted avg       0.28      0.43      0.29      2000

Accuracy: 43.00%



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM Classification

In [19]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Train model
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

# Predict
y_pred_svm = svm_model.predict(X_test)
priority_svm_accuracy = accuracy_score(y_test, y_pred_svm)

# Evaluate
print("SVM Results:\n")
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))
print(f"Accuracy : {priority_svm_accuracy * 100:.2f}%\n")

SVM Results:

[[  1  58  16 117]
 [  5 144  62 353]
 [  6 101  35 232]
 [ 16 205  76 573]]
              precision    recall  f1-score   support

    Critical       0.04      0.01      0.01       192
        High       0.28      0.26      0.27       564
         Low       0.19      0.09      0.12       374
      Medium       0.45      0.66      0.53       870

    accuracy                           0.38      2000
   macro avg       0.24      0.25      0.23      2000
weighted avg       0.31      0.38      0.33      2000

Accuracy : 37.65%



## Target Variable : Status

#### We'll classify tasks based on a target variable. Assuming status is the target (To Do, In Progress, Done, Blocked)

In [20]:
status_label_encoder = LabelEncoder()
df['status_encoded'] = status_label_encoder.fit_transform(df['status'])

#view encoded class labels
status_label_encoder.classes_

array(['Blocked', 'Done', 'In Progress', 'To Do'], dtype=object)

In [21]:
z = df['status_encoded']

X_train, X_test, z_train, z_test = train_test_split(X, z, test_size=0.3, random_state=42)

In [22]:
z_train

9069    1
2603    2
7738    3
1579    1
5058    3
       ..
5734    0
5191    2
5390    0
860     2
7270    2
Name: status_encoded, Length: 7000, dtype: int32

In [23]:
z_test

6252    3
4684    3
1731    0
4742    2
4521    1
       ..
8014    2
1074    1
3063    0
6487    3
4705    3
Name: status_encoded, Length: 3000, dtype: int32

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train model
nb_model = MultinomialNB()
nb_model.fit(X_train, z_train)

# Predict
z_pred_nb = nb_model.predict(X_test)
status_accuracy = accuracy_score(z_test, z_pred_nb)

# Evaluate
print("Naive Bayes Results:\n")
print(confusion_matrix(z_test, z_pred_nb))
print(classification_report(z_test, z_pred_nb, target_names=label_encoder.classes_))
print(f"Accuracy : {status_accuracy * 100:.2f}%\n")

Naive Bayes Results:

[[   0    0   12  276]
 [   0    1   24  506]
 [   0    3   40  833]
 [   0    1   54 1250]]
              precision    recall  f1-score   support

    Critical       0.00      0.00      0.00       288
        High       0.20      0.00      0.00       531
         Low       0.31      0.05      0.08       876
      Medium       0.44      0.96      0.60      1305

    accuracy                           0.43      3000
   macro avg       0.24      0.25      0.17      3000
weighted avg       0.32      0.43      0.28      3000

Accuracy : 43.03%



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# SVM Classficiation

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Train model
svm_model = LinearSVC()
svm_model.fit(X_train, z_train)

# Predict
z_pred_svm = svm_model.predict(X_test)
status_svm_accuracy = accuracy_score(z_test, z_pred_svm)

# Evaluate
print("SVM Results:\n")
print(confusion_matrix(z_test, z_pred_svm))
print(classification_report(z_test, z_pred_svm, target_names=label_encoder.classes_))
print(f"Accuracy : {status_svm_accuracy * 100:.2f}%\n")

SVM Results:

[[  5  33  66 184]
 [ 11  73 119 328]
 [ 17 106 195 558]
 [ 35 152 293 825]]
              precision    recall  f1-score   support

    Critical       0.07      0.02      0.03       288
        High       0.20      0.14      0.16       531
         Low       0.29      0.22      0.25       876
      Medium       0.44      0.63      0.52      1305

    accuracy                           0.37      3000
   macro avg       0.25      0.25      0.24      3000
weighted avg       0.32      0.37      0.33      3000

Accuracy : 36.60%



## Overall Results till now

| Metrics  | Naive Bayes | SVM    | Target Variable |
| -------- | ----------- | ------ | --------------- |
| Accuracy | 42%         | 37.65% | Priority        |
| Accuracy | 43%         | 37.45% | Status          |

## Check for Imbalanced Classes

In [26]:
print(df['priority'].value_counts())

Medium      4233
High        2904
Low         1915
Critical     948
Name: priority, dtype: int64


In [27]:
print(df['status'].value_counts())

To Do          4341
In Progress    2841
Done           1884
Blocked         934
Name: status, dtype: int64


### By the above results, we can get to know the classes are imbalanced. So, for that we try to balance the classes

In [28]:
from sklearn.utils import resample

# Example: Upsample minority classes
df_majority = df[df['priority'] == 'Low']
df_minority = df[df['priority'] != 'Low']

df_minority_upsampled = resample(df_minority, 
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_balanced = pd.concat([df_majority, df_minority_upsampled])

In [29]:
print(df_balanced['priority'].value_counts())

Low         1915
Medium      1010
High         671
Critical     234
Name: priority, dtype: int64


### Still, the classes are imbalanced, let's try to add more features for classification

In [30]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

# Convert categorical features
encoder = OneHotEncoder()
categorical_features = encoder.fit_transform(df[['assigned_to', 'status']])

# Combine with TF-IDF matrix
A = hstack([X_tfidf, categorical_features])


# Train Test Split
A_train, A_test, y_train, y_test = train_test_split(A, y, test_size=0.3, random_state=42)

# Classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train model
nb_model = MultinomialNB()
nb_model.fit(A_train, y_train)

# Predict
y_pred_nb = nb_model.predict(A_test)
priority_accuracy = accuracy_score(y_test, y_pred_nb)

# Evaluate
print("Naive Bayes Results:\n")
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_))
print(f"Accuracy: {priority_accuracy * 100:.2f}%\n")

Naive Bayes Results:

[[   0   56    4  227]
 [   0  157   13  667]
 [   0  106   10  452]
 [   0  211   15 1082]]
              precision    recall  f1-score   support

    Critical       0.00      0.00      0.00       287
        High       0.30      0.19      0.23       837
         Low       0.24      0.02      0.03       568
      Medium       0.45      0.83      0.58      1308

    accuracy                           0.42      3000
   macro avg       0.24      0.26      0.21      3000
weighted avg       0.32      0.42      0.32      3000

Accuracy: 41.63%



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Try with Hyperparameter Tuning

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

grid = GridSearchCV(SVC(), param_grid, cv=3)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)

Best params: {'C': 0.1, 'kernel': 'linear'}


In [32]:
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score

# Best params from your tuning
best_c = 0.1
best_kernel = 'linear'

# Initialize the final model with the best parameters
final_svm_model = SVC(C=best_c, kernel=best_kernel, random_state=42) # Add random_state for reproducibility

# Train the final model on the entire training data
# Make sure X_train and y_train are your full training sets (not just a fold)
final_svm_model.fit(X_train, y_train)

print("Final SVM model trained with best hyperparameters.")

# Make predictions on the unseen test data
y_pred_final_svm = final_svm_model.predict(X_test)
Updated_accuracy = accuracy_score(y_test, y_pred_final_svm)

# Evaluate the final model
print("\nFinal SVM Model Performance on Test Set:")
print(f"Accuracy : {Updated_accuracy * 100:.2f}%\n")

Final SVM model trained with best hyperparameters.

Final SVM Model Performance on Test Set:
Accuracy : 43.60%



## Advanced Resampling Technique

### Step 1: Preparing Data (Pre-Resampling)

In [33]:
!pip install imbalanced-learn

from sklearn.model_selection import train_test_split

# For reproducibility
RANDOM_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y) # stratify=y is crucial for imbalanced data)
                                                    
print(f"Original X_train shape: {X_train.shape}")
print(f"Original y_train shape: {y_train.shape}")
print(f"Original y_test shape: {y_test.shape}")
                                                    
# Check initial class distribution in y_train (optional, but good practice)
from collections import Counter
print("\nClass distribution in original y_train:")
print(Counter(y_train))                                                 

Defaulting to user installation because normal site-packages is not writeable
Original X_train shape: (8000, 975)
Original y_train shape: (8000,)
Original y_test shape: (2000,)

Class distribution in original y_train:
Counter({3: 3387, 1: 2323, 2: 1532, 0: 758})


### Step - 2 : Apply SMOTE to the Training Data

In [34]:
from imblearn.over_sampling import SMOTE
import numpy as np # For potential use with sparse matrices if needed

smote = SMOTE(random_state=RANDOM_SEED)

# Apply SMOTE to your training data ONLY
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"\nResampled X_train shape: {X_train_resampled.shape}")
print(f"Resampled y_train shape: {y_train_resampled.shape}")

print("\nClass distribution in resampled y_train:")
print(Counter(y_train_resampled))


Resampled X_train shape: (13548, 975)
Resampled y_train shape: (13548,)

Class distribution in resampled y_train:
Counter({3: 3387, 0: 3387, 1: 3387, 2: 3387})


### Step - 3 : Train the model on resampled data

In [35]:
from sklearn.svm import SVC # Or LinearSVC, depending on what you decided for best params
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder # Assuming you still need this for target_names

final_svm_model_smote = SVC(C=0.1, kernel='linear', random_state=RANDOM_SEED) # Assuming these are your best params

# Train the model on the RESAMPLED training data
final_svm_model_smote.fit(X_train_resampled, y_train_resampled)

print("Final SVM model trained with SMOTE-resampled data.")

# Make predictions on the unseen test data
y_pred_smote_svm = final_svm_model_smote.predict(X_test)
resampled_accuracy = accuracy_score(y_test, y_pred_smote_svm)

# Evaluate the final model
print("\nFinal SVM Model Performance on Original Test Set (after SMOTE):")
print(f"Accuracy : {resampled_accuracy * 100:.2f}%\n")
print(confusion_matrix(y_test, y_pred_smote_svm))
print(classification_report(y_test, y_pred_smote_svm, target_names=label_encoder.classes_))

Final SVM model trained with SMOTE-resampled data.

Final SVM Model Performance on Original Test Set (after SMOTE):
Accuracy : 27.10%

[[ 31  51  56  52]
 [ 96 134 167 184]
 [ 68  96 107 112]
 [169 178 229 270]]
              precision    recall  f1-score   support

    Critical       0.09      0.16      0.11       190
        High       0.29      0.23      0.26       581
         Low       0.19      0.28      0.23       383
      Medium       0.44      0.32      0.37       846

    accuracy                           0.27      2000
   macro avg       0.25      0.25      0.24      2000
weighted avg       0.31      0.27      0.29      2000



## Accuracy Dropped, So let's try with another technique

In [36]:
# Use Class Weights (instead of SMOTE)

from sklearn.svm import SVC

svm = SVC(kernel='linear', C=0.1, class_weight='balanced')
svm.fit(X_train, y_train)

# Switch to Tree-based Models (Handle imbalance better)

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


# Evaluate Using Per-Class F1-Scores, Not Just Accuracy

from sklearn.metrics import classification_report, accuracy_score

rf_accuracy = accuracy_score(y_test, y_pred_rf)

print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))
print(f"Accuracy : {rf_accuracy * 100:.2f}%\n")

              precision    recall  f1-score   support

    Critical       0.12      0.01      0.01       190
        High       0.29      0.14      0.19       581
         Low       0.20      0.10      0.14       383
      Medium       0.43      0.77      0.55       846

    accuracy                           0.39      2000
   macro avg       0.26      0.25      0.22      2000
weighted avg       0.32      0.39      0.32      2000

Accuracy : 38.60%



## Trial with Random Sampler Techniques

In [37]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# ========== RANDOM OVERSAMPLING ==========
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

X_train_ros, X_test_ros, y_train_ros, y_test_ros = train_test_split(X_ros, y_ros, test_size=0.2, random_state=42, stratify=y_ros)

model_ros = MultinomialNB()
model_ros.fit(X_train_ros, y_train_ros)
y_pred_ros = model_ros.predict(X_test_ros)

print("=== Random OverSampler Results ===")
print("Accuracy:", accuracy_score(y_test_ros, y_pred_ros))
print(classification_report(y_test_ros, y_pred_ros, target_names=label_encoder.classes_))
print(confusion_matrix(y_test_ros, y_pred_ros))


# ========== RANDOM UNDERSAMPLING ==========
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_rus, y_rus, test_size=0.2, random_state=42, stratify=y_rus)

model_rus = MultinomialNB()
model_rus.fit(X_train_rus, y_train_rus)
y_pred_rus = model_rus.predict(X_test_rus)

print("\n=== Random UnderSampler Results ===")
print("Accuracy:", accuracy_score(y_test_rus, y_pred_rus))
print(classification_report(y_test_rus, y_pred_rus, target_names=label_encoder.classes_))
print(confusion_matrix(y_test_rus, y_pred_rus))


=== Random OverSampler Results ===
Accuracy: 0.3720106288751107
              precision    recall  f1-score   support

    Critical       0.41      0.53      0.46       847
        High       0.36      0.30      0.33       846
         Low       0.37      0.37      0.37       847
      Medium       0.33      0.29      0.31       847

    accuracy                           0.37      3387
   macro avg       0.37      0.37      0.37      3387
weighted avg       0.37      0.37      0.37      3387

[[451 120 138 138]
 [217 253 194 182]
 [197 163 313 174]
 [240 173 191 243]]

=== Random UnderSampler Results ===
Accuracy: 0.27140974967061926
              precision    recall  f1-score   support

    Critical       0.25      0.27      0.26       190
        High       0.27      0.30      0.28       189
         Low       0.30      0.26      0.28       190
      Medium       0.28      0.25      0.26       190

    accuracy                           0.27       759
   macro avg       0.27      0.

## Let's switch to class weighted svm

In [38]:
from sklearn.svm import LinearSVC
from sklearn.utils.class_weight import compute_class_weight

# Calculate class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
weight_dict = dict(zip(np.unique(y), class_weights))

# Train/test split on original cleaned data (no oversampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

svm = LinearSVC(class_weight=weight_dict, max_iter=10000)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.3055
              precision    recall  f1-score   support

    Critical       0.10      0.16      0.12       190
        High       0.30      0.25      0.27       581
         Low       0.22      0.21      0.22       383
      Medium       0.42      0.42      0.42       846

    accuracy                           0.31      2000
   macro avg       0.26      0.26      0.26      2000
weighted avg       0.32      0.31      0.31      2000

[[ 30  46  35  79]
 [ 80 145  95 261]
 [ 50  98  82 153]
 [139 188 165 354]]
