# Apply GridSearchCV for hyperparameter tuning

In [30]:
# GridSearchCV with robust file loading (handles .xls that is actually CSV)
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score



In [32]:
# ---------- Config: update folder if needed ----------
# Directory that holds the dataset. Set the TASK_DATA_DIR environment variable
# to point at another location without editing the notebook; the original
# path stays as the default so existing setups keep working unchanged.
folder = os.environ.get("TASK_DATA_DIR") or r"C:\Users\dell\Downloads\project"

# Dataset file names to look for, in order of preference
candidates = [
    "Cleaned_AI_Task_Data.xls",
    "Cleaned_AI_Task_Data.xlsx",
    "Cleaned_AI_Task_Data.csv",
    "Cleaned_AI_Task_Data_NLP.xls",
    "AI_Powered_Task_Management_System_2000.csv"
]




In [34]:
# ---------- 1) find file ----------
# Fail early with a clear message when the configured folder is missing.
if not os.path.isdir(folder):
    raise FileNotFoundError(f"Data folder not found: {folder}")

# os.listdir returns entries in arbitrary order; sort once (files only) so the
# fallback pick and the error listing are the same on every run and platform.
folder_files = sorted(
    entry for entry in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, entry))
)

# Preferred names first, in the order given by `candidates`. isfile (rather
# than exists) keeps a same-named directory from being picked as the dataset.
file_path = next(
    (
        candidate_path
        for candidate_path in (os.path.join(folder, name) for name in candidates)
        if os.path.isfile(candidate_path)
    ),
    None,
)

# fallback: first file (alphabetically) whose name contains the base name
if file_path is None:
    file_path = next(
        (os.path.join(folder, entry) for entry in folder_files if "Cleaned_AI_Task_Data" in entry),
        None,
    )

if file_path is None:
    raise FileNotFoundError(f"No `Cleaned_AI_Task_Data.*` found in {folder}. Files: {folder_files[:30]}")

print("Using file:", file_path)

Using file: C:\Users\dell\Downloads\project\Cleaned_AI_Task_Data.xls


In [36]:
# ---------- 2) robust loader: read CSV if content looks like CSV even if extension is .xls ----------
def robust_read(path):
    """Load a tabular file into a DataFrame, trusting content over extension.

    The first 4 KiB of the file are inspected. Files that carry a binary
    workbook signature are read as Excel; files with a ``.csv`` extension or
    whose first line contains a comma are read as CSV, even when the
    extension says ``.xls``.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    RuntimeError
        If no reader could parse the file. The underlying reader error is
        attached as ``__cause__``.
    """
    ext = os.path.splitext(path)[1].lower()

    # Peek at the first bytes. An unreadable file just skips the sniffing;
    # the readers below then surface the real error.
    try:
        with open(path, "rb") as fh:
            start = fh.read(4096)
    except (OSError, ValueError):
        start = b""
    text = start.decode("utf-8", errors="ignore")

    # Real workbooks start with a binary signature: a ZIP local-file header
    # for .xlsx and the OLE2 compound-file header for legacy .xls. Their
    # bytes can contain commas and newlines by chance, so check the signature
    # before applying the CSV heuristic.
    is_binary_workbook = start.startswith(
        (b"PK\x03\x04", b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1")
    )

    stripped = text.strip()
    first_line = stripped.splitlines()[0] if stripped else ""
    looks_like_csv = "\n" in text and "," in first_line

    if not is_binary_workbook and (ext == ".csv" or looks_like_csv):
        last_error = None
        # cp1252 must be tried before latin1: latin1 maps every byte to a
        # character and therefore never fails to decode, which would make any
        # encoding listed after it unreachable.
        for enc in ("utf-8", "cp1252", "latin1"):
            try:
                return pd.read_csv(path, encoding=enc)
            except Exception as err:
                last_error = err
        raise RuntimeError(
            "Failed to read file as CSV (tried utf-8, latin1, cp1252)."
        ) from last_error

    # try xlsx then xls engines; a missing engine or a format mismatch simply
    # moves on to the next option (deliberate best effort)
    for engine in ("openpyxl", "xlrd"):
        try:
            return pd.read_excel(path, engine=engine)
        except Exception:
            continue

    # last resort, try as CSV
    try:
        return pd.read_csv(path)
    except Exception as e:
        raise RuntimeError(f"Failed to read file as Excel or CSV: {e}") from e

# Read the dataset once, then report its dimensions and header names
df = robust_read(file_path)
loaded_shape = df.shape
loaded_columns = df.columns.tolist()
print("Loaded. Shape:", loaded_shape)
print("Columns:", loaded_columns)

Loaded. Shape: (2000, 20)
Columns: ['Task_ID', 'Title', 'Description', 'Assignee', 'Created_Date', 'Due_Date', 'Completed_Date', 'Status', 'Priority', 'Estimated_Hours', 'Actual_Hours', 'Project', 'Labels/Tags', 'Task_Complexity', 'Predicted_Priority', 'Delay_Risk_Score', 'Completion_Probability', 'AI_Recommendation', 'Task_Duration_Days', 'Actual_Duration_Days']


In [38]:
# ---------- 3) Ensure required columns exist (priority, assignee, Estimated_Hours) ----------
# Normalize column names
# Lookup from each lower-cased, whitespace-stripped header to the original label
cols_lower = [label.lower().strip() for label in df.columns]
col_map = dict(zip(cols_lower, df.columns))

def find_col(possible_names):
    """Return the original column label for the first alias present in ``col_map``.

    Aliases are compared case-insensitively, in the order given. Returns
    ``None`` when none of them matches.
    """
    lowered_aliases = (alias.lower() for alias in possible_names)
    return next((col_map[key] for key in lowered_aliases if key in col_map), None)

# Locate the three columns the model needs, accepting common alternative names
priority_col = find_col(["priority", "priority_level", "task_priority", "urgency"])
assignee_col = find_col(["assignee", "assigned_to", "assigned", "owner", "employee"])
est_hours_col = find_col(["estimated_hours", "estimated hours", "est_hours", "estimated"])

# Standardise the hours column name as soon as it is found. Doing this only
# when priority is missing would leave a differently-spelled hours column
# (e.g. "estimated_hours") unrenamed, and the fallback further down would
# then relabel an unrelated numeric column as Estimated_Hours.
if est_hours_col:
    df.rename(columns={est_hours_col: "Estimated_Hours"}, inplace=True)

# create or rename to standard names
if priority_col:
    df.rename(columns={priority_col: "priority"}, inplace=True)
elif est_hours_col:
    print("No priority column found — creating by Estimated_Hours heuristic.")
    # Bucket hours into three bands: up to 3 -> Low, up to 7 -> Medium, above -> High
    df["priority"] = pd.cut(df["Estimated_Hours"].astype(float),
                            bins=[-1, 3, 7, 1e9],
                            labels=["Low", "Medium", "High"]).astype(str)
else:
    print("No priority or estimated-hours column found — creating synthetic priority.")
    np.random.seed(42)
    df["priority"] = np.random.choice(["Low", "Medium", "High"], size=len(df))

if assignee_col:
    df.rename(columns={assignee_col: "assignee"}, inplace=True)
else:
    print("No assignee column found — creating synthetic assignees for training.")
    np.random.seed(1)
    df["assignee"] = np.random.choice(["Alice","Bob","Charlie","David"], size=len(df))

if "Estimated_Hours" not in df.columns:
    # No recognisable hours column: use the first numeric column as a
    # stand-in, else create synthetic values
    possible_numeric = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    if possible_numeric:
        df.rename(columns={possible_numeric[0]: "Estimated_Hours"}, inplace=True)
        print(f" Renamed numeric column '{possible_numeric[0]}' to 'Estimated_Hours'.")
    else:
        print("No numeric column found — creating synthetic Estimated_Hours.")
        np.random.seed(2)
        df["Estimated_Hours"] = np.random.randint(1, 10, size=len(df))

# show head
print("\nPreview (first 5 rows):")
print(df.head())


Preview (first 5 rows):
   Task_ID                      Title  \
0        1              Data Analysis   
1        2                Backend API   
2        3              Data Analysis   
3        4              Data Analysis   
4        5  User Feedback Integration   

                                         Description assignee Created_Date  \
0     Documentation task for Data Analytics project.   George   2024-07-19   
1       Backend API task for Data Analytics project.    Diana   2024-10-14   
2  User Feedback Integration task for Smart Workf...   George   2024-06-18   
3  User Feedback Integration task for Data Analyt...    Ethan   2024-06-13   
4    Documentation task for AI Task Manager project.    Fiona   2024-04-06   

     Due_Date Completed_Date     Status  priority  Estimated_Hours  \
0  2024-07-29     2024-07-29  Completed  Critical                3   
1  2024-10-23     2024-10-23  Completed    Medium               20   
2  2024-06-26     2024-06-26  Completed      High

In [40]:
# ---------- 4) Prepare features & labels ----------
# Turn the two categorical columns into integer codes, one encoder per column
# so the assignee codes can be mapped back to names later
priority_le = LabelEncoder()
assignee_le = LabelEncoder()

priority_codes = priority_le.fit_transform(df["priority"].astype(str))
assignee_codes = assignee_le.fit_transform(df["assignee"].astype(str))
df["priority_enc"] = priority_codes
df["assignee_enc"] = assignee_codes

# Features: encoded priority plus estimated hours; target: encoded assignee
feature_columns = ["priority_enc", "Estimated_Hours"]
X = df[feature_columns]
y = df["assignee_enc"]

# Stop here when there is too little data or only a single assignee class
distinct_assignees = len(np.unique(y))
if not (len(df) >= 5 and distinct_assignees >= 2):
    raise ValueError("Not enough data/classes to run GridSearchCV. Need >=2 assignee classes and enough samples.")

In [42]:
# ---------- 5) Train/test split ----------
# Stratify when possible so every assignee keeps its share in both splits.
# A stratified split is rejected by train_test_split when any class has fewer
# than two samples, or when either split would hold fewer rows than there are
# classes. In those cases fall back to a plain random split instead of crashing.
test_fraction = 0.2
class_counts = np.unique(y, return_counts=True)[1]
n_classes = len(class_counts)
n_test_rows = int(np.ceil(test_fraction * len(df)))
can_stratify = (
    n_classes > 1
    and len(df) >= 10
    and class_counts.min() >= 2
    and min(n_test_rows, len(df) - n_test_rows) >= n_classes
)
stratify = y if can_stratify else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_fraction, random_state=42, stratify=stratify
)

# ---------- 6) Define a reasonable parameter grid ----------
# 3 * 3 * 2 * 2 = 36 hyperparameter combinations for the random forest
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 6, 12],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Base estimator; the fixed seed keeps repeated runs comparable
rf = RandomForestClassifier(random_state=42)

In [44]:
# ---------- 7) GridSearchCV (3-fold to save time) ----------
# Search settings: 3-fold CV scored by accuracy, all cores, per-fit logging
search_settings = {
    "cv": 3,
    "scoring": "accuracy",
    "n_jobs": -1,
    "verbose": 2,
}
grid = GridSearchCV(rf, param_grid, **search_settings)

print("\nRunning GridSearchCV... (this can take some minutes depending on data size)")
grid.fit(X_train, y_train)

# Keep the refitted best model for evaluation in the next step
best_rf = grid.best_estimator_
print("\n GridSearchCV complete.")
print("Best params:", grid.best_params_)



Running GridSearchCV... (this can take some minutes depending on data size)
Fitting 3 folds for each of 36 candidates, totalling 108 fits

 GridSearchCV complete.
Best params: {'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [45]:
# ---------- 8) Evaluate on test set ----------
y_pred = best_rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy (best RF): {acc:.4f}")
print("\nClassification Report:")
# Pass the full label list explicitly. Without it classification_report takes
# the labels from y_test/y_pred and raises when an assignee is absent from
# the test split, because target_names no longer lines up with them.
# LabelEncoder assigns codes 0..n_classes-1 in the order of classes_.
all_labels = np.arange(len(assignee_le.classes_))
print(classification_report(y_test, y_pred, labels=all_labels, target_names=assignee_le.classes_))

# Optionally save the tuned model
# import joblib
# joblib.dump(best_rf, "best_rf_assignment_model.pkl")


Test Accuracy (best RF): 0.0850

Classification Report:
              precision    recall  f1-score   support

       Alice       0.06      0.04      0.04        53
         Bob       0.02      0.02      0.02        47
     Charlie       0.05      0.04      0.04        49
       Diana       0.10      0.18      0.13        51
       Ethan       0.10      0.08      0.09        52
       Fiona       0.12      0.09      0.10        47
      George       0.12      0.16      0.13        51
      Hannah       0.11      0.08      0.09        50

    accuracy                           0.09       400
   macro avg       0.08      0.08      0.08       400
weighted avg       0.08      0.09      0.08       400

