# Integrate workload-balancing logic using both a heuristic and an ML approach.

In [4]:
# Robust loader + workload balancing (heuristic + ML) + safe handling of .xls-as-csv
import os
import io
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# ---------- Config ----------
# Directory that is searched for the task dataset.
folder = r"C:\Users\dell\Downloads\project"

# File names tried in this order; the first one present in `folder` is used.
candidates = [
    "Cleaned_AI_Task_Data.xls",
    "Cleaned_AI_Task_Data.xlsx",
    "Cleaned_AI_Task_Data.csv",
    "Cleaned_AI_Task_Data_NLP.xls",
    "AI_Powered_Task_Management_System_2000.csv",
]

In [8]:
# ---------- 1) find file ----------
# Resolve the dataset path. Known file names are tried in priority order first;
# if none exists, any regular file whose name contains the dataset stem is used.
if not os.path.isdir(folder):
    raise FileNotFoundError(f"Data folder does not exist: {folder}")

# isfile (not exists) so a directory that happens to share a candidate name
# is never handed to the loader.
file_path = next(
    (
        path
        for path in (os.path.join(folder, name) for name in candidates)
        if os.path.isfile(path)
    ),
    None,
)

if file_path is None:
    # os.listdir returns entries in arbitrary order; sorting makes the
    # fallback pick the same file on every run.
    listing = sorted(os.listdir(folder))
    file_path = next(
        (
            os.path.join(folder, entry)
            for entry in listing
            if "Cleaned_AI_Task_Data" in entry
            and not entry.startswith("~$")  # Office lock file left by an open workbook
            and os.path.isfile(os.path.join(folder, entry))
        ),
        None,
    )
    if file_path is None:
        raise FileNotFoundError(f"No file named Cleaned_AI_Task_Data.* found in {folder}. List files: {listing[:30]}")

print(" Using file:", file_path)

 Using file: C:\Users\dell\Downloads\project\Cleaned_AI_Task_Data.xls


In [10]:
# ---------- 2) robust load (handle .xls that is actually CSV) ----------
# Leading bytes that identify genuine Excel containers.
_XLSX_MAGIC = b"PK\x03\x04"                       # ZIP container (.xlsx)
_XLS_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"  # OLE2 compound file (.xls)


def _read_csv_any_encoding(path):
    """Read *path* as CSV, trying several text encodings in turn.

    Raises:
        RuntimeError: if every attempt fails; the first failure is reported
            and chained as the cause.
    """
    first_error = None
    # cp1252 is tried before latin1 because latin1 can decode any byte
    # sequence: placed first it would always "succeed" and turn Windows
    # punctuation (curly quotes, dashes) into control characters.
    for enc in ("utf-8", "cp1252", "latin1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception as exc:  # decode or parse failure: try the next encoding
            if first_error is None:
                first_error = exc
    raise RuntimeError(f"Failed to read as CSV: {first_error}") from first_error


def try_load(path):
    """Load a tabular file into a DataFrame even if its extension is misleading.

    The first 4096 bytes are inspected. Files carrying an Excel signature are
    read with the Excel engines; files with a ``.csv`` extension, or whose
    first line contains a comma, are read as CSV. Anything else is attempted
    as Excel first and as CSV as a last resort.

    Args:
        path: Path of the file to load.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        RuntimeError: if the file cannot be read by any of the strategies.
    """
    ext = os.path.splitext(path)[1].lower()

    # Peek at the start of the file; an unreadable file simply yields no hints
    # and the readers below report the real error.
    try:
        with open(path, "rb") as fh:
            head = fh.read(4096)
    except OSError:
        head = b""

    # A real workbook is binary: decoding it leniently can still produce commas
    # and newlines, so the signature check must win over the text heuristic.
    is_excel_binary = head.startswith((_XLSX_MAGIC, _XLS_MAGIC))

    head_text = head.decode("utf-8", errors="ignore")
    stripped = head_text.strip()
    first_line = stripped.splitlines()[0] if stripped else ""
    looks_like_csv = "\n" in head_text and "," in first_line

    if not is_excel_binary and (ext == ".csv" or looks_like_csv):
        return _read_csv_any_encoding(path)

    # Try the xlsx engine, then the legacy xls engine. Each may fail because
    # the engine is not installed or the format does not match.
    excel_error = None
    for engine in ("openpyxl", "xlrd"):
        try:
            return pd.read_excel(path, engine=engine)
        except Exception as exc:
            excel_error = exc

    # Last resort: the file may be plain text without any commas.
    try:
        return pd.read_csv(path)
    except Exception as csv_error:
        raise RuntimeError(
            f"Failed to read file as Excel or CSV: {excel_error}; fallback CSV failed: {csv_error}"
        ) from csv_error

# Load the dataset once, then report its dimensions and column names.
df = try_load(file_path)
loaded_shape = df.shape
column_names = df.columns.tolist()
print(" Loaded. Shape:", loaded_shape)
print("Columns:", column_names)

 Loaded. Shape: (2000, 20)
Columns: ['Task_ID', 'Title', 'Description', 'Assignee', 'Created_Date', 'Due_Date', 'Completed_Date', 'Status', 'Priority', 'Estimated_Hours', 'Actual_Hours', 'Project', 'Labels/Tags', 'Task_Complexity', 'Predicted_Priority', 'Delay_Risk_Score', 'Completion_Probability', 'AI_Recommendation', 'Task_Duration_Days', 'Actual_Duration_Days']


In [12]:
# ---------- 3) ensure we have a priority column (try common alternatives, else create) ----------
# Lower-cased column names that are accepted as an existing priority field.
known_priority_names = ("priority", "priority_level", "task_priority", "urgency", "prioritylabel")
priority_candidates = [col for col in df.columns if col.lower() in known_priority_names]
priority_col = priority_candidates[0] if priority_candidates else None

if priority_col is not None:
    # An existing column wins; normalise its name to "priority".
    print(f" Using existing priority column: '{priority_col}'")
    df.rename(columns={priority_col: "priority"}, inplace=True)
else:
    # No priority column: look for an estimated-hours column to derive one from.
    est_col = next(
        (
            name
            for name in ("Estimated_Hours", "Estimated Hours", "Est_Hours", "EstimatedTime")
            if name in df.columns
        ),
        None,
    )

    if est_col is None:
        # Nothing to infer from: draw priorities at random with a fixed seed
        # so repeated runs produce the same labels.
        print(" No priority or estimated-hours column found. Creating synthetic 'priority'.")
        np.random.seed(42)
        df["priority"] = np.random.choice(["Low","Medium","High"], size=len(df))
    else:
        # Simple rule: bucket the hours into (-1, 3], (3, 7] and (7, 1e9].
        print(f" No priority column found. Creating 'priority' using '{est_col}' (heuristic).")
        hours = df[est_col].astype(float)
        buckets = pd.cut(hours, bins=[-1, 3, 7, 1e9], labels=["Low", "Medium", "High"])
        df["priority"] = buckets.astype(str)


 Using existing priority column: 'Priority'


In [16]:
# ---------- 4) ensure we have an assignee/employee column for ML target (create if absent) ----------
# Lower-cased column names that are accepted as the assignee field.
known_assignee_names = ("assignee","assigned_to","assigned","owner","employee")
assignee_candidates = [col for col in df.columns if col.lower() in known_assignee_names]
assignee_col = assignee_candidates[0] if assignee_candidates else None

if assignee_col is None:
    # No usable column: fabricate assignees (fixed seed) so the modelling demo can run.
    print(" No assignee column found. Creating synthetic 'assignee' values for ML demo.")
    np.random.seed(1)
    df["assignee"] = np.random.choice(["Alice","Bob","Charlie","David"], size=len(df))
else:
    # Normalise the detected column's name to "assignee".
    df.rename(columns={assignee_col: "assignee"}, inplace=True)

In [18]:
# ---------- 5) Ensure an Estimated_Hours column exists (create if missing) ----------
if "Estimated_Hours" not in df.columns:
    # Only numeric columns can stand in for hours: a name match on a text or
    # timestamp column would be renamed and later break model fitting.
    numeric_cols = [
        c for c in df.columns
        if pd.api.types.is_numeric_dtype(df[c]) and not pd.api.types.is_bool_dtype(df[c])
    ]

    # Keywords are tried from most to least specific, so an "hour" column is
    # preferred over a looser "time"/"est" match regardless of column order.
    alt = None
    for keyword in ("hour", "time", "est"):
        alt = next((c for c in numeric_cols if keyword in str(c).lower()), None)
        if alt is not None:
            break

    if alt is not None:
        print(f" No 'Estimated_Hours' column found. Using '{alt}' as 'Estimated_Hours'.")
        df.rename(columns={alt: "Estimated_Hours"}, inplace=True)
    else:
        # create synthetic estimated hours for demo (fixed seed, integers 1..9)
        print(" No estimated-hours column found. Creating synthetic 'Estimated_Hours'.")
        np.random.seed(2)
        df["Estimated_Hours"] = np.random.randint(1, 10, size=len(df))


In [22]:
# ---------- 6) Prepare data for ML assignment (encode safely) ----------
# Encode priority as integer codes; the fitted encoder is kept so incoming
# tasks can be encoded the same way later.
priority_le = LabelEncoder()
df["priority_enc"] = priority_le.fit_transform(df["priority"].astype(str))

# Encode assignee target
assignee_le = LabelEncoder()
df["assignee_enc"] = assignee_le.fit_transform(df["assignee"].astype(str))

# Features for assignment model: priority_enc + Estimated_Hours (you can add more)
X = df[["priority_enc", "Estimated_Hours"]]
y = df["assignee_enc"]

# Stratified splitting fails when a class has a single member or when the test
# split is smaller than the number of classes, so only request it when valid.
class_counts = y.value_counts()
can_stratify = (
    len(class_counts) > 1
    and class_counts.min() >= 2
    and int(len(y) * 0.2) >= len(class_counts)
)

# train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y if can_stratify else None
)

rf = RandomForestClassifier(n_estimators=150, random_state=42)
rf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = rf.predict(X_test)
print(" Assignment model trained. Test accuracy:", round(accuracy_score(y_test, y_pred), 4))

# Passing every encoded label keeps `target_names` aligned even if some
# assignee is absent from the test split; zero_division=0 reports 0.0 for
# classes that were never predicted instead of emitting a warning.
all_labels = np.arange(len(assignee_le.classes_))
print(classification_report(
    y_test,
    y_pred,
    labels=all_labels,
    target_names=assignee_le.classes_,
    zero_division=0,
))

 Assignment model trained. Test accuracy: 0.105
              precision    recall  f1-score   support

       Alice       0.00      0.00      0.00        53
         Bob       0.05      0.04      0.04        47
     Charlie       0.13      0.12      0.13        49
       Diana       0.11      0.18      0.14        51
       Ethan       0.12      0.13      0.13        52
       Fiona       0.12      0.11      0.11        47
      George       0.11      0.12      0.12        51
      Hannah       0.13      0.14      0.14        50

    accuracy                           0.10       400
   macro avg       0.10      0.11      0.10       400
weighted avg       0.10      0.10      0.10       400



In [26]:
# ---------- 7) Heuristic workload summary ----------
# Sum of Estimated_Hours per assignee, converted to a plain dict.
hours_by_assignee = df.groupby("assignee")["Estimated_Hours"].sum()
workload = hours_by_assignee.to_dict()
print(" Current workloads (hours):", workload)

 Current workloads (hours): {'Alice': 2888, 'Bob': 2649, 'Charlie': 2750, 'Diana': 2912, 'Ethan': 2875, 'Fiona': 2516, 'George': 2785, 'Hannah': 2859}


In [28]:

# ---------- 8) Incoming tasks (example) ----------
incoming = pd.DataFrame([
    {"Task_ID": 1001, "priority": "High", "Estimated_Hours": 6},
    {"Task_ID": 1002, "priority": "Critical", "Estimated_Hours": 4},   # unseen label example
    {"Task_ID": 1003, "priority": "Low", "Estimated_Hours": 2},
])

# The model only knows the priority labels it was trained on. Appending new
# labels to the fitted encoder would hand the forest a code it never saw (and
# leave the encoder's classes_ unsorted), so unseen labels are mapped onto a
# trained label instead. The fitted encoder is left untouched.
known_priorities = set(priority_le.classes_)
# Explicit stand-ins for labels outside the training vocabulary.
# NOTE(review): adjust these aliases to the project's priority taxonomy.
priority_aliases = {"Critical": "High", "Urgent": "High"}
# Fallback when no alias applies: the most frequent training label.
default_priority = df["priority"].astype(str).mode().iloc[0]


def _model_priority(label):
    """Return the trained priority label used to encode *label* for the model."""
    label = str(label)
    if label in known_priorities:
        return label
    alias = priority_aliases.get(label)
    return alias if alias in known_priorities else default_priority


unseen_labels = sorted(set(incoming["priority"].astype(str)) - known_priorities)
if unseen_labels:
    print(" Unseen priority labels mapped to trained labels for prediction:",
          {label: _model_priority(label) for label in unseen_labels})

# encode incoming priorities (the original label stays in "priority" for display)
incoming["priority_enc"] = priority_le.transform(incoming["priority"].map(_model_priority))

# ML suggestions (predict assignee index -> decode to name)
incoming["ML_assignee_enc"] = rf.predict(incoming[["priority_enc", "Estimated_Hours"]])
incoming["ML_Assignee"] = assignee_le.inverse_transform(incoming["ML_assignee_enc"])

# Heuristic assignment: always assign to person with lowest current workload (updates as we assign)
def heuristic_assign(row, workload_map):
    """Assign a task to the least-loaded person and record the added hours.

    Args:
        row: Mapping-like task record providing ``row["Estimated_Hours"]``.
        workload_map: Dict of assignee name -> current hours. It is updated
            in place so consecutive calls see the new totals.

    Returns:
        The chosen assignee's name. Ties go to the first such key in the
        dict's iteration order.

    Raises:
        ValueError: if ``workload_map`` is empty.
    """
    if not workload_map:
        raise ValueError("workload_map is empty: there is nobody to assign the task to")

    # pick least-loaded
    assignee = min(workload_map, key=workload_map.get)

    hours = row["Estimated_Hours"]
    # A missing estimate must not turn the running total into NaN: every later
    # comparison against NaN is False, which would corrupt the `min` selection.
    if not pd.isna(hours):
        workload_map[assignee] += hours
    return assignee

# Run the simulation on a separate dict so `workload` keeps the current totals.
hw = dict(workload)
# Every assignee known to the encoder takes part; those without hours start at 0.
for person in assignee_le.classes_:
    if person not in hw:
        hw[person] = 0


def _assign_least_loaded(task_row):
    """Assign one incoming task against the running simulation totals in `hw`."""
    return heuristic_assign(task_row, hw)


incoming["Heuristic_Assignee"] = incoming.apply(_assign_least_loaded, axis=1)


In [30]:
# ---------- 9) Combine / Display results ----------
# Show the ML suggestion and the heuristic suggestion side by side.
comparison_columns = ["Task_ID","priority","Estimated_Hours","ML_Assignee","Heuristic_Assignee"]
print("\n Incoming task assignments:")
print(incoming[comparison_columns])

# How the final assignee is chosen is a policy decision. Possible rules:
#   - use ML when both suggestions agree, otherwise the heuristic;
#   - weight the ML suggestion by its predicted probability, if available.
# This notebook takes the heuristic suggestion as final to keep the load balanced.
incoming["Final_Assignee"] = incoming["Heuristic_Assignee"]

final_columns = ["Task_ID","Final_Assignee"]
print("\n Final assignments (heuristic chosen to balance workload):")
print(incoming[final_columns])

# ---------- 10) Save model(s) if you want ----------
# Optional persistence of the fitted model and encoders (left disabled):
# import joblib
# joblib.dump(rf, "assignment_rf.pkl")
# joblib.dump(priority_le, "priority_le.pkl")
# joblib.dump(assignee_le, "assignee_le.pkl")


 Incoming task assignments:
   Task_ID  priority  Estimated_Hours ML_Assignee Heuristic_Assignee
0     1001      High                6         Bob              Fiona
1     1002  Critical                4      George              Fiona
2     1003       Low                2       Diana              Fiona

 Final assignments (heuristic chosen to balance workload):
   Task_ID Final_Assignee
0     1001          Fiona
1     1002          Fiona
2     1003          Fiona
