In [None]:
# Mount Google Drive into the Colab runtime; the dataset is read from a path under
# /content/drive in a later cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import textwrap
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the HR dataset CSV from the mounted Drive folder into the working frame `data`.
data = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/HRDataset_v14.csv')
# Handle on the PerfScoreID column; the modelling cell re-reads the column from `data`
# rather than using this variable.
Target = data['PerfScoreID']

In [None]:
# Quick look at the freshly loaded frame: first rows, shape and column names.
# `head` has to be called: printing the bare attribute shows the bound-method repr
# instead of the first five rows.
print(data.head())
print("Loaded shape:", data.shape)
print("Columns sample:", data.columns[:36].tolist())

In [None]:
# Summary statistics of `data` as produced by DataFrame.describe().
print(data.describe())

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Rows that are exact duplicates of an earlier row (an empty frame when there are none).
data[data.duplicated()]

In [None]:
# Column dtypes and non-null counts. DataFrame.info() writes its report itself and
# returns None, so wrapping it in print() only appended a stray "None" line.
data.info()

In [None]:
# Placeholder strings that should be treated as missing values. Only cells whose whole
# value equals one of these tokens are replaced (no substring matching).
missing_values = ["n/a", "na", "--", "Nan", " ", ".", "?"]

# `new_data` stays an independent cleaned snapshot, as before.
new_data = data.replace(missing_values, np.nan)

# The cleaned frame used to be discarded: every later cell kept working on the raw
# `data`. Write the result back so null checks and the imputers downstream see these
# placeholders as NaN.
data = new_data.copy()

# Show a preview plus the per-column null counts instead of dumping the whole frame.
print(new_data.head())
print(new_data.isna().sum())

In [None]:
# Derive each employee's age in whole calendar years relative to REFERENCE_YEAR.
REFERENCE_YEAR = 2020  # ages are expressed relative to this calendar year

data['DOB'] = pd.to_datetime(data['DOB'])

# NOTE(review): presumably the CSV stores DOB with two-digit years - confirm.
# Two-digit years can be parsed into the wrong century (e.g. '68' -> 2068), which puts
# the birth date after the reference year. Shift such dates back by 100 years so the
# age below is correct instead of negative.
wrong_century = data['DOB'].dt.year > REFERENCE_YEAR
data['DOB'] = data['DOB'].where(~wrong_century, data['DOB'] - pd.DateOffset(years=100))

data['Age'] = REFERENCE_YEAR - data['DOB'].dt.year
print(data[['DOB', 'Age']])

In [None]:
# Distinct Age values, as a sanity check for out-of-range ages.
data['Age'].unique()

In [None]:
# --- Step 1: Repair invalid (negative) ages without dropping any rows ---
# Age is computed as reference year minus birth year, so a negative value means the
# birth year landed one century too late (a two-digit year read as 20xx). The true age
# is therefore age + 100. Taking abs() would give a wrong number: -48 would become 48,
# while the real age is 52. Missing ages stay missing.
data['Age'] = data['Age'].where(data['Age'] >= 0, data['Age'] + 100)

# --- Step 2: Define age bins and labels ---
# Right-closed bins; include_lowest below makes the first bin contain exactly 20.
bins = [20, 30, 40, 50, 60, float('inf')]
labels = ['20-30', '31-40', '41-50', '51-60', '60+']

# --- Step 3: Apply binning to create AgeGroup ---
# Ages below the first edge (20) fall outside every bin and become NaN.
data['AgeGroup'] = pd.cut(
    data['Age'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True
)

# Optional check:
print(data[['Age', 'AgeGroup']].head(10))

In [None]:
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

# Gender distribution: number of rows per GenderID.
# One explicitly sized Axes is used; a (13, 5) figure with only subplot(1, 2, 1)
# populated leaves the right half of the canvas blank.
fig, ax = plt.subplots(figsize=(6.5, 5))
sns.countplot(y='GenderID', data=data, hue='GenderID', palette='coolwarm', ax=ax)
ax.set_title('Gender Distribution')

plt.show()

In [None]:
# Race distribution: number of rows per RaceDesc, palette 'viridis'.
# Drawn on its own Axes; plt.subplot(1, 2, 2) in a fresh cell would start a new figure
# and squeeze the chart into its right half.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(y='RaceDesc', data=data, hue='RaceDesc', palette='viridis', ax=ax)
ax.set_title('Race Distribution')
plt.show()

In [None]:
# Age-group distribution: number of rows per AgeGroup bin.
# An explicit Axes avoids the half-width layout that plt.subplot(1, 2, 1) produces
# when it is the only panel on the figure.
fig, ax = plt.subplots(figsize=(6.5, 5))
sns.countplot(y='AgeGroup', data=data, hue='AgeGroup', palette='viridis', ax=ax)
ax.set_title('Age Group')

plt.show()

In [None]:
# Department distribution: number of rows per Department.
# An explicit Axes avoids the half-width layout that plt.subplot(1, 2, 1) produces
# when it is the only panel on the figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(y='Department', data=data, hue='Department', palette='viridis', ax=ax)
ax.set_title('Department Distribution')

plt.show()

In [None]:
# Recruitment-source distribution: number of rows per RecruitmentSource.
# Drawn on its own Axes; plt.subplot(1, 2, 2) on a fresh figure would leave the left
# half of the canvas empty.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(y='RecruitmentSource', data=data, hue='RecruitmentSource', palette='viridis', ax=ax)
ax.set_title('Recruitment Source')
plt.show()

In [None]:
# Parse the hire date, then derive Tenure as 2020 minus the hire year (whole calendar
# years; the month and day of hire are ignored).
data['DateofHire'] = pd.to_datetime(data['DateofHire'])
data['Tenure'] = (2020 - data['DateofHire'].dt.year)


In [None]:
# One-hot encode the termination flag; later cells read the resulting Termd_0 / Termd_1
# columns. The guard makes the cell safe to re-run: once 'Termd' has been replaced by
# its dummies, a second get_dummies call on that column would raise a KeyError.
if 'Termd' in data.columns:
    data = pd.get_dummies(data, columns=['Termd'])

In [None]:
# Share of rows for each Termd_0 value (normalize=True yields proportions, not counts).
data['Termd_0'].value_counts(normalize=True)

In [None]:
# Columns removed from the working frame before modelling.
columns_to_drop = [
    'Employee_Name', 'ManagerName', 'Position', 'State', 'Zip', 'Sex',
    'ManagerID', 'DeptID', 'RaceDesc', 'AgeGroup', 'GenderID',
    'MaritalDesc', 'HispanicLatino', 'CitizenDesc', 'EmpStatusID', 'Age',
    'EngagementSurvey',
    'MarriedID', 'MaritalStatusID', 'EmploymentStatus', 'DateofTermination',
    'DOB', 'DateofHire', 'TermReason', 'PerformanceScore',
    'LastPerformanceReview_Date',
]

# Drop only the listed columns that are actually present, so names that are already
# gone (for example on a re-run) are skipped silently.
present_columns = [name for name in columns_to_drop if name in data.columns]
data = data.drop(columns=present_columns)

In [None]:
# Display the working frame as the cell output.
data

In [None]:
# --- Feature lists ---
numeric_features = [
    'Salary',
    'PositionID',
    'SpecialProjectsCount',
    'DaysLateLast30',
    'EmpSatisfaction',
    'Absences',
    'FromDiversityJobFairID',
    'Tenure',
]
categorical_features = ['Department', 'RecruitmentSource']

# The target column must never end up in the feature matrix, even if it is later
# added to one of the lists above.
X_features = [name for name in numeric_features + categorical_features if name != 'PerfScoreID']
X = data[X_features]
y = data['PerfScoreID']

# --- 1. Train/Test Split: 70/30, stratified on the target ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# --- 2. Numeric preprocessing: mean-impute missing values, then standardise ---
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_preprocessor = Pipeline(steps=numeric_steps)

# --- 3. Categorical preprocessing: mode-impute, then one-hot encode ---
# handle_unknown="ignore" encodes categories unseen during fit as all-zero rows.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_preprocessor = Pipeline(steps=categorical_steps)

# --- 4. ColumnTransformer routing each feature list to its preprocessor ---
numeric_columns = [name for name in numeric_features if name in X_train.columns]
categorical_columns = [name for name in categorical_features if name in X_train.columns]
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numeric_preprocessor, numeric_columns),
        ("categorical", categorical_preprocessor, categorical_columns),
    ],
    remainder="drop",  # columns not listed above are discarded
)

# --- 5. Full pipeline: preprocessing followed by the classifier ---
model = RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42)
full_pipeline = Pipeline(steps=[("preprocess", preprocessor), ("classifier", model)])

# --- 6. Fit on the training split and predict the held-out split ---
print("Fitting basic Scikit-learn pipeline...")
full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)

# --- 7. Evaluate ---
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Pipeline Accuracy: {accuracy:.4f}")
print("\nClassification report:\n", classification_report(y_test, y_pred))

In [None]:
# Display the fitted pipeline object as the cell output.
full_pipeline

In [None]:
def subgroup_report(col, min_count=5):
    """Print a classification report for every value of ``col`` within the test split.

    Parameters
    ----------
    col : str
        Column of the notebook-level frame ``data`` that defines the subgroups.
    min_count : int, default 5
        Subgroups with fewer test rows than this are skipped.

    Notes
    -----
    Reads the notebook globals ``data``, ``X_test``, ``y_test`` and ``y_pred``.
    Output goes to stdout; nothing is returned.
    """
    # Look the subgroup column up once for the test rows (it does not change per value).
    test_values = data.loc[X_test.index, col]
    for val in sorted(test_values.dropna().unique()):
        # Plain boolean array: y_pred is a positional ndarray in X_test row order, so it
        # must not be indexed with a label-indexed pandas Series.
        mask = (test_values == val).to_numpy()
        n_rows = int(mask.sum())
        if n_rows < min_count:
            continue
        print(f"\n=== {col} = {val} (n={n_rows}) ===")
        # Small subgroups often miss some classes; zero_division=0 reports those
        # undefined metrics as 0 explicitly.
        print(classification_report(y_test[mask], y_pred[mask], zero_division=0))
print("=== Subgroup Reports ===")
subgroup_report('RecruitmentSource')
subgroup_report('Department')
subgroup_report('Termd_0')
subgroup_report('Termd_1')

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

# Class labels in the order the fitted pipeline uses for predict_proba columns.
# The same order is used for the AUC and for the confusion-matrix rows and tick labels,
# so they cannot drift apart (labels taken from y_test alone may miss a predicted class).
classes = list(full_pipeline.classes_)

# Headline metrics on the held-out split.
acc = accuracy_score(y_test, y_pred)
y_proba = full_pipeline.predict_proba(X_test)
auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted', labels=classes)

# Computed once and reused for both the printout and the heatmap.
cm = confusion_matrix(y_test, y_pred, labels=classes)

print("Accuracy:", acc)
print("ROC-AUC:", auc)
print("Confusion matrix:\n", cm)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=classes,
    yticklabels=classes
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.transform import factor_cmap, jitter
from bokeh.palettes import Category20
from bokeh.io import output_notebook

output_notebook()

# Interactive scatter of Salary per Department, one jittered point per row of `data`.
source = ColumnDataSource(data)

department_factors = sorted(data["Department"].dropna().astype(str).unique())

p = figure(
    width=600, height=450,
    title="Salary vs. Department",
    x_range=department_factors,
    x_axis_label="Department",
    y_axis_label="Salary",
    tools="pan,wheel_zoom,box_zoom,reset,save,box_select"
)

# Category20 only provides palettes for 3..20 colours. Clamp the lookup so fewer than
# three departments no longer raise a KeyError, then trim to the number needed.
n_departments = len(department_factors)
palette = Category20[min(max(n_departments, 3), 20)][:n_departments]

p_scatter = p.scatter(
    x=jitter("Department", width=0.4, range=p.x_range),
    y="Salary",
    source=source,
    marker="circle",
    size=7,
    alpha=0.7,
    color=factor_cmap("Department", palette=palette, factors=department_factors),
    legend_group="Department"
)

p.legend.location = "top_right"
p.legend.title = "Department"

# (label, column, tooltip spec). Several of these columns are removed or never created
# by the earlier preparation cells; a tooltip for a missing column only renders "???",
# so keep just the entries whose column is really in the data source.
# Field names containing '-' must be wrapped in braces to be parsed as one name.
tooltip_candidates = [
    ("Employee", "EmpID", "@EmpID"),
    ("Department", "Department", "@Department"),
    ("Salary", "Salary", "@Salary{0,0}"),
    ("Race", "RaceDesc", "@RaceDesc"),
    ("Gender", "GenderID_0", "@GenderID_0"),
    ("Age", "AgeGroup_20-30", "@{AgeGroup_20-30}"),
    ("Termd", "Termd_0", "@Termd_0"),
]
available_columns = set(source.column_names)
p.add_tools(HoverTool(
    tooltips=[
        (label, spec)
        for label, column, spec in tooltip_candidates
        if column in available_columns
    ]
))

# Later cells rebind `p` to other figures. Keep a dedicated handle on this EDA scatter;
# the dashboard cell expects an EDA figure under the name `p_eda`.
p_eda = p

In [None]:
# Render the figure currently bound to `p`.
show(p)

In [None]:
from bokeh.models import DataTable, TableColumn
from bokeh.layouts import row
from bokeh.io import output_notebook

output_notebook()

# (field, title) pairs for the table. The "EmpID" column is bound to the EmpID field;
# it used to point at Employee_Name, which does not match its title and is removed by
# the column-drop cell. Fields missing from the shared data source are skipped so the
# table shows no permanently empty columns.
table_spec = [
    ("EmpID", "EmpID"),
    ("RaceDesc", "Race"),
    ("Department", "Department"),
    ("Sex", "Gender"),
    ("Salary", "Salary"),
]
available_columns = set(source.column_names)
table_columns = [
    TableColumn(field=field, title=title)
    for field, title in table_spec
    if field in available_columns
]

# The table shares `source` with the scatter, so a box selection in the plot is
# reflected in the table rows.
eda_table = DataTable(source=source, columns=table_columns,
                      width=500, height=450,)

# Pass the figure 'p' instead of the renderer 'p_scatter' to the row layout
show(row(p, eda_table))

In [None]:
from bokeh.io import output_notebook
output_notebook()

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource

# Target and candidate features for the importance ranking.
target = "PerfScoreID"

features = [
    'Salary',
    'PositionID',
    'SpecialProjectsCount',
    'DaysLateLast30',
    'EmpSatisfaction',
    'Absences',
    'FromDiversityJobFairID',
    'Tenure',
    'Department',
    'RecruitmentSource',
]

# Work on a copy so the label encoding never touches `data`.
df_encoded = data.copy()

# Integer-encode every object-typed feature and keep the fitted encoder per column.
label_encoders = {}
object_features = [name for name in features if df_encoded[name].dtype == "object"]
for name in object_features:
    encoder = LabelEncoder()
    df_encoded[name] = encoder.fit_transform(df_encoded[name].astype(str))
    label_encoders[name] = encoder

# Fit a random-forest regressor on all rows.
# NOTE: this rebinds the notebook-level names X, y and model used by the classifier cell.
X = df_encoded[features]
y = df_encoded[target]

model = RandomForestRegressor(random_state=42, n_estimators=300)
model.fit(X, y)

importances = model.feature_importances_

# One row per feature, most important first; this order also drives the x axis.
importance_table = pd.DataFrame({"Feature": features, "Importance": importances})
plot_df = importance_table.sort_values("Importance", ascending=False)

source = ColumnDataSource(plot_df)

# Interactive bar chart of the importances.
ordered_features = plot_df["Feature"].tolist()
p = figure(
    title="Top Workplace Factors Influencing Performance",
    x_range=ordered_features,
    height=400,
    tools="pan,wheel_zoom,box_zoom,reset",
    toolbar_location="below",
)

p.vbar(x="Feature", top="Importance", source=source, width=0.6, color="steelblue", alpha=0.85)

hover_rows = [
    ("Factor", "@Feature"),
    ("Influence Score", "@Importance{0.0000}"),
    ("Explanation", "@Feature influences employee performance based on Random Forest importance."),
]
hover = HoverTool(tooltips=hover_rows)
p.add_tools(hover)

# Cosmetics: bars start at zero, slanted labels, no vertical grid, centred title.
p.y_range.start = 0
p.xaxis.major_label_orientation = 1.2
p.xgrid.grid_line_color = None
p.title.align = "center"

show(p)

In [None]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, CustomJS
from bokeh.transform import factor_cmap
from bokeh.palettes import Category20 # Import Category20
from bokeh.models import Dropdown
from bokeh.layouts import column

# ----------------------------------
# AGGREGATE DATA
# ----------------------------------
df = data

# Mean PerfScoreID per department, rounded to two decimals.
grouped = df.groupby("Department")["PerfScoreID"].mean().reset_index()
grouped["PerfScoreID"] = grouped["PerfScoreID"].round(2)
grouped["alpha"] = 1.0   # used for interactive opacity

source = ColumnDataSource(grouped)

# Plain list of factors, shared by the x range and the colour mapper.
departments = grouped["Department"].tolist()

# Category20 only provides palettes for 3..20 colours. Clamp the lookup so a department
# count outside that range no longer raises a KeyError, then trim to the number needed.
n_departments = len(departments)
palette = Category20[min(max(n_departments, 3), 20)][:n_departments]

# ----------------------------------
# FIGURE
# ----------------------------------
p = figure(
    x_range=departments,
    title="Average Performance by Department",
    width=900, height=450,
    tools="tap",   # required for click interaction
    toolbar_location=None
)

# ----------------------------------
# BAR GLYPH
# ----------------------------------
bars = p.vbar(
    x="Department",
    top="PerfScoreID",
    source=source,
    width=0.6,
    fill_color=factor_cmap("Department", palette, departments),
    fill_alpha="alpha",
    line_color="black"
)

# ----------------------------------
# HOVER TOOL — only show dept + avg performance
# ----------------------------------
hover = HoverTool(
    tooltips=[
        ("Department", "@Department"),
        ("Avg Performance", "@PerfScoreID")
    ],
    renderers=[bars]
)
p.add_tools(hover)

# ----------------------------------
# CLICK INTERACTION: DIM ALL OTHER BARS
# ----------------------------------
callback = CustomJS(args=dict(source=source), code="""
    const data = source.data;
    const selected = source.selected.indices;

    // No selection → reset all alphas
    if (selected.length === 0) {
        for (let i = 0; i < data['alpha'].length; i++) {
            data['alpha'][i] = 1.0;
        }
    } else {
        const selected_index = selected[0];
        for (let i = 0; i < data['alpha'].length; i++) {
            data['alpha'][i] = (i === selected_index) ? 1.0 : 0.2;
        }
    }

    source.change.emit();
""")

# Single-entry menu whose only action restores the default display.
reset_menu = Dropdown(label="Menu", menu=[("Reset Display", "reset")])

reset_callback = CustomJS(args=dict(source=source), code="""
    const data = source.data;

    // Reset bar opacity
    for (let i = 0; i < data['alpha'].length; i++) {
        data['alpha'][i] = 1.0;
    }

    // Clear selection
    source.selected.indices = [];

    source.change.emit();
""")

reset_menu.js_on_event("menu_item_click", reset_callback)

# A tap changes the `indices` of the existing Selection object; it does not replace
# `source.selected`. Listening for 'selected' on the source therefore never fired,
# so the callback has to be attached to the selection's `indices` property.
source.selected.js_on_change('indices', callback)

p.xaxis.major_label_orientation = 1.0

show(column(reset_menu, p))


In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, FactorRange, HoverTool
import pandas as pd
import numpy as np

# NOTE(review): everything plotted in this cell is *synthetic* data drawn from the
# random generators below, not the HR dataset. The title says so explicitly.
np.random.seed(42)

n = 500
perf_score = np.random.normal(75, 10, n)  # Performance Score
satisfaction = np.random.choice([1, 2, 3, 4, 5], size=n)  # Satisfaction (1-5 scale)
engagement = np.random.choice([1, 2, 3, 4, 5], size=n)  # Engagement (1-5 scale)
absences = np.random.poisson(2, n)  # Absences (Poisson distribution)


df_plot = pd.DataFrame({
    'PerfScoreID': perf_score,
    'Satisfaction': satisfaction,
    'Engagement': engagement,
    'Absences': absences
})

# Prepare data for box plots: one record per (factor, value) pair.
factors = ['Satisfaction', 'Engagement', 'Absences']
# Named `box_stats` rather than `stats`, so the `scipy.stats` module imported at the
# top of the notebook is not overwritten by a list.
box_stats = []

for factor in factors:
    # Get unique values for the current factor and sort them for consistent order
    sorted_values_for_factor = sorted(df_plot[factor].unique())
    for value in sorted_values_for_factor:
        subset = df_plot[df_plot[factor] == value]['PerfScoreID']
        q1, q2, q3 = subset.quantile([0.25, 0.5, 0.75])
        iqr = q3 - q1

        # Calculate whisker bounds
        # Lower whisker: lowest data point within 1.5*IQR of Q1
        lower_whisker_bound = q1 - 1.5 * iqr
        lower_whisker = subset[subset >= lower_whisker_bound].min()

        # Upper whisker: highest data point within 1.5*IQR of Q3
        upper_whisker_bound = q3 + 1.5 * iqr
        upper_whisker = subset[subset <= upper_whisker_bound].max()

        # Handle potential NaNs if subset is empty or all values are outliers
        if pd.isna(lower_whisker):
            lower_whisker = q1
        if pd.isna(upper_whisker):
            upper_whisker = q3

        box_stats.append({
            'factor_name': factor,
            'factor_value': str(value), # Convert to string for FactorRange
            'category': (factor, str(value)), # Combined category for x-axis
            'q1': q1,
            'q2': q2, # median
            'q3': q3,
            'upper': upper_whisker,
            'lower': lower_whisker
        })

stats_df = pd.DataFrame(box_stats)

# Define x_range using the combined categories from stats_df
x_range_list = stats_df['category'].tolist()

source_boxplot = ColumnDataSource(stats_df)

# Create the figure.
# - The title now describes what is drawn (box plots per factor value); the previous
#   text named a different chart and ended in a stray closing quote character.
# - "hover" is not listed in `tools`: the custom HoverTool added below is the only
#   hover tool, otherwise each glyph would show two tooltips.
p = figure(title="Performance Score Distribution by Factor (Synthetic Data)",
           x_range=FactorRange(*x_range_list),
           tools="pan,box_zoom,reset",
           height=400, width=800)

# Draw the boxes
p.vbar(x='category', top='q3', bottom='q1', width=0.7, source=source_boxplot,
       line_color="black", fill_color="lightblue", name="boxes")

# Draw the median lines
p.segment(x0='category', y0='q2', x1='category', y1='q2', line_color="black", line_width=2, source=source_boxplot, name="medians")

# Draw the whiskers
p.segment(x0='category', y0='upper', x1='category', y1='q3', line_color="black", source=source_boxplot, name="upper_whiskers")
p.segment(x0='category', y0='lower', x1='category', y1='q1', line_color="black", source=source_boxplot, name="lower_whiskers")

# Draw the caps for whiskers (zero-height bars, so only their outline is visible)
p.vbar(x='category', top='upper', bottom='upper', width=0.2, line_color="black", source=source_boxplot, name="upper_caps")
p.vbar(x='category', top='lower', bottom='lower', width=0.2, line_color="black", source=source_boxplot, name="lower_caps")


# Adjust plot labels and axis titles
p.xaxis.axis_label = "Factors and Values"
p.yaxis.axis_label = "Performance Score"
p.title.text_font_size = "16pt"
p.xaxis.major_label_orientation = "vertical"

# Add HoverTool for boxplot elements
# Note: HoverTool will pick up properties from the source associated with the glyphs.
hover_boxes = HoverTool(tooltips=[
    ("Factor", "@factor_name"),
    ("Value", "@factor_value"),
    ("Q1", "@q1{0.00}"),
    ("Median", "@q2{0.00}"),
    ("Q3", "@q3{0.00}"),
    ("Upper Whisker", "@upper{0.00}"),
    ("Lower Whisker", "@lower{0.00}")
])
p.add_tools(hover_boxes)

# Show the plot
output_notebook()
show(p)

In [None]:
from bokeh.layouts import layout as bokeh_layout, row
from bokeh.io import show
from bokeh.models import Div # For creating text placeholders

# Create a multi-panel interactive Bokeh dashboard

# EDA panel: the data table, preceded by the EDA figure when one is available.
# `p_eda` is looked up defensively because no cell is guaranteed to define it;
# referencing it directly aborted the whole dashboard with a NameError.
eda_figure = globals().get("p_eda")
eda_children = [eda_table] if eda_figure is None else [eda_figure, eda_table]

eda_panel = bokeh_layout(
    row(*eda_children),
    sizing_mode='scale_width'
)

# Group Stakeholder Insight plots together

stakeholder_panel = bokeh_layout(
    [p], # Using the latest defined 'p' which is the boxplot
    sizing_mode='scale_width'
)

# Create the final dashboard layout
# Using a column layout for main sections, rows for sub-sections

# 'p_action' and 'model_eval_layout' were not defined. Using placeholders for these sections.
dashboard = bokeh_layout(
    Div(text="""<h3>Detailed Feature Importance Placeholder</h3><p>Plot 'p_action' was not defined.</p>"""), # Placeholder for p_action
    eda_panel,            # Enhanced EDA plots
    Div(text="""<h3>Model Evaluation Layout Placeholder</h3><p>Layout 'model_eval_layout' was not defined.</p>"""), # Placeholder for model_eval_layout
    stakeholder_panel,    # Stakeholder-specific plots
    sizing_mode='scale_width'
)

# Show the final dashboard
show(dashboard)

print("Generated and displayed the multi-panel interactive Bokeh dashboard.")

NameError: name 'p_eda' is not defined