In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import altair as alt

# Load the dataset
data = pd.read_csv("HR_comma_sep-emp-id.csv")

# Convert 'salary' column to numerical representation
data['salary'] = data['salary'].map({'low': 1, 'medium': 2, 'high': 3})

# One-hot encode the 'sales' column
data = pd.get_dummies(data, columns=['sales'])

# Separate categorical and numerical variables
categorical_vars = ['salary']
numerical_vars = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours',
                  'time_spend_company', 'Work_accident', 'promotion_last_5years']

# Apply one-hot encoding to categorical variables
categorical_df = pd.get_dummies(data[categorical_vars])

# Combine categorical and numerical variables
X = pd.concat([data[numerical_vars], categorical_df], axis=1)
y = data['left']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

# Initialize and train the Random Forest model
model = RandomForestClassifier(random_state=123)
model.fit(X_train, y_train)

# Predict the probability of turnover in the test data
turnover_probability = model.predict_proba(X_test)[:, 1]

# Create a DataFrame with employee IDs from the TEST set and their turnover probabilities
employee_data = pd.DataFrame({'employee_id': X_test.index,
                             'turnover_probability': turnover_probability})

# Categorize employees into risk zones
employee_data['risk_zone'] = pd.cut(employee_data['turnover_probability'],
                                    bins=[0, 0.2, 0.6, 0.9, 1],
                                    labels=['Safe Zone', 'Low-Risk Zone', 'Medium-Risk Zone', 'High-Risk Zone'])

# Print the DataFrame
print(employee_data.head().to_markdown(index=False, numalign="left", stralign="left"))
print(employee_data.info())

# Create a bar chart to visualize the distribution of employees across different risk zones
risk_zone_chart = alt.Chart(employee_data).mark_bar().encode(
    x=alt.X('risk_zone', title='Risk Zone'),
    y=alt.Y('count()', title='Number of Employees'),
    tooltip=[alt.Tooltip('risk_zone', title='Risk Zone'), alt.Tooltip('count()', title='Number of Employees')]
).properties(
    title='Distribution of Employees Across Risk Zones'
).interactive()

# Save the chart
risk_zone_chart.save('employee_risk_zone_distribution.json')
# Display the chart using the `show()` method.
display(risk_zone_chart)

| employee_id   | turnover_probability   | risk_zone     |
|:--------------|:-----------------------|:--------------|
| 10627         | 0                      | nan           |
| 2703          | 0.37                   | Low-Risk Zone |
| 6059          | 0.15                   | Safe Zone     |
| 3258          | 0                      | nan           |
| 4565          | 0                      | nan           |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   employee_id           3000 non-null   int64   
 1   turnover_probability  3000 non-null   float64 
 2   risk_zone             1706 non-null   category
dtypes: category(1), float64(1), int64(1)
memory usage: 50.1 KB
None


In [13]:
# Add a new column for high-risk employee IDs
employee_data['high_risk_emp_id'] = employee_data.apply(
    lambda row: row['employee_id'] if row['risk_zone'] == 'High-Risk Zone' else None, axis=1
)

# Get the high-risk employee IDs
high_risk_emp_ids = employee_data.loc[employee_data['risk_zone'] == 'High-Risk Zone', 'high_risk_emp_id'].dropna().tolist()

# Create a DataFrame for better readability
high_risk_df = pd.DataFrame({'High-Risk Employee IDs': high_risk_emp_ids})

# Print the DataFrame in a tabular format
print(high_risk_df.to_markdown(index=False, numalign="left", stralign="left"))

| High-Risk Employee IDs   |
|:-------------------------|
| 14892                    |
| 1246                     |
| 14278                    |
| 12238                    |
| 14859                    |
| 12500                    |
| 14703                    |
| 233                      |
| 756                      |
| 14260                    |
| 12268                    |
| 12087                    |
| 172                      |
| 225                      |
| 14601                    |
| 14450                    |
| 12575                    |
| 251                      |
| 14344                    |
| 150                      |
| 14960                    |
| 12521                    |
| 12221                    |
| 14955                    |
| 12389                    |
| 12444                    |
| 12157                    |
| 12726                    |
| 1636                     |
| 12482                    |
| 357                      |
| 12169                    |
| 1268        