In [6]:
# IBM_HR_Attrition_Rate_Analytics
# EDA Analysis

pip install feature-engine

import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import lightgbm as lgb

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

from feature_engine.encoding import DecisionTreeEncoder, CountFrequencyEncoder, WoEEncoder, RareLabelEncoder

# To set display options
pd.options.display.max_columns = 300
pd.options.display.max_rows = 300

warnings.simplefilter(action='ignore', category=FutureWarning)

# Data Loading 
df = pd.read_csv("../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()
# Data View
df.info()
df.dtypes
df.isna().sum()
df.duplicated().sum()
df.shape
numeric_columns = list(df.select_dtypes(include=np.number).columns)
categorical_columns = list(df.select_dtypes(include="object").columns)
df[numeric_columns].describe()
df[categorical_columns].describe()

#Univariate Plot

fig, ax = plt.subplots(3,3, figsize = (20,10))

ax = np.ravel(ax)
for i in range(len(categorical_columns)):
    sns.countplot(data = df, x = categorical_columns[i], ax = ax[i], palette="rocket", order = df[categorical_columns[i]].value_counts().index)
    ax[i].set_xticklabels(labels = df[categorical_columns[i]].unique(), rotation=45, ha='right')
ax = np.reshape(ax, (3, 3))
plt.tight_layout()

"""Over18 only have one Value
 Attrition is heavily unbalanced
 Most of the employees travel rarely
 Most of the employees work overtimes
 There are more females than males"""

def make_distplot(df, col, ax):
    sns.distplot(df[col], ax = ax)
    ax.axvline(df[col].mean(), linestyle = '--', color = "red")
    ax.axvline(df[col].median(), linestyle = '--', color = "green")

fig, ax = plt.subplots(6,5, figsize = (20,15))

ax = np.ravel(ax)
for i in range(len(numeric_columns)):
    make_distplot(df, numeric_columns[i], ax[i])
for i in range(len(numeric_columns), 6*5):
    ax[i].axis("off")
ax = np.reshape(ax, (6, 5))
plt.tight_layout()

"""There are employees which work for over 10 years without a promotion which is quite strange
 The performance rating of the employees isn't the best with ~ 3.0 as a median """

#Bivariate plots

corr = df[numeric_columns].corr()

mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots(figsize=(10,10))
cmap = sns.color_palette("icefire", as_cmap=True)

sns.heatmap(corr, mask = mask, cmap = cmap, annot=False, fmt= '.2f', vmin=-1, vmax=1, square = True, linewidth=2, cbar_kws={"shrink": 0.7}, ax=ax)

# correlations among the numerical columns
"""High correlation between Age and JobLevel, MonthlyIncome and TotalWorkingYears
High correlation between JobLevel and MonthlyIncome
High correlation between TotalWorkingYears and YearsSinceLastPromotion -> The longer you are working, the higher is the probability that you reached your "limit" in terms of promotion"""

fig, ax = plt.subplots(3,3, figsize = (20,10))

ax = np.ravel(ax)
for i in range(len(categorical_columns)):
    sns.countplot(data = df, x = categorical_columns[i], hue="Attrition", ax = ax[i], palette="rocket", order = df[categorical_columns[i]].value_counts().index)
    ax[i].set_xticklabels(labels = df[categorical_columns[i]].unique(), rotation=45, ha='right')
ax = np.reshape(ax, (3, 3))
plt.tight_layout()

"""Nearly half of the Sales Representatives went from the company away
Most of the people were traveling rarely and Sales is the department with the highest attrition (but also with the most employees overall)"""

df.groupby("Gender")["YearsSinceLastPromotion"].mean()

df.groupby("Gender")["PerformanceRating"].mean()

Females are waiting a bit longer for a promotion than men, although having a better performance rating

# Data Preprocessing

# Drop unnecessary columns

df = df.drop(["Over18", "EmployeeCount", "StandardHours"], axis=1)
X = df.loc[:, df.columns != "Attrition"].copy()
y = df.loc[:, "Attrition"].copy()

# Label Encoding for Target Attrition

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = pd.DataFrame(y, index=df.index, columns=["Attrition"])
categorical_cols_features = list(X.select_dtypes(include="object").columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True)

# Rare Label Encoding

# Grouping Labels which have a low amount of observations. They will go into a new category named "Rare" to prevent overfitting

def rare_encoding(X_train, X_test):
    rare_encoder = RareLabelEncoder(
                    tol = 0.05, # Threshold: Labels with a frequency < tol will be grouped
                    n_categories = 4,  # minimum categories required for encoding
                    variables = categorical_cols_features)
    
    rare_encoder.fit(X_train)
    X_train_rare_encoded = rare_encoder.transform(X_train)
    X_test_rare_encoded = rare_encoder.transform(X_test)
    
    return X_train_rare_encoded, X_test_rare_encoded

X_train_rare, X_test_rare = rare_encoding(X_train, X_test)

X_train["EducationField"].value_counts(normalize=True)

X_train[X_train["EducationField"] == "Human Resources"]["EducationField"].head()

X_train_rare[(X_train_rare["EducationField"] == 'Rare')]["EducationField"]

# Now that we have done Rare Encoding Need to look at the other techniques other than Ordinal Encoding or One Hot Encoding

"""First Need to write our function for the three different techniques:
Frequency Encoding
Decision Tree Encoding
Weight of Evidence (WoE) Encoding"""

def encoding(X_train, X_test, y_train, method=None):
    if method:
        if method == "frequency":
            freq_encoder = CountFrequencyEncoder(encoding_method='frequency',variables=categorical_cols_features)
            freq_encoder.fit(X_train)
            X_train_encoded = freq_encoder.transform(X_train)
            X_test_encoded = freq_encoder.transform(X_test)
            
        elif method == "decisiontree":
            dt_encoder = DecisionTreeEncoder(
            variables=categorical_cols_features,
            encoding_method='arbitrary', 
            cv=3,
            scoring='f1',
            param_grid={'max_depth': [i for i in range (3, 7)]}, # Grid search parameters
            regression=False) 
            
            dt_encoder.fit(X_train, y_train)
            X_train_encoded = dt_encoder.transform(X_train)
            X_test_encoded = dt_encoder.transform(X_test)
            
        elif method == "woe":
            woe_encoder = WoEEncoder(variables=categorical_cols_features)
            woe_encoder.fit(X_train, y_train.iloc[:, 0]) 
            X_train_encoded = woe_encoder.transform(X_train)
            X_test_encoded = woe_encoder.transform(X_test)
            
        return X_train_encoded, X_test_encoded
    
    # Frequency Encoding
    
    """Categories will be replaced by the percentage of observations per category. \
        E.g. Category Life Sciences have a proportion of 0.41 so Life Sciences will be replaced by 0.41"""

X_train_freq_encoded, X_test_freq_encoded = encoding(X_train, X_test, y_train, method="frequency")

X_train_freq_encoded[categorical_cols_features].head()

# Decision Tree Encoding
"""The Decision Tree Encoder encodes categorical variables with predictions of a decision tree model. The encoder fits 
a Decision Tree with a single feature and the target,and then replaces the original categories by the predictions"""

X_train_dt_encoded, X_dt_freq_encoded = encoding(X_train, X_test, y_train, method="decisiontree")
X_train_dt_encoded[categorical_cols_features].head()

# WoE Encoding

"""WoE is calculated by taking the natural logarithm of division of % of non-events and % of events

WOE = In(% of non-events ➗ % of events)"""

y_train.iloc[:, 0]

X_train_woe_encoded, X_test_woe_encoded = encoding(X_train, X_test, y_train, method="woe")

X_train_woe_encoded.head()

#Evaluate the three different techniques

scoring = ["roc_auc", "f1", "balanced_accuracy"]

# With Frequency Encoding

model_1 = LogisticRegression(random_state = 42, solver='liblinear')

scores_1 = cross_validate(model_1, X_train_freq_encoded, y_train.values.ravel(), scoring = scoring, cv=3)

scores_1["test_f1"].mean()

# With Decision Tree Encoding

model_2 = LogisticRegression(random_state = 42, solver='liblinear')

scores_2 = cross_validate(model_2, X_train_dt_encoded, y_train.values.ravel(), scoring = scoring, cv=3)

scores_2["test_f1"].mean()

# With WoE Encoding

model_3 = LogisticRegression(random_state = 42, solver='liblinear')

scores_3 = cross_validate(model_3, X_train_woe_encoded, y_train.values.ravel(), scoring = scoring, cv=3)

scores_3["test_f1"].mean()





SyntaxError: invalid syntax (3509433225.py, line 4)