In [21]:
import pandas as pd


# Load the semicolon-delimited absence dataset into the working frame and
# display it (pandas truncates the rendering to the first and last rows).
df = pd.read_csv(filepath_or_buffer="absence_dataset.csv", delimiter=";")
df

Unnamed: 0,Worker_ID,Birth_Year,Gender,Start_Date_Employment,Job_Type,Year,Days_Sickness_Per_Year,Days_Holiday_Per_Year,Frequency_Sick(Days),Frequency_Holiday(Days),Job_Contract,Shift_Work
0,W0001,1986,Female,2015-08-21,Produktion,2015,24,31,5,1,Full-time,No
1,W0001,1986,Female,2015-08-21,Produktion,2016,7,25,4,1,Full-time,No
2,W0001,1986,Female,2015-08-21,Produktion,2017,13,20,3,1,Full-time,No
3,W0001,1986,Female,2015-08-21,Produktion,2018,18,21,1,3,Full-time,No
4,W0001,1986,Female,2015-08-21,Produktion,2019,23,27,2,3,Full-time,No
...,...,...,...,...,...,...,...,...,...,...,...,...
19171,W1601,1968,Female,2015-09-02,Technik,2025,17,29,0,2,Full-time,No
19172,W1602,1996,Female,2022-03-13,Produktion,2022,12,22,1,3,Full-time,No
19173,W1602,1996,Female,2022-03-13,Produktion,2023,13,33,5,1,Full-time,No
19174,W1602,1996,Female,2022-03-13,Produktion,2024,14,28,0,1,Full-time,No


In [22]:
# Print a structural summary of the frame: index range, per-column
# non-null counts and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19176 entries, 0 to 19175
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Worker_ID                19176 non-null  object
 1   Birth_Year               19176 non-null  int64 
 2   Gender                   19176 non-null  object
 3   Start_Date_Employment    19176 non-null  object
 4   Job_Type                 19176 non-null  object
 5   Year                     19176 non-null  int64 
 6   Days_Sickness_Per_Year   19176 non-null  int64 
 7   Days_Holiday_Per_Year    19176 non-null  int64 
 8   Frequency_Sick(Days)     19176 non-null  int64 
 9   Frequency_Holiday(Days)  19176 non-null  int64 
 10  Job_Contract             19176 non-null  object
 11  Shift_Work               19176 non-null  object
dtypes: int64(6), object(6)
memory usage: 1.8+ MB


In [23]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# A worker-year counts as a positive ("high absence") example when its sick
# days reach this threshold.
SICK_DAYS_RISK_THRESHOLD = 25

# --- Categorical encodings -------------------------------------------------
# Job_Type is expanded into one indicator column per category.
one_hot = OneHotEncoder(sparse_output=False)
one_hot.set_output(transform="pandas")
job_type_encoded = one_hot.fit_transform(df[["Job_Type"]])

# Gender / Shift_Work / Job_Contract are integer-coded. A single encoder is
# fitted over all three columns (each column is still coded independently)
# and kept in `ordinal`, so the category -> code mapping stays available via
# `ordinal.categories_`. Asking for pandas output yields properly named 1-D
# columns instead of assigning raw (n, 1) arrays to a single column.
ordinal_cols = ["Gender", "Shift_Work", "Job_Contract"]
ordinal = OrdinalEncoder()
ordinal.set_output(transform="pandas")
ordinal_encoded = ordinal.fit_transform(df[ordinal_cols]).add_suffix("_Transformed")

# Column order: original columns, Job_Type indicators, then the three
# *_Transformed columns.
transformed_df = pd.concat(
    [
        df.reset_index(drop=True),
        job_type_encoded.reset_index(drop=True),
        ordinal_encoded.reset_index(drop=True),
    ],
    axis=1,
)

# --- Seniority -------------------------------------------------------------
# Unparseable dates become NaT (errors="coerce"), which in turn makes
# seniority_years missing for that row rather than raising.
transformed_df["Start_Date_Employment"] = pd.to_datetime(df["Start_Date_Employment"], errors="coerce")
# NOTE(review): `today` is not used by any visible cell -- seniority is
# measured against the observation "Year", not the current date. Kept only in
# case a cell outside this view reads it; otherwise it can be deleted.
today = pd.Timestamp.today()
transformed_df["seniority_years"] = df["Year"] - transformed_df["Start_Date_Employment"].dt.year

# --- Target ----------------------------------------------------------------
transformed_df["y"] = (df["Days_Sickness_Per_Year"] >= SICK_DAYS_RISK_THRESHOLD).astype(int)
# Share of positive worker-years (class balance).
print(transformed_df["y"].mean())

transformed_df

0.06711514392991239


Unnamed: 0,Worker_ID,Birth_Year,Gender,Start_Date_Employment,Job_Type,Year,Days_Sickness_Per_Year,Days_Holiday_Per_Year,Frequency_Sick(Days),Frequency_Holiday(Days),...,Job_Type_Pflege,Job_Type_Produktion,Job_Type_Technik,Job_Type_Vertrieb,Job_Type_Verwaltung,Gender_Transformed,Shift_Work_Transformed,Job_Contract_Transformed,seniority_years,y
0,W0001,1986,Female,2015-08-21,Produktion,2015,24,31,5,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,W0001,1986,Female,2015-08-21,Produktion,2016,7,25,4,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
2,W0001,1986,Female,2015-08-21,Produktion,2017,13,20,3,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0
3,W0001,1986,Female,2015-08-21,Produktion,2018,18,21,1,3,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0
4,W0001,1986,Female,2015-08-21,Produktion,2019,23,27,2,3,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19171,W1601,1968,Female,2015-09-02,Technik,2025,17,29,0,2,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,10,0
19172,W1602,1996,Female,2022-03-13,Produktion,2022,12,22,1,3,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
19173,W1602,1996,Female,2022-03-13,Produktion,2023,13,33,5,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
19174,W1602,1996,Female,2022-03-13,Produktion,2024,14,28,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0


In [24]:
# Sort chronologically within each worker so that a positional shift inside a
# worker's group looks at that worker's earlier rows.
transformed_df = transformed_df.sort_values(["Worker_ID", "Year"]).reset_index(drop=True)

lag_cols = ["Days_Sickness_Per_Year", "Days_Holiday_Per_Year", "Frequency_Sick(Days)", "Frequency_Holiday(Days)"]

# For each look-back distance add "<column>_lag<k>" holding the value found k
# rows earlier for the same worker; the first k rows of every worker get NaN.
# NOTE(review): the shift is by row, not by calendar year -- this presumably
# relies on every worker having one row per consecutive year; confirm.
for n_back in (1, 2, 3):
    lagged_names = [f"{col}_lag{n_back}" for col in lag_cols]
    transformed_df[lagged_names] = transformed_df.groupby("Worker_ID")[lag_cols].shift(n_back)

transformed_df

Unnamed: 0,Worker_ID,Birth_Year,Gender,Start_Date_Employment,Job_Type,Year,Days_Sickness_Per_Year,Days_Holiday_Per_Year,Frequency_Sick(Days),Frequency_Holiday(Days),...,Frequency_Sick(Days)_lag1,Frequency_Holiday(Days)_lag1,Days_Sickness_Per_Year_lag2,Days_Holiday_Per_Year_lag2,Frequency_Sick(Days)_lag2,Frequency_Holiday(Days)_lag2,Days_Sickness_Per_Year_lag3,Days_Holiday_Per_Year_lag3,Frequency_Sick(Days)_lag3,Frequency_Holiday(Days)_lag3
0,W0001,1986,Female,2015-08-21,Produktion,2015,24,31,5,1,...,,,,,,,,,,
1,W0001,1986,Female,2015-08-21,Produktion,2016,7,25,4,1,...,5.0,1.0,,,,,,,,
2,W0001,1986,Female,2015-08-21,Produktion,2017,13,20,3,1,...,4.0,1.0,24.0,31.0,5.0,1.0,,,,
3,W0001,1986,Female,2015-08-21,Produktion,2018,18,21,1,3,...,3.0,1.0,7.0,25.0,4.0,1.0,24.0,31.0,5.0,1.0
4,W0001,1986,Female,2015-08-21,Produktion,2019,23,27,2,3,...,1.0,3.0,13.0,20.0,3.0,1.0,7.0,25.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19171,W1601,1968,Female,2015-09-02,Technik,2025,17,29,0,2,...,1.0,3.0,16.0,26.0,3.0,1.0,7.0,26.0,5.0,1.0
19172,W1602,1996,Female,2022-03-13,Produktion,2022,12,22,1,3,...,,,,,,,,,,
19173,W1602,1996,Female,2022-03-13,Produktion,2023,13,33,5,1,...,1.0,3.0,,,,,,,,
19174,W1602,1996,Female,2022-03-13,Produktion,2024,14,28,0,1,...,5.0,1.0,12.0,22.0,1.0,3.0,,,,


In [25]:
# Columns that are removed before modelling: the worker identifier, the raw
# categorical/date columns (their encoded or derived counterparts are kept),
# and the current-year absence measures (the target "y" is computed from
# Days_Sickness_Per_Year; only the *_lag columns remain as history).
non_feature_cols = [
    "Worker_ID",
    "Gender",
    "Start_Date_Employment",
    "Job_Type",
    "Job_Contract",
    "Shift_Work",
    "Days_Sickness_Per_Year",
    "Days_Holiday_Per_Year",
    "Frequency_Sick(Days)",
    "Frequency_Holiday(Days)",
]

# drop() returns a new frame, so transformed_df itself is left unchanged.
# The target column "y" is still part of feature_df at this point.
feature_df = transformed_df.drop(columns=non_feature_cols)

feature_df

Unnamed: 0,Birth_Year,Year,Job_Type_IT,Job_Type_Logistik,Job_Type_Pflege,Job_Type_Produktion,Job_Type_Technik,Job_Type_Vertrieb,Job_Type_Verwaltung,Gender_Transformed,...,Frequency_Sick(Days)_lag1,Frequency_Holiday(Days)_lag1,Days_Sickness_Per_Year_lag2,Days_Holiday_Per_Year_lag2,Frequency_Sick(Days)_lag2,Frequency_Holiday(Days)_lag2,Days_Sickness_Per_Year_lag3,Days_Holiday_Per_Year_lag3,Frequency_Sick(Days)_lag3,Frequency_Holiday(Days)_lag3
0,1986,2015,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1986,2016,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,5.0,1.0,,,,,,,,
2,1986,2017,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,4.0,1.0,24.0,31.0,5.0,1.0,,,,
3,1986,2018,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3.0,1.0,7.0,25.0,4.0,1.0,24.0,31.0,5.0,1.0
4,1986,2019,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,3.0,13.0,20.0,3.0,1.0,7.0,25.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19171,1968,2025,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,3.0,16.0,26.0,3.0,1.0,7.0,26.0,5.0,1.0
19172,1996,2022,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
19173,1996,2023,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,3.0,,,,,,,,
19174,1996,2024,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,5.0,1.0,12.0,22.0,1.0,3.0,,,,


In [26]:
# Temporal hold-out: rows from years before 2025 are used for fitting, rows
# from 2025 are held back for evaluation.
year_col = feature_df["Year"]
train = feature_df.loc[year_col < 2025].copy()
test = feature_df.loc[year_col == 2025].copy()

# Separate the target "y" from the predictors in each partition.
X, y = train.drop(columns="y"), train["y"]
X_test, y_test = test.drop(columns="y"), test["y"]

In [27]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

# class_weight="balanced" compensates for the rare positive class; the seed is
# fixed so that a Restart & Run All reproduces the same model.
cls = LGBMClassifier(class_weight="balanced", random_state=42)

cls.fit(X, y)

# y_pred holds one probability column per entry of cls.classes_.
y_pred = cls.predict_proba(X_test)
# argmax yields a column position; translate it back to the actual class
# label instead of assuming positions and labels coincide.
y_pred_classes = cls.classes_[y_pred.argmax(axis=1)]

print(confusion_matrix(y_test, y_pred_classes))
print(classification_report(y_test, y_pred_classes))

[LightGBM] [Info] Number of positive: 1170, number of negative: 16404
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000537 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 369
[LightGBM] [Info] Number of data points in the train set: 17574, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[[1301  184]
 [  82   35]]
              precision    recall  f1-score   support

           0       0.94      0.88      0.91      1485
           1       0.16      0.30      0.21       117

    accuracy                           0.83      1602
   macro avg       0.55      0.59      0.56      1602
weighted avg       0.88      0.83      0.86      1602



In [28]:
# Attach the predicted class to the full, human-readable 2025 rows.
# Rows are selected through test.index (feature_df shares its index with
# transformed_df), so they are exactly the rows the predictions were made for
# and in the same order. feature_df is deliberately NOT reassigned here:
# overwriting it with the un-dropped frame would make the split/fit cells use
# raw and current-year columns if they were re-run afterwards.
predictions_df = transformed_df.loc[test.index].copy()
assert len(predictions_df) == len(y_pred_classes), "prediction rows and predicted classes are out of sync"
predictions_df["risk"] = y_pred_classes
predictions_df.to_csv("./prediction.csv")