# Logistic Regression on cluster_updated_data

## Importing the libraries

In [427]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

## Importing the dataset

In [428]:
# pyspark_df = spark.table("sd_bdc_demo.k_cluster_logistic_regression.2_9_am_workday_cluster_updated_data")
# pyspark_df.display()
# df = pyspark_df.toPandas()

In [429]:
df = pd.read_csv("2_9_am_workday_cluster_updated_data/2_9_am_workday_cluster_updated_data.csv")


In [430]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [431]:
print(df.columns)

Index(['Current_Role', 'Zip_Code', 'Employee_ID', 'Employee_Annual_Salary',
       'Monthly_Medical_contribution', 'Monthly_Dental_Contribution',
       'Monthly_Vision_Contribution', 'Bonus', 'Years_Of_Service',
       'Department', 'Gender', 'Employee_HR_rate', 'Hours_per_week',
       'Years_Since_Last_Promotion', 'age', 'left', 'Cluster_label'],
      dtype='object')


In [432]:
print(df.head())

                Current_Role  Zip_Code  Employee_ID  Employee_Annual_Salary  \
0   Senior Software Engineer     94103           57                  130800   
1   Associate Data Scientist     94111           58                   74250   
2  Associate Product Manager     10009           59                  125850   
3           Business Analyst     60605           60                   84750   
4    Chief Operating Officer      2101           61                  300000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           130                           35   
1                           105                           30   
2                           130                           35   
3                           105                           30   
4                           230                           55   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service    Department  \
0                           19  13865               5.2   Engineeri

In [433]:
print(df.shape)

(100, 17)


## Encoding categorical data

### Encoding the Independent Variable

In [434]:
df_encoded = pd.get_dummies(df, columns=['Current_Role', 'Department', 'Gender'], drop_first=False).astype(int)


In [435]:
print(df_encoded.head())

   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0     94103           57                  130800   
1     94111           58                   74250   
2     10009           59                  125850   
3     60605           60                   84750   
4      2101           61                  300000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           130                           35   
1                           105                           30   
2                           130                           35   
3                           105                           30   
4                           230                           55   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           19  13865                 5                60   
1                           17   4344                 1               117   
2                           19  15920                 1               1

In [436]:
print(df_encoded.shape)

(100, 106)


## Split two Data Frame for Independent Features & Dependent Columns

In [437]:
df_independent = df_encoded.drop(columns=['left'])
X = df_independent
df_dependent = df_encoded[['left']]
y = df_dependent

In [438]:
print(df_independent.head())


   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0     94103           57                  130800   
1     94111           58                   74250   
2     10009           59                  125850   
3     60605           60                   84750   
4      2101           61                  300000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           130                           35   
1                           105                           30   
2                           130                           35   
3                           105                           30   
4                           230                           55   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           19  13865                 5                60   
1                           17   4344                 1               117   
2                           19  15920                 1               1

In [439]:
print(df_independent.shape)

(100, 105)


In [440]:
print(df_dependent.head())

   left
0     0
1     0
2     0
3     0
4     1


In [441]:
print(df_dependent.shape)

(100, 1)


## Splitting the dataset into the Training set and Test set

In [442]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [443]:
print(X_train.head())

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
2      10009           59                  125850   
73     90011          130                  136000   
97     78712          154                   88100   
62     98102          119                  190000   
19      2108           76                  220000   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
2                            130                           35   
73                           130                           35   
97                           105                           30   
62                           155                           40   
19                           180                           45   

    Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
2                            19  15920                 1               147   
73                           19  15300                 6                68   
97                           17   5594                 2

In [444]:
print(X_train.shape)

(80, 105)


In [445]:
print(X_test.head())

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
80      2104          137                  139600   
84     98102          141                  147200   
33      2107           90                  141600   
81     10004          138                  133200   
93     94112          150                  149000   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
80                           130                           35   
84                           130                           35   
33                           130                           35   
81                           130                           35   
93                           130                           35   

    Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
80                           19  16333                 7                82   
84                           19  18621                 9               130   
33                           19  20390                 4

In [446]:
print(X_test.shape)

(20, 105)


In [447]:
print(y_train.head())

    left
2      0
73     1
97     0
62     1
19     1


In [448]:
print(y_train.shape)

(80, 1)


In [449]:
print(y_test.head())

    left
80     0
84     1
33     0
81     0
93     0


In [450]:
print(y_test.shape)

(20, 1)


## Feature Scaling

In [451]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# List of columns to scale
columns_to_scale = [
    'Employee_Annual_Salary', 'Monthly_Medical_contribution',
    'Monthly_Dental_Contribution', 'Monthly_Vision_Contribution', 'Bonus',
    'Years_Of_Service', 'Employee_HR_rate', 'Hours_per_week',
    'Years_Since_Last_Promotion', 'age'
]

# Fit sc on training data and transform both training and test sets
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[columns_to_scale] = sc.fit_transform(X_train_scaled[columns_to_scale])
X_test_scaled[columns_to_scale] = sc.transform(X_test_scaled[columns_to_scale])


In [452]:
print(X_train_scaled)

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
2      10009           59               -0.078349   
73     90011          130                0.108960   
97     78712          154               -0.774991   
62     98102          119                1.105480   
19      2108           76                1.659103   
..       ...          ...                     ...   
75     60604          132                0.027762   
9      10002           66               -0.011914   
72     90008          129                0.108960   
12     94104           69               -0.910628   
37     94110           94               -1.210507   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
2                       0.030762                     0.030762   
73                      0.030762                     0.030762   
97                     -0.789546                    -0.789546   
62                      0.851069                     0.851069   
19                      1.671377      

In [453]:
print(X_test_scaled)

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
80      2104          137                0.175395   
84     98102          141                0.315646   
33      2107           90                0.212303   
81     10004          138                0.057288   
93     94112          150                0.348863   
17     94112           74               -0.691947   
36     94111           93               -0.961377   
82     10004          139                0.027762   
69     90010          126                1.470871   
65     10010          122                0.319336   
92     94103          149                1.659103   
39     78731           96               -1.205894   
56     98101          113               -0.850652   
52     10003          109                0.120955   
51     10005          108               -1.168063   
32      2107           89               -1.127464   
31      2107           88               -1.186517   
44     94112          101                0.297

### Removing `Employee ID` & `Zip Code` in `X_train_scaled` & `X_test_scaled`

In [454]:
print(type(X_train_scaled))

<class 'pandas.core.frame.DataFrame'>


In [455]:
X_train_scaled_new = X_train_scaled.drop(columns=['Employee_ID', 'Zip_Code'])
print(X_train_scaled_new)

    Employee_Annual_Salary  Monthly_Medical_contribution  \
2                -0.078349                      0.030762   
73                0.108960                      0.030762   
97               -0.774991                     -0.789546   
62                1.105480                      0.851069   
19                1.659103                      1.671377   
..                     ...                           ...   
75                0.027762                      0.030762   
9                -0.011914                      0.030762   
72                0.108960                      0.030762   
12               -0.910628                     -0.789546   
37               -1.210507                     -0.789546   

    Monthly_Dental_Contribution  Monthly_Vision_Contribution     Bonus  \
2                      0.030762                     0.030762 -0.132665   
73                     0.030762                     0.030762 -0.165009   
97                    -0.789546                    -0.789

In [456]:
print(X_train_scaled_new['Cluster_label'])

2     4
73    3
97    0
62    1
19    1
     ..
75    3
9     0
72    3
12    0
37    0
Name: Cluster_label, Length: 80, dtype: int64


In [457]:
print(type(X_test_scaled))

<class 'pandas.core.frame.DataFrame'>


In [458]:
X_test_scaled_new = X_test_scaled.drop(columns=['Employee_ID', 'Zip_Code'])
print(X_test_scaled_new)

    Employee_Annual_Salary  Monthly_Medical_contribution  \
80                0.175395                      0.030762   
84                0.315646                      0.030762   
33                0.212303                      0.030762   
81                0.057288                      0.030762   
93                0.348863                      0.030762   
17               -0.691947                     -0.789546   
36               -0.961377                     -0.789546   
82                0.027762                      0.030762   
69                1.470871                      1.671377   
65                0.319336                      0.030762   
92                1.659103                      1.671377   
39               -1.205894                     -0.789546   
56               -0.850652                     -0.789546   
52                0.120955                      0.030762   
51               -1.168063                     -0.789546   
32               -1.127464              

In [459]:
print(X_test_scaled_new['Cluster_label'])

80    3
84    3
33    3
81    3
93    3
17    0
36    4
82    3
69    1
65    3
92    1
39    4
56    4
52    0
51    4
32    4
31    4
44    3
78    3
10    4
Name: Cluster_label, dtype: int64


## Training the Model

In [460]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [461]:
training_timestamp = datetime.now().strftime("%m/%d/%Y %H:%M")
print(training_timestamp)
model.fit(X_train_scaled_new, y_train.values.ravel())

06/19/2025 19:29


## Prediction

In [462]:
y_pred = model.predict(X_test_scaled_new)

In [463]:
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]


In [464]:
y_pred_prob = model.predict_proba(X_test_scaled_new)

In [465]:
print(y_pred_prob)

[[8.99858290e-01 1.00141710e-01]
 [5.74086077e-01 4.25913923e-01]
 [9.79297253e-01 2.07027472e-02]
 [9.84271325e-01 1.57286750e-02]
 [8.44321166e-01 1.55678834e-01]
 [9.98582951e-01 1.41704944e-03]
 [9.99848079e-01 1.51921495e-04]
 [9.98566029e-01 1.43397122e-03]
 [8.90502346e-01 1.09497654e-01]
 [6.66353772e-01 3.33646228e-01]
 [3.47401373e-01 6.52598627e-01]
 [9.99898798e-01 1.01202075e-04]
 [9.98873435e-01 1.12656482e-03]
 [9.87704704e-01 1.22952957e-02]
 [9.99796000e-01 2.04000264e-04]
 [9.98612596e-01 1.38740445e-03]
 [9.99553289e-01 4.46711466e-04]
 [9.72538986e-01 2.74610143e-02]
 [8.04403440e-01 1.95596560e-01]
 [9.99843651e-01 1.56348771e-04]]


### Saving the output as new table

In [466]:
# Get Employee_IDs from X_test
employee_ids = X_test_scaled['Employee_ID'].values
Cluster_label = X_test_scaled['Cluster_label'].values

In [467]:
print("employee_ids shape:", employee_ids.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)
print("Cluster_label:", Cluster_label.shape)


employee_ids shape: (20,)
y_test shape: (20, 1)
y_pred shape: (20,)
Cluster_label: (20,)


In [468]:
print(y_test)

    left
80     0
84     1
33     0
81     0
93     0
17     0
36     0
82     0
69     0
65     1
92     1
39     0
56     0
52     0
51     0
32     0
31     0
44     0
78     0
10     0


In [469]:
# Flatten y_test to 1D
y_test = y_test.values.ravel()

In [470]:
print("employee_ids shape:", employee_ids.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)
print("Cluster_label:", Cluster_label.shape)

employee_ids shape: (20,)
y_test shape: (20,)
y_pred shape: (20,)
Cluster_label: (20,)


In [471]:
print(y_test)

[0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0]


### get top 3 features

In [472]:
top_indices = np.argsort(model.coef_[0])[::-1][:3]


In [473]:
print(top_indices)

[ 8  5 64]


In [474]:
top_features = X_train_scaled_new.columns[top_indices].tolist()

In [475]:
results_df = pd.DataFrame({
    'Employee_ID': employee_ids,
    'Actual': y_test,
    'Predicted': y_pred,
    'Cluster_label': Cluster_label,
    'primary_attrition_feature': top_features[0],
    'secondary_attrition_feature': top_features[1],
    'tertiary_attrition_feature': top_features[2]
})



In [476]:
# Create the folder if it doesn't exist
os.makedirs("3_5_am_workday_log_reg_prediction_data", exist_ok=True)

# Save the Excel file inside the folder
results_df.to_csv("3_5_am_workday_log_reg_prediction_data/3_5_am_workday_log_reg_prediction_data.csv", index=False)


In [477]:
# spark_df = spark.createDataFrame(results_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.3_5_am_workday_log_reg_prediction_data")

## Accuracy

In [478]:
model.score(X_test_scaled_new,y_test)

0.9

In [479]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Confusion Matrix:
 [[17  0]
 [ 2  1]]
Accuracy: 0.9
Precision: 1.0
Recall: 0.3333333333333333
F1 Score: 0.5


### saving the metrics

In [480]:
Accuracy = accuracy_score(y_test, y_pred)
Precision = precision_score(y_test, y_pred)
Recall = recall_score(y_test, y_pred) 
F1_Score = f1_score(y_test, y_pred)

# Create a DataFrame
# metrics_df = pd.DataFrame({
#     "Metric": ["Accuracy", "Precision", "Recall", "F1_Score"],
#     "Value": [Accuracy, Precision, Recall, F1_Score]
# })


In [481]:
# Model Run Id
model_type_abbr = "lr"
project_name = "attrition_prediction"
timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
model_run_id = f"{model_type_abbr}_{project_name}_{timestamp_str}"

# Model Type
model_type = "logistic regression"

# model version 
version_file = "3_6_am_workday_log_reg_metrics_data.csv"
if os.path.exists(version_file):
    versions_df = pd.read_csv(version_file)
    max_version = versions_df[versions_df["model_type"] == model_type]["model_version"].max()
    model_version = int(max_version) + 1
else:
    model_version = 1

# training_timestamp
training_timestamp = training_timestamp

# dataset size
dataset_size = X_test_scaled_new.shape[0]

# feature count
feature_count = X_test_scaled_new.shape[1] 

# ==== Create metrics DataFrame ====
metrics_df = pd.DataFrame([{
    "model_run_id": model_run_id,
    "model_type": model_type,
    "model_version": model_version,
    "training_timestamp": training_timestamp,
    "dataset_size": dataset_size,
    "feature_count": feature_count,
    "classification_accuracy": Accuracy,
    "Precision": Precision,
    "Recall": Recall,
    "F1_Score": F1_Score
}])




In [482]:
# Directory & file path
folder = "3_6_am_workday_log_reg_metrics_data"
os.makedirs(folder, exist_ok=True)

# Full path to the CSV file
version_file = os.path.join(folder, "3_6_am_workday_log_reg_metrics_data.csv")

# Append the full metrics row to the CSV log (create file if it doesn't exist)
metrics_df.to_csv(
    version_file,
    mode='a',
    header=not os.path.exists(version_file),
    index=False
)

In [483]:
# # Create the folder if it doesn't exist
# os.makedirs("3_6_am_workday_log_reg_metrics_data", exist_ok=True)

# # Save the Excel file inside the folder
# metrics_df.to_csv("3_6_am_workday_log_reg_metrics_data/3_6_am_workday_log_reg_metrics_data.csv", index=False)


In [484]:
# spark_df = spark.createDataFrame(metrics_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.3_6_am_workday_log_reg_metrics_data")

## Bias & Weights

In [485]:
print(model.coef_)       # β₁, β₂, ..., βn
print(model.intercept_)  # β₀

[[ 1.78437022e-01 -5.02663291e-02 -5.02663291e-02 -5.02663291e-02
   1.06206193e-01  5.77667502e-01 -2.81764872e-01  0.00000000e+00
   2.24738346e+00 -3.16780821e-01 -2.90877297e-01  2.16504169e-04
   3.28716300e-04 -1.29465455e-03  1.60854274e-01 -2.69956360e-01
  -3.41710230e-03 -4.53338988e-03  2.02794951e-04  0.00000000e+00
  -5.86854858e-03 -9.82823443e-04  5.27565467e-02  0.00000000e+00
   1.72805894e-01  2.54890127e-01 -9.86674336e-02 -6.21984028e-02
  -1.93207809e-02 -2.28699575e-03 -2.55638115e-02  0.00000000e+00
  -1.50232526e-03 -1.14416209e-02  0.00000000e+00  3.25219295e-04
   0.00000000e+00  3.24591532e-04  2.11237688e-04 -5.60320378e-02
   0.00000000e+00 -2.14104351e-02 -1.14218726e-01 -9.19716521e-03
  -1.76160208e-02 -1.13396431e-01  0.00000000e+00  0.00000000e+00
  -2.98832908e-03 -5.37674225e-03  3.39767228e-04  2.40819568e-01
  -4.24836530e-02 -1.14968181e-01  2.34470464e-01 -8.98658430e-03
  -1.04964340e-01  0.00000000e+00 -2.46891926e-03 -3.54178673e-03
  -9.00483

In [486]:
len(model.coef_.ravel())

103

In [487]:
max(model.coef_.ravel())

np.float64(2.2473834622100317)

In [488]:
feature_weights = pd.Series(model.coef_[0], index=X_train_scaled_new.columns)
print(feature_weights)


Employee_Annual_Salary          0.178437
Monthly_Medical_contribution   -0.050266
Monthly_Dental_Contribution    -0.050266
Monthly_Vision_Contribution    -0.050266
Bonus                           0.106206
                                  ...   
Department_Product             -0.457104
Department_Sales                0.132063
Gender_Female                   0.277160
Gender_Male                    -0.186901
Gender_Non-Binary              -0.080462
Length: 103, dtype: float64


### Saving All Feature & Weights

In [489]:
weights_df = pd.DataFrame({
    'Feature': X_train_scaled_new.columns,
    'Weight': model.coef_[0] 
})

print(weights_df)

                          Feature    Weight
0          Employee_Annual_Salary  0.178437
1    Monthly_Medical_contribution -0.050266
2     Monthly_Dental_Contribution -0.050266
3     Monthly_Vision_Contribution -0.050266
4                           Bonus  0.106206
..                            ...       ...
98             Department_Product -0.457104
99               Department_Sales  0.132063
100                 Gender_Female  0.277160
101                   Gender_Male -0.186901
102             Gender_Non-Binary -0.080462

[103 rows x 2 columns]


In [490]:
# Create the folder if it doesn't exist
os.makedirs("3_1_am_workday_all_feature_vs_weights_data", exist_ok=True)

# Save the Excel file inside the folder
weights_df.to_csv("3_1_am_workday_all_feature_vs_weights_data/3_1_am_workday_all_feature_vs_weights_data.csv", index=False)

In [491]:
# spark_df = spark.createDataFrame(weights_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.3_1_am_workday_all_feature_vs_weights_data")

### Saving Current_Role & Weights

In [492]:
# pyspark_df = spark.table("sd_bdc_demo.k_cluster_logistic_regression.3_1_am_workday_all_feature_vs_weights_data")
# pyspark_df.display()
# all_feature_df = pyspark_df.toPandas()

In [493]:
all_feature_df = pd.read_csv("3_1_am_workday_all_feature_vs_weights_data/3_1_am_workday_all_feature_vs_weights_data.csv")

In [494]:
print(all_feature_df.head())

                        Feature    Weight
0        Employee_Annual_Salary  0.178437
1  Monthly_Medical_contribution -0.050266
2   Monthly_Dental_Contribution -0.050266
3   Monthly_Vision_Contribution -0.050266
4                         Bonus  0.106206


In [495]:
all_feature_df_Current_Role = all_feature_df[all_feature_df['Feature'].str.startswith('Current_Role_')]
print(all_feature_df_Current_Role)


                                   Feature    Weight
11   Current_Role_Associate Data Scientist  0.000217
12  Current_Role_Associate Product Manager  0.000329
13           Current_Role_Business Analyst -0.001295
14    Current_Role_Chief Operating Officer  0.160854
15   Current_Role_Chief Technology Officer -0.269956
..                                     ...       ...
83             Current_Role_Technical Lead  0.000000
84           Current_Role_Technical Writer  0.000552
85                Current_Role_UX Designer -0.011010
86              Current_Role_UX Researcher -0.002737
87           Current_Role_VP of Technology -0.117714

[77 rows x 2 columns]


In [496]:
Current_Role_df = all_feature_df_Current_Role.copy()

In [497]:
Current_Role_df['Feature'] = Current_Role_df['Feature'].str.removeprefix('Current_Role_')


In [498]:
print(Current_Role_df)

                      Feature    Weight
11   Associate Data Scientist  0.000217
12  Associate Product Manager  0.000329
13           Business Analyst -0.001295
14    Chief Operating Officer  0.160854
15   Chief Technology Officer -0.269956
..                        ...       ...
83             Technical Lead  0.000000
84           Technical Writer  0.000552
85                UX Designer -0.011010
86              UX Researcher -0.002737
87           VP of Technology -0.117714

[77 rows x 2 columns]


In [499]:
# Create the folder if it doesn't exist
os.makedirs("3_2_am_workday_current_role_vs_weights_data", exist_ok=True)

# Save the Excel file inside the folder
Current_Role_df.to_csv("3_2_am_workday_current_role_vs_weights_data/3_2_am_workday_current_role_vs_weights_data.csv", index=False)

In [500]:
# spark_df = spark.createDataFrame(Current_Role_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.3_2_am_workday_current_role_vs_weights_data")

### Saving Department & Weights

In [501]:
all_feature_df_Department = all_feature_df[all_feature_df['Feature'].str.startswith('Department_')]
print(all_feature_df_Department)


                        Feature    Weight
88  Department_Customer Support  0.000203
89      Department_Data Science -0.062342
90            Department_Design -0.134357
91       Department_Engineering  0.107458
92         Department_Executive -0.226816
93           Department_Finance  0.282903
94   Department_Human Resources  0.134298
95                Department_IT  0.000155
96         Department_Marketing -0.016517
97        Department_Operations  0.249853
98           Department_Product -0.457104
99             Department_Sales  0.132063


In [502]:
Department_df = all_feature_df_Department.copy()

In [503]:
Department_df['Feature'] = Department_df['Feature'].str.removeprefix('Department_')

In [504]:
# Create the folder if it doesn't exist
os.makedirs("3_3_am_workday_department_vs_weights_data", exist_ok=True)

# Save the Excel file inside the folder
Department_df.to_csv("3_3_am_workday_department_vs_weights_data/3_3_am_workday_department_vs_weights_data.csv", index=False)

In [505]:
# spark_df = spark.createDataFrame(Department_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.3_3_am_workday_department_vs_weights_data")

### Saving Gender & Weights

In [506]:
all_feature_df_Gender = all_feature_df[all_feature_df['Feature'].str.startswith('Gender_')]
print(all_feature_df_Gender)


               Feature    Weight
100      Gender_Female  0.277160
101        Gender_Male -0.186901
102  Gender_Non-Binary -0.080462


In [507]:
Gender_df = all_feature_df_Gender.copy()

In [508]:
Gender_df['Feature'] = Gender_df['Feature'].str.removeprefix('Gender_')

In [509]:
# Create the folder if it doesn't exist
os.makedirs("3_4_am_workday_gender_vs_weights_data", exist_ok=True)

# Save the Excel file inside the folder
Gender_df.to_csv("3_4_am_workday_gender_vs_weights_data/3_4_am_workday_gender_vs_weights_data.csv", index=False)

In [510]:
# spark_df = spark.createDataFrame(Gender_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.3_4_am_workday_gender_vs_weights_data")