# Logistic Regression on cluster_3_updated_data


## Importing the libraries

In [167]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [168]:
# pyspark_df = spark.table("sd_bdc_demo.k_cluster_logistic_regression.2_7_am_workday_cluster_3_updated_data")
# pyspark_df.display()
# df = pyspark_df.toPandas()

In [169]:
df = pd.read_csv("2_7_am_workday_cluster_3_updated_data/2_7_am_workday_cluster_3_updated_data.csv")


In [170]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [171]:
print(df.columns)

Index(['Current_Role', 'Zip_Code', 'Employee_ID', 'Employee_Annual_Salary',
       'Monthly_Medical_contribution', 'Monthly_Dental_Contribution',
       'Monthly_Vision_Contribution', 'Bonus', 'Years_Of_Service',
       'Department', 'Gender', 'Employee_HR_rate', 'Hours_per_week',
       'Years_Since_Last_Promotion', 'age', 'left'],
      dtype='object')


In [172]:
print(df.head())

           Current_Role  Zip_Code  Employee_ID  Employee_Annual_Salary  \
0            HR Manager      2107           90                  141600   
1            HR Manager      2107           91                  141600   
2    Lead Data Engineer     94108          100                  146600   
3   Lead Data Scientist     94112          101                  146200   
4  Lead DevOps Engineer     98106          102                  146600   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           130                           35   
1                           130                           35   
2                           130                           35   
3                           130                           35   
4                           130                           35   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service       Department  \
0                           19  20390               4.8  Human Resources   
1                 

In [173]:
print(df.shape)

(33, 16)


## Encoding categorical data

### Encoding the Independent Variable

In [174]:
df_encoded = pd.get_dummies(df, columns=['Current_Role', 'Department', 'Gender'], drop_first=False).astype(int)


In [175]:
print(df_encoded.head())

   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0      2107           90                  141600   
1      2107           91                  141600   
2     94108          100                  146600   
3     94112          101                  146200   
4     98106          102                  146600   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           130                           35   
1                           130                           35   
2                           130                           35   
3                           130                           35   
4                           130                           35   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           19  20390                 4               115   
1                           19  20390                 4               155   
2                           19  17519                 7               1

In [176]:
print(df_encoded.shape)

(33, 52)


## Split two Data Frame for Independent Features & Dependent Columns

In [177]:
df_independent = df_encoded.drop(columns=['left'])
X = df_independent
df_dependent = df_encoded[['left']]
y = df_dependent

In [178]:
print(df_independent.head())


   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0      2107           90                  141600   
1      2107           91                  141600   
2     94108          100                  146600   
3     94112          101                  146200   
4     98106          102                  146600   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           130                           35   
1                           130                           35   
2                           130                           35   
3                           130                           35   
4                           130                           35   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           19  20390                 4               115   
1                           19  20390                 4               155   
2                           19  17519                 7               1

In [179]:
print(df_independent.shape)

(33, 51)


In [180]:
print(df_dependent.head())

   left
0     0
1     0
2     0
3     0
4     0


In [181]:
print(df_dependent.shape)

(33, 1)


## Splitting the dataset into the Training set and Test set

In [182]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [183]:
print(X_train.head())

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
25     10010          140                  138800   
24     10004          139                  131600   
30     78704          145                  132800   
17     60604          132                  131600   
22      2104          137                  139600   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
25                           130                           35   
24                           130                           35   
30                           130                           35   
17                           130                           35   
22                           130                           35   

    Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
25                           19  16101                 7                88   
24                           19  14081                 5               152   
30                           19  14409                 5

In [184]:
print(X_train.shape)

(26, 51)


In [185]:
print(X_test.head())

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
14     90008          129                  136000   
19     94108          134                  143600   
3      94112          101                  146200   
27     90008          142                  134800   
31     94108          147                  115250   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
14                           130                           35   
19                           130                           35   
3                            130                           35   
27                           130                           35   
31                           130                           35   

    Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
14                           19  15300                 6               156   
19                           19  17519                 8                94   
3                            19  17398                 7

In [186]:
print(X_test.shape)

(7, 51)


In [187]:
print(y_train.head())

    left
25     0
24     0
30     0
17     0
22     0


In [188]:
print(y_train.shape)

(26, 1)


In [189]:
print(y_test.head())

    left
14     0
19     0
3      0
27     0
31     0


In [190]:
print(y_test.shape)

(7, 1)


## Feature Scaling

In [191]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# List of columns to scale
columns_to_scale = [
    'Employee_Annual_Salary', 'Monthly_Medical_contribution',
    'Monthly_Dental_Contribution', 'Monthly_Vision_Contribution', 'Bonus',
    'Years_Of_Service', 'Employee_HR_rate', 'Hours_per_week',
    'Years_Since_Last_Promotion', 'age'
]

# Fit sc on training data and transform both training and test sets
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[columns_to_scale] = sc.fit_transform(X_train_scaled[columns_to_scale])
X_test_scaled[columns_to_scale] = sc.transform(X_test_scaled[columns_to_scale])


In [192]:
print(X_train_scaled)

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
25     10010          140                0.042052   
24     10004          139               -0.690234   
30     78704          145               -0.568187   
17     60604          132               -0.690234   
22      2104          137                0.123417   
4      98106          102                0.835362   
2      94108          100                0.835362   
21     60602          136                0.815020   
23     10004          138               -0.527504   
10     10010          122                0.916727   
29     90008          144               -0.161361   
28     90008          143               -0.120679   
18     10004          133               -0.812282   
6      98106          105                0.591266   
13     90008          128               -0.324091   
7      78704          106                0.265806   
32     94112          150                1.079457   
1       2107           91                0.326

In [193]:
print(X_test_scaled)

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
14     90008          129               -0.242726   
19     94108          134                0.530242   
3      94112          101                0.794679   
27     90008          142               -0.364774   
31     94108          147               -2.353134   
26     98102          141                0.896385   
20     60607          135               -0.364774   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
14                           0.2                          0.2   
19                           0.2                          0.2   
3                            0.2                          0.2   
27                           0.2                          0.2   
31                           0.2                          0.2   
26                           0.2                          0.2   
20                           0.2                          0.2   

    Monthly_Vision_Contribution     Bonus  Years_Of_Se

### Removing `Employee ID` & `Zip Code` in `X_train_scaled` & `X_test_scaled`

In [194]:
print(type(X_train_scaled))

<class 'pandas.core.frame.DataFrame'>


In [195]:
X_train_scaled_new = X_train_scaled.drop(columns=['Employee_ID', 'Zip_Code'])
print(X_train_scaled_new)

    Employee_Annual_Salary  Monthly_Medical_contribution  \
25                0.042052                           0.2   
24               -0.690234                           0.2   
30               -0.568187                           0.2   
17               -0.690234                           0.2   
22                0.123417                           0.2   
4                 0.835362                           0.2   
2                 0.835362                           0.2   
21                0.815020                           0.2   
23               -0.527504                           0.2   
10                0.916727                           0.2   
29               -0.161361                           0.2   
28               -0.120679                           0.2   
18               -0.812282                           0.2   
6                 0.591266                           0.2   
13               -0.324091                           0.2   
7                 0.265806              

In [196]:
print(type(X_test_scaled))

<class 'pandas.core.frame.DataFrame'>


In [197]:
X_test_scaled_new = X_test_scaled.drop(columns=['Employee_ID', 'Zip_Code'])
print(X_test_scaled_new)

    Employee_Annual_Salary  Monthly_Medical_contribution  \
14               -0.242726                           0.2   
19                0.530242                           0.2   
3                 0.794679                           0.2   
27               -0.364774                           0.2   
31               -2.353134                           0.2   
26                0.896385                           0.2   
20               -0.364774                           0.2   

    Monthly_Dental_Contribution  Monthly_Vision_Contribution     Bonus  \
14                          0.2                          0.2 -0.409212   
19                          0.2                          0.2  0.407344   
3                           0.2                          0.2  0.362818   
27                          0.2                          0.2 -0.533223   
31                          0.2                          0.2 -2.201295   
26                          0.2                          0.2  0.812863   
2

## Training the Model

In [198]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [199]:
model.fit(X_train_scaled_new, y_train.values.ravel())

## Prediction

In [200]:
y_pred = model.predict(X_test_scaled_new)

In [201]:
print(y_pred)

[0 0 0 0 0 1 0]


### Saving the output as new table

In [202]:
# Get Employee_IDs from X_test
employee_ids = X_test_scaled['Employee_ID'].values

In [203]:
print("employee_ids shape:", employee_ids.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)


employee_ids shape: (7,)
y_test shape: (7, 1)
y_pred shape: (7,)


In [204]:
print(y_test)

    left
14     0
19     0
3      0
27     0
31     0
26     1
20     0


In [205]:
# Flatten y_test to 1D
y_test = y_test.values.ravel()

In [206]:
print("employee_ids shape:", employee_ids.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)

employee_ids shape: (7,)
y_test shape: (7,)
y_pred shape: (7,)


In [207]:
print(y_test)

[0 0 0 0 0 1 0]


### get top 3 features

In [208]:
top_indices = np.argsort(model.coef_[0])[::-1][:3]


In [209]:
print(top_indices)

[ 8  5 18]


In [210]:
top_features = X_train_scaled_new.columns[top_indices].tolist()

In [211]:
results_df = pd.DataFrame({
    'Employee_ID': employee_ids,
    'Actual': y_test,
    'Predicted': y_pred,
    'Cluster_label': 3,
    'primary_attrition_feature': top_features[0],
    'secondary_attrition_feature': top_features[1],
    'tertiary_attrition_feature': top_features[2]
})




In [212]:
# Create the folder if it doesn't exist
os.makedirs("7_5_am_workday_log_reg_prediction_on_cluster_3_data", exist_ok=True)

# Save the Excel file inside the folder
results_df.to_csv("7_5_am_workday_log_reg_prediction_on_cluster_3_data/7_5_am_workday_log_reg_prediction_on_cluster_3_data.csv", index=False)


In [213]:
# spark_df = spark.createDataFrame(results_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.7_5_am_workday_log_reg_prediction_on_cluster_3_data")

## Accuracy

In [214]:
model.score(X_test_scaled_new,y_test)

1.0

In [215]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Confusion Matrix:
 [[6 0]
 [0 1]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


### saving the metrics

In [216]:
Accuracy = accuracy_score(y_test, y_pred)
Precision = precision_score(y_test, y_pred)
Recall = recall_score(y_test, y_pred) 
F1_Score = f1_score(y_test, y_pred)

# Create a DataFrame
metrics_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1_Score"],
    "Value": [Accuracy, Precision, Recall, F1_Score]
})


In [217]:
# Create the folder if it doesn't exist
os.makedirs("7_6_am_workday_log_reg_metrics_data", exist_ok=True)

# Save the Excel file inside the folder
metrics_df.to_csv("7_6_am_workday_log_reg_metrics_data/7_6_am_workday_log_reg_metrics_data.csv", index=False)


In [218]:
# spark_df = spark.createDataFrame(metrics_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.7_6_am_workday_log_reg_metrics_data")

## Bias & Weights

In [219]:
print(model.coef_)       # β₁, β₂, ..., βn
print(model.intercept_)  # β₀

[[ 0.09390019  0.02190787  0.02190787  0.02190787  0.08469375  0.55282843
  -0.40059484  0.          1.33411941 -0.3119758  -0.02113988 -0.06718198
   0.         -0.03447581 -0.01013621 -0.02532782 -0.13806344 -0.00418484
   0.35629074 -0.00784043 -0.00467585 -0.02145165  0.35089909 -0.00818676
  -0.00365627  0.          0.21808978 -0.1077808  -0.01990816 -0.00191724
  -0.09222072  0.          0.         -0.10223289 -0.06997445 -0.00906241
   0.         -0.17512839 -0.06718198 -0.15131069 -0.24506823  0.20990301
  -0.12892067 -0.0333221   0.25939417  0.1572401   0.2780511  -0.16753831
  -0.10977919]]
[-3.30188274]


In [220]:
len(model.coef_.ravel())

49

In [221]:
max(model.coef_.ravel())

np.float64(1.33411941305918)

In [222]:
feature_weights = pd.Series(model.coef_[0], index=X_train_scaled_new.columns)
print(feature_weights)


Employee_Annual_Salary                      0.093900
Monthly_Medical_contribution                0.021908
Monthly_Dental_Contribution                 0.021908
Monthly_Vision_Contribution                 0.021908
Bonus                                       0.084694
Years_Of_Service                            0.552828
Employee_HR_rate                           -0.400595
Hours_per_week                              0.000000
Years_Since_Last_Promotion                  1.334119
age                                        -0.311976
Current_Role_HR Manager                    -0.021140
Current_Role_Lead Data Engineer            -0.067182
Current_Role_Lead Data Scientist            0.000000
Current_Role_Lead DevOps Engineer          -0.034476
Current_Role_Lead Software Developer       -0.010136
Current_Role_Lead Software Engineer        -0.025328
Current_Role_Lead UX Designer              -0.138063
Current_Role_Product Designer              -0.004185
Current_Role_Product Lead                   0.

### Saving All Feature & Weights

In [223]:
weights_df = pd.DataFrame({
    'Feature': X_train_scaled_new.columns,
    'Weight': model.coef_[0] 
})

print(weights_df)

                                     Feature    Weight
0                     Employee_Annual_Salary  0.093900
1               Monthly_Medical_contribution  0.021908
2                Monthly_Dental_Contribution  0.021908
3                Monthly_Vision_Contribution  0.021908
4                                      Bonus  0.084694
5                           Years_Of_Service  0.552828
6                           Employee_HR_rate -0.400595
7                             Hours_per_week  0.000000
8                 Years_Since_Last_Promotion  1.334119
9                                        age -0.311976
10                   Current_Role_HR Manager -0.021140
11           Current_Role_Lead Data Engineer -0.067182
12          Current_Role_Lead Data Scientist  0.000000
13         Current_Role_Lead DevOps Engineer -0.034476
14      Current_Role_Lead Software Developer -0.010136
15       Current_Role_Lead Software Engineer -0.025328
16             Current_Role_Lead UX Designer -0.138063
17        

In [224]:
# Create the folder if it doesn't exist
os.makedirs("7_1_am_workday_all_feature_vs_weights_on_cluster_3_data", exist_ok=True)

# Save the Excel file inside the folder
weights_df.to_csv("7_1_am_workday_all_feature_vs_weights_on_cluster_3_data/7_1_am_workday_all_feature_vs_weights_on_cluster_3_data.csv", index=False)

In [225]:
# spark_df = spark.createDataFrame(weights_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.7_1_am_workday_all_feature_vs_weights_on_cluster_3_data")

### Saving Current_Role & Weights

In [226]:
# pyspark_df = spark.table("sd_bdc_demo.k_cluster_logistic_regression.7_1_am_workday_all_feature_vs_weights_on_cluster_3_data")
# pyspark_df.display()
# all_feature_df = pyspark_df.toPandas()

In [227]:
all_feature_df = pd.read_csv("7_1_am_workday_all_feature_vs_weights_on_cluster_3_data/7_1_am_workday_all_feature_vs_weights_on_cluster_3_data.csv")

In [228]:
print(all_feature_df.head())

                        Feature    Weight
0        Employee_Annual_Salary  0.093900
1  Monthly_Medical_contribution  0.021908
2   Monthly_Dental_Contribution  0.021908
3   Monthly_Vision_Contribution  0.021908
4                         Bonus  0.084694


In [229]:
all_feature_df_Current_Role = all_feature_df[all_feature_df['Feature'].str.startswith('Current_Role_')]
print(all_feature_df_Current_Role)


                                     Feature    Weight
10                   Current_Role_HR Manager -0.021140
11           Current_Role_Lead Data Engineer -0.067182
12          Current_Role_Lead Data Scientist  0.000000
13         Current_Role_Lead DevOps Engineer -0.034476
14      Current_Role_Lead Software Developer -0.010136
15       Current_Role_Lead Software Engineer -0.025328
16             Current_Role_Lead UX Designer -0.138063
17             Current_Role_Product Designer -0.004185
18                 Current_Role_Product Lead  0.356291
19    Current_Role_Product Marketing Manager -0.007840
20              Current_Role_Project Manager -0.004676
21     Current_Role_Senior Account Executive -0.021452
22       Current_Role_Senior Account Manager  0.350899
23      Current_Role_Senior Business Analyst -0.008187
24        Current_Role_Senior Content Writer -0.003656
25       Current_Role_Senior DevOps Engineer  0.000000
26     Current_Role_Senior Financial Analyst  0.218090
27        

In [230]:
Current_Role_df = all_feature_df_Current_Role.copy()

In [231]:
Current_Role_df['Feature'] = Current_Role_df['Feature'].str.removeprefix('Current_Role_')


In [232]:
print(Current_Role_df)

                        Feature    Weight
10                   HR Manager -0.021140
11           Lead Data Engineer -0.067182
12          Lead Data Scientist  0.000000
13         Lead DevOps Engineer -0.034476
14      Lead Software Developer -0.010136
15       Lead Software Engineer -0.025328
16             Lead UX Designer -0.138063
17             Product Designer -0.004185
18                 Product Lead  0.356291
19    Product Marketing Manager -0.007840
20              Project Manager -0.004676
21     Senior Account Executive -0.021452
22       Senior Account Manager  0.350899
23      Senior Business Analyst -0.008187
24        Senior Content Writer -0.003656
25       Senior DevOps Engineer  0.000000
26     Senior Financial Analyst  0.218090
27            Senior HR Manager -0.107781
28     Senior Marketing Manager -0.019908
29  Senior Marketing Specialist -0.001917
30       Senior Product Manager -0.092221
31       Senior Project Manager  0.000000
32       Senior Sales Executive  0

In [233]:
# Create the folder if it doesn't exist
os.makedirs("7_2_am_workday_current_role_vs_weights_on_cluster_3_data", exist_ok=True)

# Save the Excel file inside the folder
Current_Role_df.to_csv("7_2_am_workday_current_role_vs_weights_on_cluster_3_data/7_2_am_workday_current_role_vs_weights_on_cluster_3_data.csv", index=False)

In [234]:
# spark_df = spark.createDataFrame(Current_Role_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.7_2_am_workday_current_role_vs_weights_on_cluster_3_data")

### Saving Department & Weights

In [235]:
all_feature_df_Department = all_feature_df[all_feature_df['Feature'].str.startswith('Department_')]
print(all_feature_df_Department)


                       Feature    Weight
38     Department_Data Science -0.067182
39           Department_Design -0.151311
40      Department_Engineering -0.245068
41          Department_Finance  0.209903
42  Department_Human Resources -0.128921
43        Department_Marketing -0.033322
44          Department_Product  0.259394
45            Department_Sales  0.157240


In [236]:
Department_df = all_feature_df_Department.copy()

In [237]:
Department_df['Feature'] = Department_df['Feature'].str.removeprefix('Department_')

In [238]:
# Create the folder if it doesn't exist
os.makedirs("7_3_am_workday_department_vs_weights_on_cluster_3_data", exist_ok=True)

# Save the Excel file inside the folder
Department_df.to_csv("7_3_am_workday_department_vs_weights_on_cluster_3_data/7_3_am_workday_department_vs_weights_on_cluster_3_data.csv", index=False)

In [239]:
# spark_df = spark.createDataFrame(Department_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.7_3_am_workday_department_vs_weights_on_cluster_3_data")

### Saving Gender & Weights

In [240]:
all_feature_df_Gender = all_feature_df[all_feature_df['Feature'].str.startswith('Gender_')]
print(all_feature_df_Gender)


              Feature    Weight
46      Gender_Female  0.278051
47        Gender_Male -0.167538
48  Gender_Non-Binary -0.109779


In [241]:
Gender_df = all_feature_df_Gender.copy()

In [242]:
Gender_df['Feature'] = Gender_df['Feature'].str.removeprefix('Gender_')

In [243]:
# Create the folder if it doesn't exist
os.makedirs("7_4_am_workday_gender_vs_weights_on_cluster_3_data", exist_ok=True)

# Save the Excel file inside the folder
Gender_df.to_csv("7_4_am_workday_gender_vs_weights_on_cluster_3_data/7_4_am_workday_gender_vs_weights_on_cluster_3_data.csv", index=False)

In [244]:
# spark_df = spark.createDataFrame(Gender_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.7_4_am_workday_gender_vs_weights_on_cluster_3_data")

## Create a table by combining predictions of cluster 1,2,3

In [245]:
# pyspark_df_cluster_1 = spark.table("sd_bdc_demo.k_cluster_logistic_regression.5_5_am_workday_log_reg_prediction_on_cluster_1_data")
# pyspark_df_cluster_2 = spark.table("sd_bdc_demo.k_cluster_logistic_regression.6_5_am_workday_log_reg_prediction_on_cluster_2_data")
# pyspark_df_cluster_3 = spark.table("sd_bdc_demo.k_cluster_logistic_regression.7_5_am_workday_log_reg_prediction_on_cluster_3_data")

# df_cluster_1 = pyspark_df_cluster_1.toPandas()
# df_cluster_2 = pyspark_df_cluster_2.toPandas()
# df_cluster_3 = pyspark_df_cluster_3.toPandas()

In [246]:
df_cluster_1 = pd.read_csv("5_5_am_workday_log_reg_prediction_on_cluster_1_data/5_5_am_workday_log_reg_prediction_on_cluster_1_data.csv")
df_cluster_2 = pd.read_csv("6_5_am_workday_log_reg_prediction_on_cluster_2_data/6_5_am_workday_log_reg_prediction_on_cluster_2_data.csv")
df_cluster_3 = pd.read_csv("7_5_am_workday_log_reg_prediction_on_cluster_3_data/7_5_am_workday_log_reg_prediction_on_cluster_3_data.csv")



In [247]:
df_cluster_123 = pd.concat([df_cluster_1, df_cluster_2, df_cluster_3], ignore_index=True)

In [248]:
# Create the folder if it doesn't exist
os.makedirs("7_7_am_workday_log_reg_prediction_on_cluster_123_data", exist_ok=True)

# Save the Excel file inside the folder
df_cluster_123.to_csv("7_7_am_workday_log_reg_prediction_on_cluster_123_data/7_7_am_workday_log_reg_prediction_on_cluster_123_data.csv", index=False)

In [249]:
# spark_df_123 = spark.createDataFrame(df_cluster_123)
# spark_df_123.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.7_7_am_workday_log_reg_prediction_on_cluster_123_data")