# Logistic Regression

## Importing the libraries

In [529]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [530]:
# Read the source workbook into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_excel(io="updated_data/updated_data.xlsx")


In [531]:
# Show the Python type of the loaded object.
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [532]:
# Preview the first rows of the raw dataset.
print(df.head())

                Current_Role  Zip Code  Employee_ID  Employee Annual Salary$  \
0   Senior Software Engineer     94103           57                   130800   
1   Associate Data Scientist     94111           58                    74250   
2  Associate Product Manager     10009           59                   125850   
3           Business Analyst     60605           60                    84750   
4    Chief Operating Officer      2101           61                   300000   

   Monthly Medical contribution$  Monthly Dental Contribution$  \
0                            130                            35   
1                            105                            30   
2                            130                            35   
3                            105                            30   
4                            230                            55   

   Monthly Vision Contribution$  Bonus $  Years_Of_Service    Department  \
0                            19    13865      

In [533]:
# Dimensions of the raw dataset as (rows, columns).
print(df.shape)

(100, 16)


## Encoding categorical data

### Encoding the Independent Variable

In [534]:
# One-hot encode the categorical columns, keeping every level (drop_first=False),
# then cast the whole frame to int so the dummy columns become 0/1 integers.
categorical_columns = ['Current_Role', 'Department', 'Gender']
df_dummies = pd.get_dummies(df, columns=categorical_columns, drop_first=False)
df_encoded = df_dummies.astype(int)


In [535]:
# Preview the first rows of the one-hot encoded frame.
print(df_encoded.head())

   Zip Code  Employee_ID  Employee Annual Salary$  \
0     94103           57                   130800   
1     94111           58                    74250   
2     10009           59                   125850   
3     60605           60                    84750   
4      2101           61                   300000   

   Monthly Medical contribution$  Monthly Dental Contribution$  \
0                            130                            35   
1                            105                            30   
2                            130                            35   
3                            105                            30   
4                            230                            55   

   Monthly Vision Contribution$  Bonus $  Years_Of_Service  Employee HR rate  \
0                            19    13865                 5                60   
1                            17     4344                 1               117   
2                            19    15920    

In [536]:
# Dimensions after encoding as (rows, columns).
print(df_encoded.shape)

(100, 105)


## Splitting the DataFrame into Independent Features and the Dependent Column

In [537]:
# Features: every encoded column except the 'left' target.
df_independent = df_encoded.drop('left', axis=1)
# Target: kept as a single-column DataFrame (not a Series).
df_dependent = df_encoded.loc[:, ['left']]
# Short aliases used by the modelling cells.
X, y = df_independent, df_dependent

In [538]:
# Preview the first rows of the feature frame.
print(df_independent.head())


   Zip Code  Employee_ID  Employee Annual Salary$  \
0     94103           57                   130800   
1     94111           58                    74250   
2     10009           59                   125850   
3     60605           60                    84750   
4      2101           61                   300000   

   Monthly Medical contribution$  Monthly Dental Contribution$  \
0                            130                            35   
1                            105                            30   
2                            130                            35   
3                            105                            30   
4                            230                            55   

   Monthly Vision Contribution$  Bonus $  Years_Of_Service  Employee HR rate  \
0                            19    13865                 5                60   
1                            17     4344                 1               117   
2                            19    15920    

In [539]:
# Dimensions of the feature frame as (rows, columns).
print(df_independent.shape)

(100, 104)


In [540]:
# Preview the first rows of the target column.
print(df_dependent.head())

   left
0     0
1     0
2     0
3     0
4     1


In [541]:
# Dimensions of the target frame as (rows, columns).
print(df_dependent.shape)

(100, 1)


## Splitting the dataset into the Training set and Test set

In [542]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified on the target — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [543]:
# Preview the first rows of the training features.
print(X_train.head())

    Zip Code  Employee_ID  Employee Annual Salary$  \
2      10009           59                   125850   
73     90011          130                   136000   
97     78712          154                    88100   
62     98102          119                   190000   
19      2108           76                   220000   

    Monthly Medical contribution$  Monthly Dental Contribution$  \
2                             130                            35   
73                            130                            35   
97                            105                            30   
62                            155                            40   
19                            180                            45   

    Monthly Vision Contribution$  Bonus $  Years_Of_Service  Employee HR rate  \
2                             19    15920                 1               147   
73                            19    15300                 6                68   
97                           

In [544]:
# Dimensions of the training features as (rows, columns).
print(X_train.shape)

(80, 104)


In [545]:
# Preview the first rows of the test features.
print(X_test.head())

    Zip Code  Employee_ID  Employee Annual Salary$  \
80      2104          137                   139600   
84     98102          141                   147200   
33      2107           90                   141600   
81     10004          138                   133200   
93     94112          150                   149000   

    Monthly Medical contribution$  Monthly Dental Contribution$  \
80                            130                            35   
84                            130                            35   
33                            130                            35   
81                            130                            35   
93                            130                            35   

    Monthly Vision Contribution$  Bonus $  Years_Of_Service  Employee HR rate  \
80                            19    16333                 7                82   
84                            19    18621                 9               130   
33                           

In [546]:
# Dimensions of the test features as (rows, columns).
print(X_test.shape)

(20, 104)


In [547]:
# Preview the first rows of the training labels.
print(y_train.head())

    left
2      0
73     1
97     0
62     1
19     1


In [548]:
# Dimensions of the training labels as (rows, columns).
print(y_train.shape)

(80, 1)


In [549]:
# Preview the first rows of the test labels.
print(y_test.head())

    left
80     0
84     1
33     0
81     0
93     0


In [550]:
# Dimensions of the test labels as (rows, columns).
print(y_test.shape)

(20, 1)


## Feature Scaling

In [551]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardised; any column not listed here is left unchanged.
columns_to_scale = [
    'Zip Code',
    'Employee Annual Salary$',
    'Monthly Medical contribution$',
    'Monthly Dental Contribution$',
    'Monthly Vision Contribution$',
    'Bonus $',
    'Years_Of_Service',
    'Employee HR rate',
    '# of Hours per week',
    'Years_Since_Last_Promotion',
    'age',
]

sc = StandardScaler()

# Work on copies so the unscaled X_train / X_test stay available.
# The scaler is fitted on the training rows only and then re-used, unchanged, on the test rows.
X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = sc.fit_transform(X_train[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = sc.transform(X_test[columns_to_scale])


In [552]:
# Show the scaled training features (pandas elides the middle rows of a long frame when printing).
print(X_train_scaled)

    Zip Code  Employee_ID  Employee Annual Salary$  \
2  -1.564536           59                -0.078349   
73  0.667639          130                 0.108960   
97  0.352380          154                -0.774991   
62  0.893390          119                 1.105480   
19 -1.784986           76                 1.659103   
..       ...          ...                      ...   
75 -0.152860          132                 0.027762   
9  -1.564731           66                -0.011914   
72  0.667555          129                 0.108960   
12  0.781840           69                -0.910628   
37  0.782007           94                -1.210507   

    Monthly Medical contribution$  Monthly Dental Contribution$  \
2                        0.030762                      0.030762   
73                       0.030762                      0.030762   
97                      -0.789546                     -0.789546   
62                       0.851069                      0.851069   
19              

In [553]:
# Show the scaled test features.
print(X_test_scaled)

    Zip Code  Employee_ID  Employee Annual Salary$  \
80 -1.785097          137                 0.175395   
84  0.893390          141                 0.315646   
33 -1.785013           90                 0.212303   
81 -1.564675          138                 0.057288   
93  0.782063          150                 0.348863   
17  0.782063           74                -0.691947   
36  0.782035           93                -0.961377   
82 -1.564675          139                 0.027762   
69  0.667611          126                 1.470871   
65 -1.564508          122                 0.319336   
92  0.781812          149                 1.659103   
39  0.352910           96                -1.205894   
56  0.893362          113                -0.850652   
52 -1.564703          109                 0.120955   
51 -1.564648          108                -1.168063   
32 -1.785013           89                -1.127464   
31 -1.785013           88                -1.186517   
44  0.782063          101   

### Removing `Employee_ID` from `X_train_scaled` & `X_test_scaled`

In [554]:
# Show the Python type of the scaled training features.
print(type(X_train_scaled))

<class 'pandas.core.frame.DataFrame'>


In [555]:
# Build the training matrix without the Employee_ID column; X_train_scaled itself is not modified.
X_train_scaled_new = X_train_scaled.drop(columns=['Employee_ID'])
print(X_train_scaled_new)

    Zip Code  Employee Annual Salary$  Monthly Medical contribution$  \
2  -1.564536                -0.078349                       0.030762   
73  0.667639                 0.108960                       0.030762   
97  0.352380                -0.774991                      -0.789546   
62  0.893390                 1.105480                       0.851069   
19 -1.784986                 1.659103                       1.671377   
..       ...                      ...                            ...   
75 -0.152860                 0.027762                       0.030762   
9  -1.564731                -0.011914                       0.030762   
72  0.667555                 0.108960                       0.030762   
12  0.781840                -0.910628                      -0.789546   
37  0.782007                -1.210507                      -0.789546   

    Monthly Dental Contribution$  Monthly Vision Contribution$   Bonus $  \
2                       0.030762                      0.030

In [556]:
# Show the Python type of the scaled test features.
print(type(X_test_scaled))

<class 'pandas.core.frame.DataFrame'>


In [557]:
# Build the test matrix without the Employee_ID column; X_test_scaled keeps it.
X_test_scaled_new = X_test_scaled.drop(columns=['Employee_ID'])
print(X_test_scaled_new)

    Zip Code  Employee Annual Salary$  Monthly Medical contribution$  \
80 -1.785097                 0.175395                       0.030762   
84  0.893390                 0.315646                       0.030762   
33 -1.785013                 0.212303                       0.030762   
81 -1.564675                 0.057288                       0.030762   
93  0.782063                 0.348863                       0.030762   
17  0.782063                -0.691947                      -0.789546   
36  0.782035                -0.961377                      -0.789546   
82 -1.564675                 0.027762                       0.030762   
69  0.667611                 1.470871                       1.671377   
65 -1.564508                 0.319336                       0.030762   
92  0.781812                 1.659103                       1.671377   
39  0.352910                -1.205894                      -0.789546   
56  0.893362                -0.850652                      -0.78

## Training the Model

In [558]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier created with the library's default settings (no arguments are overridden).
model = LogisticRegression()

In [559]:
# Flatten the single-column y_train frame into a 1-D label array before fitting.
train_labels = y_train.to_numpy().ravel()
model.fit(X_train_scaled_new, train_labels)

## Prediction

In [560]:
# Predict labels for the test features (the matrix without Employee_ID).
y_pred = model.predict(X_test_scaled_new)

In [561]:
# Show the predicted labels for the test rows.
print(y_pred)

[0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]


### Saving the output as a new table

In [562]:
# Pull the Employee_ID values of the test rows from X_test_scaled,
# which still carries that column (X_test_scaled_new does not).
employee_ids = X_test_scaled['Employee_ID'].to_numpy()

In [563]:
# Print the shape of each array side by side to check that they line up.
for label, values in (
    ("employee_ids", employee_ids),
    ("y_test", y_test),
    ("y_pred", y_pred),
):
    print(f"{label} shape:", values.shape)


employee_ids shape: (20,)
y_test shape: (20, 1)
y_pred shape: (20,)


In [564]:
# Show the true labels of the test rows.
print(y_test)

    left
80     0
84     1
33     0
81     0
93     0
17     0
36     0
82     0
69     0
65     1
92     1
39     0
56     0
52     0
51     0
32     0
31     0
44     0
78     0
10     0


In [565]:
# Flatten y_test to a 1-D array of labels.
# np.ravel accepts both the original single-column DataFrame and an
# already-flattened ndarray, so this cell can be re-run safely. The previous
# `y_test.values.ravel()` raised AttributeError on a second run, because
# after the first run y_test is an ndarray and has no `.values`.
y_test = np.ravel(y_test)

In [566]:
# Re-check the three shapes now that y_test has been flattened.
shapes_by_name = {
    "employee_ids": employee_ids.shape,
    "y_test": y_test.shape,
    "y_pred": y_pred.shape,
}
for label, shape in shapes_by_name.items():
    print(label + " shape:", shape)

employee_ids shape: (20,)
y_test shape: (20,)
y_pred shape: (20,)


In [567]:
# Show the true test labels again, after flattening.
print(y_test)

[0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0]


In [568]:
# One row per test employee: the ID, the true label, and the model's prediction.
result_columns = {
    'Employee_ID': employee_ids,
    'Actual': y_test,
    'Predicted': y_pred,
}
results_df = pd.DataFrame(data=result_columns)

# Ensure the output folder exists (no error if it is already there).
os.makedirs(name="final_data", exist_ok=True)

# Write the table to an Excel file in that folder, without the row index.
results_df.to_excel(excel_writer="final_data/final_data.xlsx", index=False)


## Accuracy

In [569]:
# Score the fitted model on the test set; the value is displayed as the cell's output.
model.score(X_test_scaled_new,y_test)

0.95

In [570]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# The confusion matrix goes first, on its own lines.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Each scalar metric takes (y_true, y_pred), so report them in one loop.
scalar_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for label, metric in scalar_metrics:
    print(label, metric(y_test, y_pred))


Confusion Matrix:
 [[17  0]
 [ 1  2]]
Accuracy: 0.95
Precision: 1.0
Recall: 0.6666666666666666
F1 Score: 0.8


## Bias & Weights

In [571]:
# Print the learned parameters of the fitted model.
print(model.coef_)       # feature weights β₁, β₂, ..., βn
print(model.intercept_)  # bias (intercept) term β₀

[[ 4.66251026e-02  1.93351137e-01 -1.76073136e-02 -1.76073136e-02
  -1.76073136e-02  1.15189059e-01  5.36775596e-01 -2.75422788e-01
   0.00000000e+00  2.25926518e+00 -3.05633406e-01  8.47808627e-05
   1.76989581e-04 -2.14818817e-03  1.42734693e-01 -2.96528754e-01
  -4.36770956e-03 -2.20916847e-03 -1.48211785e-04  0.00000000e+00
  -3.38333725e-03 -1.58708736e-03  8.26142006e-02  0.00000000e+00
   2.13078524e-01  2.84108512e-01 -8.91862333e-02 -5.28158639e-02
  -1.34707921e-02 -3.52510465e-03 -2.93124099e-02  0.00000000e+00
  -2.09664061e-03 -1.74620618e-02  0.00000000e+00 -1.25444854e-04
   0.00000000e+00 -4.59952568e-05  5.05721129e-05 -6.50936828e-02
   0.00000000e+00 -2.57014694e-02 -9.57494474e-02 -1.13977640e-02
  -2.12665806e-02 -1.38199977e-01  0.00000000e+00  0.00000000e+00
  -4.47268180e-03 -2.70072449e-03 -1.45133002e-05  2.68476547e-01
  -3.24125818e-02 -9.64692506e-02  2.59419424e-01 -1.04445019e-02
  -9.46839603e-02  0.00000000e+00 -1.67109992e-03 -4.26579706e-03
  -1.39268