# Logistic Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
df = pd.read_excel("updated_data/updated_data.xlsx")


In [3]:
print(df.head())

                Current_Role  Zip Code  Employee Annual Salary$  \
0   Senior Software Engineer     94103                   130800   
1   Associate Data Scientist     94111                    74250   
2  Associate Product Manager     10009                   125850   
3           Business Analyst     60605                    84750   
4    Chief Operating Officer      2101                   300000   

   Monthly Medical contribution$  Monthly Dental Contribution$  \
0                            130                            35   
1                            105                            30   
2                            130                            35   
3                            105                            30   
4                            230                            55   

   Monthly Vision Contribution$  Bonus $  Years_Of_Service    Department  \
0                            19    13865               5.2   Engineering   
1                            17     4344        

In [4]:
print(df.shape)

(100, 15)


## Encoding categorical data

### Encoding the Independent Variable

In [5]:
df_encoded = pd.get_dummies(df, columns=['Current_Role', 'Department', 'Gender'], drop_first=False).astype(int)


In [6]:
print(df_encoded.head())

   Zip Code  Employee Annual Salary$  Monthly Medical contribution$  \
0     94103                   130800                            130   
1     94111                    74250                            105   
2     10009                   125850                            130   
3     60605                    84750                            105   
4      2101                   300000                            230   

   Monthly Dental Contribution$  Monthly Vision Contribution$  Bonus $  \
0                            35                            19    13865   
1                            30                            17     4344   
2                            35                            19    15920   
3                            30                            17     5890   
4                            55                            27    90000   

   Years_Of_Service  Employee HR rate  # of Hours per week  \
0                 5                60                   40   
1   

In [7]:
print(df_encoded.shape)

(100, 104)


## Split two Data Frame for Independent Features & Dependent Columns

In [8]:
df_independent = df_encoded.drop(columns=['left'])
X = df_independent
df_dependent = df_encoded[['left']]
y = df_dependent

In [9]:
print(df_independent.head())


   Zip Code  Employee Annual Salary$  Monthly Medical contribution$  \
0     94103                   130800                            130   
1     94111                    74250                            105   
2     10009                   125850                            130   
3     60605                    84750                            105   
4      2101                   300000                            230   

   Monthly Dental Contribution$  Monthly Vision Contribution$  Bonus $  \
0                            35                            19    13865   
1                            30                            17     4344   
2                            35                            19    15920   
3                            30                            17     5890   
4                            55                            27    90000   

   Years_Of_Service  Employee HR rate  # of Hours per week  \
0                 5                60                   40   
1   

In [10]:
print(df_independent.shape)

(100, 103)


In [11]:
print(df_dependent.head())

   left
0     0
1     0
2     0
3     0
4     1


In [12]:
print(df_dependent.shape)

(100, 1)


## Splitting the dataset into the Training set and Test set

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [14]:
print(X_train.head())

    Zip Code  Employee Annual Salary$  Monthly Medical contribution$  \
2      10009                   125850                            130   
73     90011                   136000                            130   
97     78712                    88100                            105   
62     98102                   190000                            155   
19      2108                   220000                            180   

    Monthly Dental Contribution$  Monthly Vision Contribution$  Bonus $  \
2                             35                            19    15920   
73                            35                            19    15300   
97                            30                            17     5594   
62                            40                            21    38000   
19                            45                            23    50600   

    Years_Of_Service  Employee HR rate  # of Hours per week  \
2                  1               147               

In [15]:
print(X_train.shape)

(80, 103)


In [16]:
print(X_test.head())

    Zip Code  Employee Annual Salary$  Monthly Medical contribution$  \
80      2104                   139600                            130   
84     98102                   147200                            130   
33      2107                   141600                            130   
81     10004                   133200                            130   
93     94112                   149000                            130   

    Monthly Dental Contribution$  Monthly Vision Contribution$  Bonus $  \
80                            35                            19    16333   
84                            35                            19    18621   
33                            35                            19    20390   
81                            35                            19    14519   
93                            35                            19    18253   

    Years_Of_Service  Employee HR rate  # of Hours per week  \
80                 7                82               

In [17]:
print(X_test.shape)

(20, 103)


In [18]:
print(y_train.head())

    left
2      0
73     1
97     0
62     1
19     1


In [19]:
print(y_train.shape)

(80, 1)


In [20]:
print(y_test.head())

    left
80     0
84     1
33     0
81     0
93     0


In [21]:
print(y_test.shape)

(20, 1)


## Feature Scaling

In [22]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# List of columns to scale
columns_to_scale = [
    'Zip Code', 'Employee Annual Salary$', 'Monthly Medical contribution$',
    'Monthly Dental Contribution$', 'Monthly Vision Contribution$', 'Bonus $',
    'Years_Of_Service', 'Employee HR rate', '# of Hours per week',
    'Years_Since_Last_Promotion', 'age'
]

# Fit sc on training data and transform both training and test sets
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[columns_to_scale] = sc.fit_transform(X_train_scaled[columns_to_scale])
X_test_scaled[columns_to_scale] = sc.transform(X_test_scaled[columns_to_scale])


In [23]:
print(X_train_scaled)

    Zip Code  Employee Annual Salary$  Monthly Medical contribution$  \
2  -1.564536                -0.078349                       0.030762   
73  0.667639                 0.108960                       0.030762   
97  0.352380                -0.774991                      -0.789546   
62  0.893390                 1.105480                       0.851069   
19 -1.784986                 1.659103                       1.671377   
..       ...                      ...                            ...   
75 -0.152860                 0.027762                       0.030762   
9  -1.564731                -0.011914                       0.030762   
72  0.667555                 0.108960                       0.030762   
12  0.781840                -0.910628                      -0.789546   
37  0.782007                -1.210507                      -0.789546   

    Monthly Dental Contribution$  Monthly Vision Contribution$   Bonus $  \
2                       0.030762                      0.030

In [24]:
print(X_test_scaled)

    Zip Code  Employee Annual Salary$  Monthly Medical contribution$  \
80 -1.785097                 0.175395                       0.030762   
84  0.893390                 0.315646                       0.030762   
33 -1.785013                 0.212303                       0.030762   
81 -1.564675                 0.057288                       0.030762   
93  0.782063                 0.348863                       0.030762   
17  0.782063                -0.691947                      -0.789546   
36  0.782035                -0.961377                      -0.789546   
82 -1.564675                 0.027762                       0.030762   
69  0.667611                 1.470871                       1.671377   
65 -1.564508                 0.319336                       0.030762   
92  0.781812                 1.659103                       1.671377   
39  0.352910                -1.205894                      -0.789546   
56  0.893362                -0.850652                      -0.78

## Training the Model

In [25]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [26]:
model.fit(X_train_scaled, y_train.values.ravel())

## Prediction

In [27]:
y_pred = model.predict(X_test_scaled)

## Accuracy

In [28]:
model.score(X_test_scaled,y_test)

0.95

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Confusion Matrix:
 [[17  0]
 [ 1  2]]
Accuracy: 0.95
Precision: 1.0
Recall: 0.6666666666666666
F1 Score: 0.8
