In [12]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import warnings
warnings.filterwarnings('ignore')

In [5]:
df_HR =pd.read_csv("Resources/hr.csv")
df_HR.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,20,50through99,Pvt Ltd,1,More,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50through99,Pvt Ltd,5,Between,0.0
2,11561,city_21,0.624,Other,No relevent experience,Full time course,Graduate,STEM,5,50through99,Pvt Ltd,0,Less,0.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,20,50through99,Funded Startup,4,More,0.0
5,21651,city_176,0.764,Other,Has relevent experience,Part time course,Graduate,STEM,11,50through99,Pvt Ltd,1,More,1.0


# Encoding

In [7]:
# Generate our categorical variable list
hr_cat = df_HR.dtypes[df_HR.dtypes == "object"].index.tolist()
hr_cat

['city',
 'gender',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'company_size',
 'company_type',
 'training_hours']

In [10]:
# Check the number of unique values in each column
df_HR[hr_cat].nunique()

city                   123
gender                   3
relevent_experience      2
enrolled_university      3
education_level          5
major_discipline         6
company_size             8
company_type             6
training_hours           3
dtype: int64

In [13]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df_HR[hr_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names_out(hr_cat)
encode_df.head()

Unnamed: 0,city_city_1,city_city_10,city_city_100,city_city_101,city_city_102,city_city_103,city_city_104,city_city_105,city_city_106,city_city_107,...,company_size_<10,company_type_Early Stage Startup,company_type_Funded Startup,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,training_hours_Between,training_hours_Less,training_hours_More
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [16]:
# Merge one-hot encoded features and drop the originals
df_HR = df_HR.merge(encode_df,left_index=True, right_index=True)
df_HR = df_HR.drop(hr_cat,1)
df_HR

Unnamed: 0,enrollee_id,city_development_index,experience,last_new_job,target,city_city_1,city_city_10,city_city_100,city_city_101,city_city_102,...,company_size_<10,company_type_Early Stage Startup,company_type_Funded Startup,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,training_hours_Between,training_hours_Less,training_hours_More
0,8949,0.920,20,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,29725,0.776,15,5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,11561,0.624,5,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,666,0.767,20,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
5,21651,0.764,11,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,24169,0.897,14,5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
18378,26496,0.698,20,5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
18379,21543,0.887,9,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
18380,30491,0.926,11,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


# Split the Data into Training and Testing

In [46]:
# Create our features
x_cols = [i for i in df_HR.columns if i not in ('enrollee_id', 'target')]
X = df_HR[x_cols]
y = df_HR['target']

In [47]:
X.describe()

Unnamed: 0,city_development_index,training_hours
count,19158.0,19158.0
mean,0.828848,65.366896
std,0.123362,60.058462
min,0.448,1.0
25%,0.74,23.0
50%,0.903,47.0
75%,0.92,88.0
max,0.949,336.0


In [48]:
# Check the balance of our target values
y.value_counts()

0.0    14381
1.0     4777
Name: target, dtype: int64

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)


In [50]:
print(Counter(y_train))

Counter({0.0: 10785, 1.0: 3583})


# ML

In [51]:
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [52]:
# Train the data
classifier.fit(X_train, y_train)

ValueError: could not convert string to float: 'Male'