In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

In [45]:
df = pd.read_csv(r'D:\Hutson\learning-materials\AI&ML\Khóa 12-02AIMLDLCV nâng cao\Class\Datasets\recruitment_data.csv')
df

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,HiringDecision
0,26,1,2,0,3,26.783828,48,78,91,1,1
1,39,1,4,12,3,25.862694,35,68,80,2,1
2,48,0,2,3,2,9.920805,20,67,13,2,0
3,34,1,2,5,2,6.407751,36,27,70,3,0
4,30,0,1,6,1,43.105343,23,52,85,2,0
...,...,...,...,...,...,...,...,...,...,...,...
1495,48,0,2,3,4,9.183783,66,3,80,3,1
1496,27,1,2,10,3,14.847731,43,97,7,2,0
1497,24,1,1,1,2,4.289911,31,91,58,1,1
1498,48,0,2,4,4,36.299263,9,37,44,2,1


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  1500 non-null   int64  
 1   Gender               1500 non-null   int64  
 2   EducationLevel       1500 non-null   int64  
 3   ExperienceYears      1500 non-null   int64  
 4   PreviousCompanies    1500 non-null   int64  
 5   DistanceFromCompany  1500 non-null   float64
 6   InterviewScore       1500 non-null   int64  
 7   SkillScore           1500 non-null   int64  
 8   PersonalityScore     1500 non-null   int64  
 9   RecruitmentStrategy  1500 non-null   int64  
 10  HiringDecision       1500 non-null   int64  
dtypes: float64(1), int64(10)
memory usage: 129.0 KB


In [47]:
target = 'HiringDecision'
x = df.drop(target, axis=1)
y = df[target]

In [48]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(1200, 10) (300, 10) (1200,) (300,)


In [49]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## Choose model

In [30]:
params = {
    "n_estimators": [50, 100, 200, 300, 400, 500],
    "criterion": ['gini', 'entropy'],
    "max_depth": [10, 20, 30, 40, 50],
    "random_state": [100]
}

model = GridSearchCV(RandomForestClassifier(),param_grid=params, scoring = 'accuracy', cv=6, verbose=2, n_jobs= -1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

Fitting 6 folds for each of 60 candidates, totalling 360 fits
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       215
           1       0.94      0.86      0.90        85

    accuracy                           0.94       300
   macro avg       0.94      0.92      0.93       300
weighted avg       0.94      0.94      0.94       300



In [32]:
print(model.best_params_, model.best_score_)

{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 200, 'random_state': 100} 0.9258333333333334


In [35]:
for i in range(len(y_test)):
    print("predicted: ", y_pred[i], "actual: ", y_test.iloc[i])

predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  1 actual:  1
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  1
predicted:  0 actual:  0
predicted:  1 actual:  1
predicted:  1 actual:  1
predicted:  1 actual:  1
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  1 actual:  1
predicted:  0 actual:  0
predicted:  1 actual:  1
predicted:  0 actual:  0
predicted:  0 actual:  1
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  1 actual:  1
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  0 actual:  0
predicted:  1 actual:  1
predicted:  0 actual:  0


## SVM

In [42]:
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       215
           1       0.89      0.80      0.84        85

    accuracy                           0.92       300
   macro avg       0.91      0.88      0.89       300
weighted avg       0.92      0.92      0.92       300



## Logistics

In [50]:
params = {
    "penalty": ['l1', 'l2'],
    "C": [0.1, 0.5, 1, 5, 10],
    "solver": ['liblinear'],
    "max_iter": [100, 200, 300],
    "class_weight": ['balanced', None],
    "random_state": [100]
}
model = GridSearchCV(LogisticRegression(random_state=100),param_grid=params, scoring = 'accuracy', cv=6, verbose=2, n_jobs= -1)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

Fitting 6 folds for each of 60 candidates, totalling 360 fits
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       215
           1       0.84      0.73      0.78        85

    accuracy                           0.88       300
   macro avg       0.87      0.84      0.85       300
weighted avg       0.88      0.88      0.88       300

