### Machine Learning Model Implementation
In this section, we use the extracted textual and structural features to build and train several machine learning models. Each model learns to predict the match score between a resume and a job description from these features.


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used below to standardize the feature columns (StandardScaler's
# defaults remove the mean and scale each column to unit variance).
scaler = StandardScaler()

In [None]:
# Fit the scaler on features_df and return the standardized values. By default
# fit_transform returns a NumPy array, so the column labels are lost at this
# point and are restored in the next cell.
# NOTE(review): the scaler is fit on every row before the train/test split
# further down, so test rows influence the scaling statistics — consider
# fitting on the training rows only.
features_dff = scaler.fit_transform(features_df)

In [None]:
# Wrap the scaled array back into a DataFrame. Reusing the original index (not
# only the column labels) keeps the row labels intact, so the label-aligned
# assignment of df["match_score"] that follows pairs each row with its own
# score instead of silently misaligning (or producing NaN) when the index is
# not the default 0..n-1.
features_dff = pd.DataFrame(
    features_dff,
    columns=features_df.columns,
    index=features_df.index,
)

In [None]:
# Attach the raw (unscaled) match_score from df to the scaled frame. If the
# frame has no column of that name yet it is appended as the last column,
# which the iloc-based X / y selection further down relies on.
# NOTE(review): pandas aligns this assignment on index labels, so it assumes
# features_dff and df share the same row index — confirm.
features_dff["match_score"] = df["match_score"]

In [None]:
# From here on, features_df refers to the scaled frame that also carries the
# match_score column (same object as features_dff, not a copy).
features_df = features_dff

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the last one.
X = features_df.iloc[:, :-1]

In [None]:
# Target vector: the last column (expected to be match_score, which was
# assigned onto the frame above).
y = features_df.iloc[:, -1]

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### RIDGE REGRESSION

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold
# accuracy_score is called at the end of this cell, so it is imported here
# together with the other metrics (it was previously missing from this import).
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error
import numpy as np

# Candidate values for alpha (regularization strength) tried by the grid search.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}

# Shuffled 5-fold cross-validation with a fixed seed for reproducible folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Select alpha by cross-validated mean absolute error on the training split.
ridge = Ridge()
grid = GridSearchCV(
    ridge,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    verbose=0,
    return_train_score=False,
)
grid.fit(X_train, y_train)

best_ridge = grid.best_estimator_
print("Best alpha:", grid.best_params_['alpha'])

feature_cols = X_train.columns.tolist()

# Ridge outputs continuous values. Round them to the nearest integer and clamp
# them to the label range observed in training so they can be scored against
# the test labels with classification-style metrics.
y_pred_cont = best_ridge.predict(X_test)
label_min = int(np.min(y_train))
label_max = int(np.max(y_train))
y_pred_round = np.clip(np.round(y_pred_cont), label_min, label_max).astype(int)

print("Ridge Test MAE:", mean_absolute_error(y_test, y_pred_round))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_round))


acc = accuracy_score(y_test, y_pred_round)
print("Ridge Test Accuracy:", acc)

### ORDINAL LOGISTIC REGRESSION

In [None]:
from mord import LogisticIT
# accuracy_score is called at the end of this cell; import it explicitly so the
# cell does not fail with a NameError when no earlier cell has imported it.
from sklearn.metrics import accuracy_score

# Candidate regularization strengths (alpha) tried by the grid search.
param_grid = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]}

# Shuffled 5-fold cross-validation with a fixed seed for reproducible folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator: mord's ordinal logistic regression (LogisticIT).
ord_log = LogisticIT()

# Select alpha by cross-validated mean absolute error on the training split.
grid = GridSearchCV(
    estimator=ord_log,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    verbose=0,
    return_train_score=False
)

grid.fit(X_train, y_train)

best_ord = grid.best_estimator_
print("Best alpha:", grid.best_params_['alpha'])


# The predictions are passed to the metrics as-is; unlike the regression cells,
# no rounding or clipping step is applied here.
y_pred = best_ord.predict(X_test)

print("Ordinal Logistic Test MAE:", mean_absolute_error(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

feature_cols = X_train.columns.tolist()
acc = accuracy_score(y_test, y_pred)
print("logistic Test Accuracy:", acc)


### Support Vector Regression (SVR) Model

In [None]:
from sklearn.svm import SVR
# accuracy_score is called at the end of this cell; import it explicitly so the
# cell does not fail with a NameError when no earlier cell has imported it.
from sklearn.metrics import accuracy_score

# The grid is split by kernel: gamma is a kernel coefficient that the linear
# kernel does not use, so crossing it with 'linear' would only repeat
# identical fits. The candidate values themselves are unchanged.
param_grid = [
    {
        'kernel': ['rbf'],
        'C': [1, 10, 100],
        'epsilon': [0.01, 0.1, 0.5],
        'gamma': ['scale', 'auto'],
    },
    {
        'kernel': ['linear'],
        'C': [1, 10, 100],
        'epsilon': [0.01, 0.1, 0.5],
    },
]

# Shuffled 5-fold cross-validation with a fixed seed for reproducible folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
svr = SVR()
# Select hyper-parameters by cross-validated mean absolute error.
grid = GridSearchCV(svr, param_grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, verbose=0)
grid.fit(X_train, y_train)

best_svr = grid.best_estimator_
print("Best params:", grid.best_params_)

# SVR outputs continuous values. Round them and clamp to the label range seen
# in training (same rule as the Ridge and XGBoost cells, instead of a
# hard-coded 1..5) before scoring them against the test labels.
y_pred_cont = best_svr.predict(X_test)
label_min = int(np.min(y_train))
label_max = int(np.max(y_train))
y_pred_round = np.clip(np.round(y_pred_cont), label_min, label_max).astype(int)

print("SVR Test MAE:", mean_absolute_error(y_test, y_pred_round))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_round))

feature_cols = X_train.columns.tolist()

acc = accuracy_score(y_test, y_pred_round)
print("SVR Test Accuracy:", acc)

### XGBOOST

In [None]:
from xgboost import XGBRegressor
# accuracy_score is called at the end of this cell; import it explicitly so the
# cell does not fail with a NameError when no earlier cell has imported it.
from sklearn.metrics import accuracy_score

# Hyper-parameter candidates tried by the grid search.
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'reg_lambda': [0, 1],   # L2 regularization
    'reg_alpha': [0, 0.1]   # L1 regularization
}

# Shuffled 5-fold cross-validation with a fixed seed for reproducible folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Keep each booster single-threaded: GridSearchCV below already fans the fits
# out over all cores (n_jobs=-1), and requesting all cores at both levels
# makes the two layers compete for the same threads.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1, verbosity=0)

# Select hyper-parameters by cross-validated mean absolute error.
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    verbose=0,
    return_train_score=False
)

grid.fit(X_train, y_train)

best_xgb = grid.best_estimator_
print("Best params:", grid.best_params_)


# The regressor outputs continuous values. Round them and clamp to the label
# range seen in training before scoring them against the test labels.
y_pred_cont = best_xgb.predict(X_test)
y_pred_round = np.clip(np.round(y_pred_cont),
                       int(np.min(y_train)),
                       int(np.max(y_train))).astype(int)

print("XGBoost Test MAE:", mean_absolute_error(y_test, y_pred_round))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_round))

feature_cols = X_train.columns.tolist()
acc = accuracy_score(y_test, y_pred_round)
print("XGBoost Test Accuracy:", acc)
