U of C HackTheBias January 2026

Resume Classifier Model

January 17th 2026

In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

In [14]:
# load data
df = pd.read_csv("../data/clean_data/cleaned_data.csv")

X = df['content']
y = df['label']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipelines for both models
lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000))
])

svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(max_iter=2000))
])

# Train both models
lr_pipeline.fit(X_train, y_train)
svm_pipeline.fit(X_train, y_train)

# Evaluate both models on test set
lr_pred = lr_pipeline.predict(X_test)
svm_pred = svm_pipeline.predict(X_test)

In [16]:
# print results
print("Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, lr_pred))
print("\nClassification Report:\n", classification_report(y_test, lr_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, lr_pred))


print("Linear SVM Results:")
print("Accuracy:", accuracy_score(y_test, svm_pred))
print("\nClassification Report:\n", classification_report(y_test, svm_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, svm_pred))

Logistic Regression Results:
Accuracy: 0.9980648282535075

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1561
           1       1.00      1.00      1.00       506

    accuracy                           1.00      2067
   macro avg       1.00      1.00      1.00      2067
weighted avg       1.00      1.00      1.00      2067


Confusion Matrix:
 [[1559    2]
 [   2  504]]
Linear SVM Results:
Accuracy: 0.9995162070633768

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1561
           1       1.00      1.00      1.00       506

    accuracy                           1.00      2067
   macro avg       1.00      1.00      1.00      2067
weighted avg       1.00      1.00      1.00      2067


Confusion Matrix:
 [[1560    1]
 [   0  506]]


In [17]:
# Cross-validation (using full training data)

print("5-fold Cross-Validation Results:")

lr_cv_results = cross_validate(lr_pipeline, 
                                X_train, 
                                y_train, 
                                cv=5, 
                                scoring=['accuracy', 'precision', 'recall', 'f1'],
)
svm_cv_results = cross_validate(svm_pipeline, 
                                X_train, 
                                y_train, 
                                cv=5, 
                                scoring=['accuracy', 'precision', 'recall', 'f1'],
)

print("\nLogistic Regression CV Scores:")
print(f"  Accuracy:  {lr_cv_results['test_accuracy'].mean():.4f}")
print(f"  Precision: {lr_cv_results['test_precision'].mean():.4f}")
print(f"  Recall:    {lr_cv_results['test_recall'].mean():.4f}")
print(f"  F1:        {lr_cv_results['test_f1'].mean():.4f}")

print("\nLinear SVM CV Scores:")
print(f"  Accuracy:  {svm_cv_results['test_accuracy'].mean():.4f}")
print(f"  Precision: {svm_cv_results['test_precision'].mean():.4f}")
print(f"  Recall:    {svm_cv_results['test_recall'].mean():.4f}")
print(f"  F1:        {svm_cv_results['test_f1'].mean():.4f}")

5-fold Cross-Validation Results:

Logistic Regression CV Scores:
  Accuracy:  0.9983
  Precision: 0.9995
  Recall:    0.9934
  F1:        0.9964

Linear SVM CV Scores:
  Accuracy:  0.9996
  Precision: 1.0000
  Recall:    0.9985
  F1:        0.9992

Logistic Regression CV Scores:
  Accuracy:  0.9983
  Precision: 0.9995
  Recall:    0.9934
  F1:        0.9964

Linear SVM CV Scores:
  Accuracy:  0.9996
  Precision: 1.0000
  Recall:    0.9985
  F1:        0.9992


In [None]:
# LinearSVM scores very well and marginally better than LogisticRegression.
# save the model:

joblib.dump(svm_pipeline, '../model/resume_classifier_svm.joblib')

['../model/resume_classifier_svm.pkl']