In [2]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# ------------------
# Load Data
# ------------------
# Prefer the pre-cleaned dataset; fall back to the raw export when it is absent.
CLEANED = Path("../data/cleaned_resumes.csv")
RAW = Path("../data/UpdatedResumeDataSet.csv")

df = pd.read_csv(CLEANED if CLEANED.exists() else RAW)

# Use the 'Cleaned_Resume' column when the loaded frame has one, else 'Resume'.
text_col = 'Cleaned_Resume' if 'Cleaned_Resume' in df.columns else 'Resume'

# Drop rows with a missing resume or a missing label before deriving the
# features and targets: a row without a label cannot be used for training, and
# astype(str) could otherwise turn a missing resume into the literal text
# "nan", which would then be vectorized as if it were real content.
# This is a no-op when the data has no missing values.
df = df.dropna(subset=[text_col, 'Category'])

y = df['Category']               # target labels
texts = df[text_col].astype(str)  # resume text, coerced to plain strings

# ------------------
# Train-Test Split
# ------------------
# Hold out 20% of the samples for evaluation. The split is stratified on the
# label and uses a fixed seed so it is reproducible from run to run.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    texts,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Vectorizer
# TF-IDF features limited to 5000 terms. The vectorizer is fit on the training
# text only; the test text is transformed with the already-fitted vectorizer.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# ------------------
# Train Model
# ------------------
# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then the per-class report.
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

# ------------------
# Save Model + Vectorizer
# ------------------
# Make sure the output directory exists (no error if it is already there).
MODELS_DIR = Path("../models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Persist the classifier first, then the vectorizer it was trained with.
for filename, artifact in (
    ("log_reg_model.pkl", log_reg),
    ("tfidf_vectorizer.pkl", tfidf),
):
    joblib.dump(artifact, MODELS_DIR / filename)

print("✅ Model and vectorizer saved successfully!")


Accuracy: 0.9948186528497409
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         4
                     Arts       1.00      1.00      1.00         7
       Automation Testing       0.83      1.00      0.91         5
               Blockchain       1.00      1.00      1.00         8
         Business Analyst       1.00      1.00      1.00         6
           Civil Engineer       1.00      1.00      1.00         5
             Data Science       1.00      1.00      1.00         8
                 Database       1.00      1.00      1.00         7
          DevOps Engineer       1.00      0.91      0.95        11
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         8
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      1.00      1.00         9
                   Hadoop       