In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the processed resumes (per the original note, written by the earlier EDA notebook).
resumes_raw = pd.read_csv("../data/interim/cleaned_data.csv")

# pandas reads empty CSV fields back as NaN, and TfidfVectorizer refuses NaN
# documents. Drop rows that lack either the text or the label so the
# vectorizer and the classifiers never see missing values. The untouched
# frame stays available as `resumes_raw`.
df = resumes_raw.dropna(subset=['processed_resume', 'label']).reset_index(drop=True)

# Load the label encoder; its `classes_` supply the human-readable class names
# used when reporting results.
le = joblib.load("../models/label_encoder.pkl")

# Force plain strings so every document handed to the vectorizer is text.
X = df['processed_resume'].astype(str)
y = df['label']


In [None]:
# Build TF-IDF features from the resume text, with the vocabulary capped at
# 1500 terms. Fitting and transforming happen in one pass over X.
# NOTE(review): this fit sees every row of X, including rows that are later
# held out as the test split.
vectorizer = TfidfVectorizer(
    max_features=1500,
)
X_tfidf = vectorizer.fit_transform(raw_documents=X)

# Persist the fitted vectorizer next to the other model artifacts.
joblib.dump(value=vectorizer, filename='../models/tfidf_vectorizer.pkl')


In [None]:
# Sanity checks before splitting.
# This cell and the one after it used to repeat the data-loading and TF-IDF
# cells above verbatim, which only re-read the CSV and refitted the vectorizer
# without changing any state. `df`, `le`, `X`, `y`, `vectorizer` and `X_tfidf`
# from those cells are reused as-is.
assert X_tfidf.shape[0] == len(X) == len(y), "feature matrix, documents and labels are misaligned"
assert y.notna().all(), "found rows without a label"


In [None]:
# Split the raw documents first, then learn the TF-IDF vocabulary and IDF
# weights from the training documents only. Fitting the vectorizer on the full
# corpus before splitting lets test documents influence the features, which
# makes the reported scores optimistic.
# The row partition depends only on the number of samples and random_state,
# so the same rows land in train/test as when splitting the precomputed matrix.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=1500)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)  # transform only: never fit on test data

# Re-save the vectorizer so the persisted artifact is the one whose features
# the classifiers below are actually trained on.
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')


In [None]:
# Candidate classifiers, keyed by the display name used in the printed
# reports and plot titles.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    # LinearSVC's underlying solver uses a random number generator; pin the
    # seed (same value as the train/test split) so reruns give the same model.
    'SVM': LinearSVC(random_state=42),
}

def evaluate_model(model, name):
    """Fit ``model`` on the training split and report its test-split performance.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``le``. Prints the accuracy and a per-class classification report,
    then shows a confusion-matrix heatmap.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place, so the caller's object is trained afterwards.
    name : str
        Display name used in the printed heading and the plot title.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Pin the label set to every class the encoder knows. Without this, a class
    # that is absent from both y_test and y_pred makes classification_report
    # raise (target_names length mismatch) and shrinks the confusion matrix so
    # the tick labels no longer line up.
    # Assumes `y` holds the encoder's integer codes 0..n_classes-1 -- TODO confirm.
    class_ids = list(range(len(le.classes_)))

    print(f"\n📊 {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(
        "Classification Report:\n",
        classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_),
    )

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        xticklabels=le.classes_,
        yticklabels=le.classes_,
        cmap='Blues',
        ax=ax,
    )
    ax.set_title(f'{name} - Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

# Train and report on each candidate, in the order they appear in `models`.
for name in models:
    model = models[name]
    evaluate_model(model, name)


In [None]:
# Persist the fitted Logistic Regression classifier. It is assumed, not
# programmatically selected, to be the best-performing candidate.
joblib.dump(
    value=models['Logistic Regression'],
    filename='../models/logistic_model.pkl',
)
