In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk
from sklearn.preprocessing import LabelEncoder

# Download NLTK resources

In [2]:

# Fetch the NLTK corpora that preprocess_text relies on: the English stop-word
# list (read via stopwords.words) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet') 

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load the data

In [9]:

# Read the CSV, then pull out the text column (features) and the job-role
# column (target) in a single unpacking assignment.
data = pd.read_csv('/kaggle/input/resume-preprocess/preproced_data.csv')
X, y = data['cleaned_text'], data['job_role']

# Text Preprocessing Function

In [10]:

# Lazily-filled cache: the stop-word set and the lemmatizer are built on first
# real use and then reused, instead of being rebuilt for every document when
# the function is applied over a whole Series.
_PREPROCESS_RESOURCES = {}


def preprocess_text(text):
    """Normalize one document: keep letters, lowercase, drop stop words, lemmatize.

    Parameters
    ----------
    text : str, None or float NaN
        Raw document text. Missing values (None or NaN, which is what an
        empty CSV cell becomes after pd.read_csv) are treated as an empty
        document rather than crashing inside re.sub.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces, or "" when
        the input is missing or contains no letters.

    Raises
    ------
    TypeError
        For any other non-string input (raised by re.sub, as before).
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub(r"[^a-zA-Z]", " ", text).lower().split()
    if not tokens:
        return ""

    if not _PREPROCESS_RESOURCES:
        # Both values are evaluated before update() runs, so the cache is
        # never left half-populated if loading the corpus fails.
        _PREPROCESS_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    stop_words = _PREPROCESS_RESOURCES["stop_words"]
    lemmatize = _PREPROCESS_RESOURCES["lemmatizer"].lemmatize

    return " ".join(lemmatize(word) for word in tokens if word not in stop_words)

# Clean every document with preprocess_text. X is rebound to the result, so
# re-running this cell without reloading the data feeds already-cleaned text
# back through the function.
X = X.apply(preprocess_text)

# Encode the categorical labels to numeric labels

In [11]:

# Learn the set of job-role labels first, then map every label to its integer
# code; label_encoder.classes_ keeps the original names for reporting.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)


In [12]:
# Hold out 20% of the documents for testing; stratifying on the encoded labels
# keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)

# Vectorize the text data

In [13]:

# TF-IDF over 1- to 3-grams, capped at 10,000 features; terms seen in fewer
# than 5 documents or in more than 80% of documents are dropped.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.8,
    max_features=10000,
)

# The vocabulary is fitted on the training split only; the test split is
# transformed with that already-fitted vocabulary.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Balance the dataset using SMOTE

In [14]:

# Oversample the training split only; the test matrix is not passed to SMOTE.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_vectorized,
    y_train,
)


# Models

In [15]:

# Base learners. Every model is seeded with random_state=42.

# Logistic regression that picks its regularisation strength by 5-fold CV.
lr = LogisticRegressionCV(
    Cs=10,
    cv=5,
    max_iter=500,
    class_weight='balanced',
    random_state=42,
)

# Random forest: 200 trees, each limited to depth 15.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    class_weight='balanced',
    random_state=42,
)

# XGBoost: row and column subsampling at 0.8.
gx = XGBClassifier(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# scikit-learn gradient boosting with the same tree count, depth and learning rate.
gb = GradientBoostingClassifier(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    random_state=42,
)


# Stacking Classifier

In [18]:

# Meta-learner that combines the base models' cross-validated predictions.
meta_learner = LogisticRegressionCV(cv=5, max_iter=1000, class_weight='balanced')

# (name, estimator) pairs for the four base learners.
estimators = [
    ('Logistic Regression', lr),
    ('Random Forest', rf),
    ('XGBoost', gx),
    ('Gradient Boosting', gb),
]

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
)


In [19]:
# Display name -> estimator, in the order the models are trained and reported.
models = {
    'Logistic Regression': lr,
    'Random Forest': rf,
    'XGBoost': gx,
    'Gradient Boosting': gb,
    'Stacking Classifier': stack,
}


# Training and Evaluation

In [None]:

# Fit each model on the SMOTE-resampled training data, then report its
# per-class metrics on the test split plus train and test accuracy.
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_resampled, y_train_resampled)

    # Training accuracy is scored on the original (non-resampled) training
    # split, whose labels are y_train. Scoring y_train against the test-set
    # predictions compares arrays of different lengths and raises ValueError.
    y_train_pred = model.predict(X_train_vectorized)
    y_pred = model.predict(X_test_vectorized)

    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print(f"Train Accuracy Score: {accuracy_score(y_train, y_train_pred)}")
    print(f"Test Accuracy Score: {accuracy_score(y_test, y_pred)}")



Training Logistic Regression...


# Plot Confusion Matrix for Stacking Model

In [None]:

# Confusion matrix for the stacking model on the test split.
stack_pred = stack.predict(X_test_vectorized)

# Pin the label order to the encoder's integer codes so the matrix always has
# one row and column per class, even if a class is absent from the test split
# and the predictions. Otherwise the tick labels below could be misaligned.
class_codes = np.arange(len(label_encoder.classes_))
conf_matrix = confusion_matrix(y_test, stack_pred, labels=class_codes)

plt.figure(figsize=(15, 10))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix - Stacking Classifier")
plt.show()


Note: the outputs of this notebook were removed because it was used for several different experiments.
This notebook was run on Kaggle because the local system's configuration was not sufficient.