In [2]:
import pandas as pd

# Load the resume dataset from the CSV file in the working directory.
df = pd.read_csv('Resume.csv')

# Preview the first rows, followed by one blank separator line.
print(df.head(), end='\n\n')

# Distribution of categories (first rows of the per-category counts).
print(df['Category'].value_counts().head())

         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  

Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
ADVOCATE                  118
CHEF                      118
ENGINEERING               118
Name: count, dtype: int64


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Stemmer shared by the text-cleaning step below.
stemmer = PorterStemmer()

# Fetch the NLTK stopword corpus, then keep the English list as a set so the
# cleaning step can do membership checks against it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a raw resume string into space-separated stemmed tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the text
    is lowercased and split on whitespace, tokens found in the module-level
    ``stop_words`` set are dropped, and the remaining tokens are stemmed with
    the module-level ``stemmer``.

    Args:
        text: Raw text to clean. Non-string values (for example ``None`` or
            the float ``NaN`` that pandas uses for missing cells) are treated
            as empty text instead of raising ``TypeError``.

    Returns:
        The stemmed tokens joined by single spaces; the empty string when the
        input is not a string or no tokens survive the filtering.
    """
    # Missing cells arrive as None/NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # Remove non-alphabetic characters
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase and split into words
    words = letters_only.lower().split()
    # Remove stopwords (checked on the lowercased token, before stemming)
    # and apply stemming
    stems = [stemmer.stem(word) for word in words if word not in stop_words]
    # Rejoin words into a single string
    return ' '.join(stems)

# Apply cleaning to the 'Resume_str' column and store the result in a new 'clean' column
df['clean'] = df['Resume_str'].apply(clean_text)
# Preview the first cleaned resumes
print(df['clean'].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    hr administr market associ hr administr summar...
1    hr specialist us hr oper summari versatil medi...
2    hr director summari year experi recruit plu ye...
3    hr specialist summari dedic driven dynam year ...
4    hr manag skill highlight hr skill hr depart st...
Name: clean, dtype: object


In [6]:
# feature extraction
# converts text into numerical features using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the TF-IDF vocabulary at 2000 features.
tfidf=TfidfVectorizer(max_features=2000)

# Fit the vectorizer on the cleaned resumes and build the feature matrix
# (one row per resume, one column per retained term).
X=tfidf.fit_transform(df['clean'])
print(X.shape)

(2484, 2000)


In [7]:
#encoding

from sklearn.preprocessing import LabelEncoder

# Encode the category names as integer labels for the classifier;
# le.classes_ holds the category names the encoder learned.
le=LabelEncoder()
y=le.fit_transform(df['Category'])
print(le.classes_)

['ACCOUNTANT' 'ADVOCATE' 'AGRICULTURE' 'APPAREL' 'ARTS' 'AUTOMOBILE'
 'AVIATION' 'BANKING' 'BPO' 'BUSINESS-DEVELOPMENT' 'CHEF' 'CONSTRUCTION'
 'CONSULTANT' 'DESIGNER' 'DIGITAL-MEDIA' 'ENGINEERING' 'FINANCE' 'FITNESS'
 'HEALTHCARE' 'HR' 'INFORMATION-TECHNOLOGY' 'PUBLIC-RELATIONS' 'SALES'
 'TEACHER']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as an unseen evaluation set. stratify=y keeps the
# class proportions the same in the training and test partitions, and the
# fixed random_state makes the split repeatable.
# NOTE(review): X was produced by a TF-IDF vectorizer fitted on the full
# dataset before this split, so test-set term statistics leak into the
# features. Consider splitting first and fitting the vectorizer on the
# training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(X_train.shape, X_test.shape)

(1987, 2000) (497, 2000)


In [9]:
# model selection

from sklearn.linear_model import LogisticRegression

# Logistic regression classifier: allow up to 500 solver iterations and use
# 'balanced' class weights because the categories are unevenly represented.
model = LogisticRegression(
    max_iter=500,
    class_weight='balanced',
)
model.fit(X_train, y_train)



In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Predict encoded category labels for the held-out resumes.
pred = model.predict(X_test)
# Overall accuracy, then per-category precision/recall/F1 with the encoded
# labels shown under their original category names.
print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred, target_names=le.classes_))


Accuracy: 0.6740442655935613
                        precision    recall  f1-score   support

            ACCOUNTANT       0.65      0.83      0.73        24
              ADVOCATE       0.48      0.50      0.49        24
           AGRICULTURE       0.78      0.54      0.64        13
               APPAREL       0.60      0.32      0.41        19
                  ARTS       0.46      0.29      0.35        21
            AUTOMOBILE       0.50      0.57      0.53         7
              AVIATION       0.86      0.79      0.83        24
               BANKING       0.94      0.65      0.77        23
                   BPO       0.50      0.50      0.50         4
  BUSINESS-DEVELOPMENT       0.47      0.79      0.59        24
                  CHEF       0.81      0.71      0.76        24
          CONSTRUCTION       0.83      0.86      0.84        22
            CONSULTANT       0.50      0.17      0.26        23
              DESIGNER       0.85      0.81      0.83        21
         D

In [11]:
import pickle

# Persist the fitted classifier, TF-IDF vectorizer and label encoder.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically, even if pickling raises part-way through.
# NOTE: only unpickle these files from a trusted location; loading a pickle
# can execute arbitrary code.
with open("resume_lr_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("resume_tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

with open("resume_label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(le, encoder_file)
