In [None]:
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import Dataset

In [None]:
# Load the resume dataset; the path is relative to the notebook's working directory.
data=pd.read_csv("UpdatedResumeDataSet.csv")

In [None]:
# Preview the first rows to check the columns that were loaded.
data.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [None]:
# Sorted array of the distinct category names.
# NOTE(review): `a` is not referenced by any other visible cell — confirm it is still needed.
a=np.unique(list(data["Category"]))

In [None]:
# Column dtypes and non-null counts (used here to check for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  962 non-null    object
 1   Resume    962 non-null    object
dtypes: object(2)
memory usage: 15.2+ KB


In [None]:
class Vectorizer():
    """Regex-based text cleaner combined with a TF-IDF vectorizer.

    Typical usage: call ``build_vectorizer(texts)`` once to fit the TF-IDF
    vocabulary, then ``vectorizeTexts(texts)`` to obtain the TF-IDF matrix.

    Parameters
    ----------
    clean_pattern : str or None
        Regular expression; every match is replaced by a single space before
        vectorizing. ``None`` disables the regex step.
    max_features : int or None
        Forwarded to ``TfidfVectorizer(max_features=...)``.
    stop_words : str, list or None
        Forwarded to ``TfidfVectorizer(stop_words=...)``.
    """

    def __init__(self,clean_pattern=None,max_features=None,stop_words=None):
        self.clean_pattern = clean_pattern
        self.max_features = max_features
        self.stopwords = stop_words
        self.tfidf = TfidfVectorizer(stop_words=self.stopwords,max_features=self.max_features)
        # Set to True by build_vectorizer(); guards vectorizeTexts().
        self.builded = False

    def _clean_texts(self,texts):
        """Return a list with the cleaned version of every text in ``texts``.

        Cleaning = optional regex substitution (matches become a space),
        lower-casing and stripping of leading/trailing whitespace.
        A single string is treated as a one-document corpus.
        """
        # A bare string would otherwise be iterated character by character and
        # silently produce one "document" per character.
        if isinstance(texts, str):
            texts = [texts]

        cleaned = []
        for text in texts:
            if self.clean_pattern is not None:
                text = re.sub(self.clean_pattern," ",text)

            cleaned.append(text.lower().strip())

        return cleaned

    def _set_tfidf(self,cleaned_texts):
        """Fit the underlying TF-IDF model on already-cleaned texts."""
        self.tfidf.fit(cleaned_texts)

    def build_vectorizer(self,texts):
        """Clean ``texts``, fit the TF-IDF model on them and mark the vectorizer as built."""
        cleaned_texts = self._clean_texts(texts)
        self._set_tfidf(cleaned_texts)
        self.builded = True

    def vectorizeTexts(self,texts):
        """Clean ``texts`` and return the result of ``TfidfVectorizer.transform`` on them.

        Raises
        ------
        RuntimeError
            If ``build_vectorizer`` has not been called yet. ``RuntimeError``
            is a subclass of ``Exception``, so existing ``except Exception``
            handlers keep working.
        """
        if not self.builded:
            raise RuntimeError("Vectorizer is not builded.")

        cleaned_texts = self._clean_texts(texts)
        return self.tfidf.transform(cleaned_texts)

In [None]:
# Resume texts (model inputs) and their category names (targets) as plain Python lists.
x = data["Resume"].tolist()
y = data["Category"].tolist()

In [None]:
# Every character other than ASCII letters, digits and "!?.," is replaced by a space
# during cleaning; the TF-IDF vocabulary is capped at 3000 terms.
vectorizer = Vectorizer(
    clean_pattern="[^a-zA-Z0-9!?.,]",
    max_features=3000,
    stop_words="english",
)

In [None]:
# Fit the TF-IDF vocabulary on the resume texts.
# NOTE(review): this fits on the full corpus before the train/test split below, so the
# held-out resumes also influence the vocabulary and IDF weights — confirm this is intended.
vectorizer.build_vectorizer(x)

In [None]:
# Vectorize all resumes and convert the result to a dense array via .toarray().
vectorized_x=vectorizer.vectorizeTexts(x).toarray()

In [None]:
# Give every distinct category an integer id (its position in `yint`) and keep both
# lookup directions: name -> id for encoding targets, id -> name for decoding predictions.
yint = data["Category"].unique()
y_label = {category: idx for idx, category in enumerate(yint)}
label_y = dict(enumerate(yint))
label_y

{0: 'Data Science',
 1: 'HR',
 2: 'Advocate',
 3: 'Arts',
 4: 'Web Designing',
 5: 'Mechanical Engineer',
 6: 'Sales',
 7: 'Health and fitness',
 8: 'Civil Engineer',
 9: 'Java Developer',
 10: 'Business Analyst',
 11: 'SAP Developer',
 12: 'Automation Testing',
 13: 'Electrical Engineering',
 14: 'Operations Manager',
 15: 'Python Developer',
 16: 'DevOps Engineer',
 17: 'Network Security Engineer',
 18: 'PMO',
 19: 'Database',
 20: 'Hadoop',
 21: 'ETL Developer',
 22: 'DotNet Developer',
 23: 'Blockchain',
 24: 'Testing'}

In [None]:
# Translate every category name in `y` into its integer id and store the ids as an array.
y_encoded = np.asarray([y_label[category_name] for category_name in y])

In [None]:
# Sanity check: one encoded label per resume.
y_encoded.shape

(962,)

In [None]:
# Sanity check: (number of resumes, number of TF-IDF features).
vectorized_x.shape


(962, 3000)

In [None]:
class ResumeDataset(Dataset):
    """Map-style dataset pairing each feature row with its encoded label.

    Parameters
    ----------
    x_vectorized : indexable sequence
        One feature row per sample.
    y_encoded : indexable sequence
        One label per sample, in the same order as ``x_vectorized``.

    Raises
    ------
    ValueError
        If the two sequences do not have the same length.
    """

    def __init__(self,x_vectorized,y_encoded):
        # Fail fast: extra labels would otherwise be silently ignored and
        # missing ones would only surface as an IndexError in the middle of training.
        if len(x_vectorized) != len(y_encoded):
            raise ValueError(
                f"x_vectorized and y_encoded must have the same length, "
                f"got {len(x_vectorized)} and {len(y_encoded)}"
            )
        self.x_vectorized = x_vectorized
        self.y_encoded = y_encoded

    def __len__(self):
        """Number of samples."""
        return len(self.x_vectorized)

    def __getitem__(self,index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_vectorized[index],self.y_encoded[index]

In [None]:
# Wrap the feature rows and encoded labels in a dataset, then print its size and
# one sample (a (features, label) pair) as a quick sanity check.
dataset = ResumeDataset(vectorized_x,y_encoded)
print("Length of our dataset is",len(dataset))

print(dataset[1])


Length of our dataset is 962
(array([0., 0., 0., ..., 0., 0., 0.]), 0)


In [None]:
# Split sample indices (not the data itself) 70/30 with a fixed seed; the indices feed the samplers below.
# NOTE(review): no `stratify=` argument is passed, so per-category proportions are not
# guaranteed to match between the two subsets — confirm this is acceptable.
train_indices,test_indices = train_test_split(list(range(0,len(dataset))),test_size=0.30,random_state=42)

In [None]:
# Sizes of the training and held-out index lists.
print(len(train_indices))
print(len(test_indices))

673
289


In [None]:
# Samplers restrict each DataLoader to its own subset of indices of the shared dataset.
# SubsetRandomSampler draws those indices in random order.
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

In [None]:
# Both loaders read from the same dataset; the sampler decides which samples each one sees.
BATCH_SIZE = 128
train_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE,
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE,
                                                sampler=test_sampler)

In [None]:
class DenseNetwork(nn.Module):
    """Fully connected classifier with two hidden layers and dropout.

    The defaults reproduce the architecture used in this notebook
    (3000 -> 1000 -> 500 -> 25 with dropout 0.4); pass other sizes when the
    number of TF-IDF features or categories changes.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    hidden1_dim, hidden2_dim : int
        Widths of the first and second hidden layers.
    num_classes : int
        Number of output classes.
    dropout : float
        Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim=3000, hidden1_dim=1000, hidden2_dim=500,
                 num_classes=25, dropout=0.4):
        super(DenseNetwork,self).__init__()
        # Attribute names are unchanged so existing state_dicts remain loadable.
        self.fc1 = nn.Linear(input_dim,hidden1_dim)
        self.drop1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden1_dim,hidden2_dim)
        self.drop2 = nn.Dropout(dropout)
        self.prediction = nn.Linear(hidden2_dim,num_classes)

    def forward(self,x):
        """Return per-class log-probabilities for ``x``.

        ``x`` is cast to float32 first, so float64 inputs are accepted.
        """
        x = F.relu(self.fc1(x.to(torch.float)))
        x = self.drop1(x)
        x = F.relu(self.fc2(x))
        x = self.drop2(x)
        # Normalize over the class axis; dim=-1 equals dim=1 for batched
        # (N, features) input and also works for a single unbatched vector.
        x = F.log_softmax(self.prediction(x),dim=-1)

        return x

In [None]:
# Use the GPU when one is usable, otherwise fall back to the CPU so the notebook
# still runs on machines without an NVIDIA driver.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Instantiate the network and move its parameters to the selected device.
model = DenseNetwork().to(device)

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
# DenseNetwork.forward already returns log-probabilities (log_softmax), so the matching
# loss is NLLLoss. CrossEntropyLoss would apply log_softmax a second time; that yields
# the same value but is redundant and hides the fact that the model emits log-probabilities.
criterion = nn.NLLLoss()
optimizer = optim.Adamax(model.parameters(),lr=1e-3)

In [None]:
EPOCHS = 25
TRAIN_LOSSES = []      # per-epoch sum of the batch losses
TRAIN_ACCURACIES = []  # per-epoch training accuracy in percent

for epoch in range(1,EPOCHS+1):
    # Make sure dropout is active while training: predict_resume_category() switches
    # the model to eval mode, so re-running this cell afterwards would otherwise
    # train with dropout disabled.
    model.train()

    epoch_loss = 0.0
    epoch_true = 0
    epoch_total = 0
    for data_,target_ in train_loader:
        data_ = data_.to(device)
        target_ = target_.to(device)

        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward propagation
        outputs = model(data_)

        # Computing loss & backward propagation
        loss = criterion(outputs,target_)
        loss.backward()

        # Applying gradients
        optimizer.step()

        epoch_loss += loss.item()

        # Predicted class = index of the highest score in each row.
        pred = outputs.argmax(dim=1)
        epoch_true += (pred == target_).sum().item()

        epoch_total += target_.size(0)

    TRAIN_LOSSES.append(epoch_loss)
    TRAIN_ACCURACIES.append(100 * epoch_true / epoch_total)

    print(f"Epoch {epoch}/{EPOCHS} finished: train_loss = {epoch_loss}, train_accuracy = {TRAIN_ACCURACIES[epoch-1]}")

In [None]:
test_true = 0
test_total = len(test_sampler)
test_loss = 0.0  # sum of the batch losses

# Switch to eval mode so the dropout layers are disabled; otherwise units are still
# dropped at random during evaluation and the reported accuracy/loss are noisy and pessimistic.
model.eval()
with torch.no_grad():
    for data_,target_ in validation_loader:
        data_,target_ = data_.to(device),target_.to(device)

        outputs = model(data_)

        loss = criterion(outputs,target_).item()

        # Predicted class = index of the highest score in each row.
        pred = outputs.argmax(dim=1)

        test_true += (pred == target_).sum().item()
        test_loss += loss


print(f"Validation finished: Accuracy = {round(100 * test_true / test_total,2)}%, Loss = {test_loss}")

In [None]:
def predict_resume_category(resume_text):
    """Predict the category name for a single resume.

    Parameters
    ----------
    resume_text : str
        Raw resume text; it is cleaned by ``vectorizer.vectorizeTexts``.

    Returns
    -------
    The entry of ``label_y`` for the class with the highest model score.

    Notes
    -----
    Uses the notebook-level ``model``, ``vectorizer``, ``device`` and
    ``label_y`` objects and leaves ``model`` in eval mode.
    """
    model.eval()
    with torch.no_grad():
        # vectorizeTexts() cleans its input itself, so the raw text is passed
        # directly instead of being pre-cleaned through the private helper.
        features = vectorizer.vectorizeTexts([resume_text]).toarray()

        # Cast to float32 before the transfer so only half as many bytes are moved
        # to the device; the model casts its input to float32 anyway.
        batch = torch.as_tensor(features, dtype=torch.float32).to(device)

        outputs = model(batch)

        # Index of the highest-scoring class for the single row in the batch.
        predicted_index = outputs.argmax(dim=1).item()

    # Convert the predicted class index to the actual label.
    return label_y[predicted_index]

# Example usage
# NOTE(review): `new_resume` is an empty placeholder — paste real resume text here.
# Presumably an empty string contains no vocabulary terms, so the printed category
# is not a meaningful prediction; verify before relying on this output.
new_resume = ""

predicted_category = predict_resume_category(new_resume)
print(f"The predicted category for the new resume is: {predicted_category}")
