In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import Dataset

In [2]:
# Load the resume dataset from the CSV file located next to the notebook.
DATA_PATH = "UpdatedResumeDataSet.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [4]:
# Sorted array of the distinct category names found in the dataset.
category_names = data["Category"].tolist()
a = np.unique(category_names)

In [5]:
# Print column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  962 non-null    object
 1   Resume    962 non-null    object
dtypes: object(2)
memory usage: 15.2+ KB


In [6]:
class Vectorizer():
    """TF-IDF text vectorizer with an optional regex clean-up step.

    Texts are cleaned (pattern substitution, lower-casing, stripping) and then
    handed to a wrapped ``TfidfVectorizer``. The vectorizer has to be fitted
    with ``build_vectorizer`` before ``vectorizeTexts`` can be used.

    Args:
        clean_pattern: Regular expression; every match is replaced by a single
            space before vectorizing. ``None`` disables this step.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        stop_words: Forwarded to ``TfidfVectorizer(stop_words=...)``.
    """

    def __init__(self,clean_pattern=None,max_features=None,stop_words=None):
        self.clean_pattern = clean_pattern
        self.max_features = max_features
        self.stopwords = stop_words
        self.tfidf = TfidfVectorizer(stop_words=self.stopwords,max_features=self.max_features)
        # Set to True once build_vectorizer() has fitted the TF-IDF model.
        self.builded = False


    def _clean_texts(self,texts):
        """Return a list holding the cleaned version of every text.

        Args:
            texts: Iterable of strings. A single string is treated as one
                document instead of being iterated character by character.

        Returns:
            List of cleaned, lower-cased and stripped strings.
        """
        if isinstance(texts,str):
            texts = [texts]

        cleaned = []
        for text in texts:
            if self.clean_pattern is not None:
                text = re.sub(self.clean_pattern," ",text)

            cleaned.append(text.lower().strip())

        return cleaned


    def _set_tfidf(self,cleaned_texts):
        """Fit the wrapped TF-IDF model on already cleaned texts."""
        self.tfidf.fit(cleaned_texts)

    def build_vectorizer(self,texts):
        """Clean ``texts``, fit the TF-IDF model on them and mark it as built."""
        cleaned_texts = self._clean_texts(texts)
        self._set_tfidf(cleaned_texts)
        self.builded = True

    def vectorizeTexts(self,texts):
        """Clean ``texts`` and transform them with the fitted TF-IDF model.

        Returns:
            Whatever ``TfidfVectorizer.transform`` returns for the cleaned texts.

        Raises:
            RuntimeError: If ``build_vectorizer`` has not been called yet.
        """
        if not self.builded:
            # RuntimeError is a subclass of Exception, so callers that catch
            # the previously raised bare Exception keep working.
            raise RuntimeError("Vectorizer is not builded.")

        return self.tfidf.transform(self._clean_texts(texts))

In [7]:
# Raw resume texts (model input) and their category names (targets).
x = data["Resume"].tolist()
y = data["Category"].tolist()

In [8]:
# Every character matched by the pattern is replaced by a space during cleaning.
vectorizer = Vectorizer(
    clean_pattern="[^a-zA-Z0-9!?.,]",
    max_features=7000,
    stop_words="english",
)

In [9]:
# Fit the TF-IDF model on every resume text in `x`.
# NOTE(review): this fit uses the whole corpus and runs before the train/test
# split further down, so held-out documents contribute to the vocabulary and
# IDF weights. Consider fitting on the training texts only.
vectorizer.build_vectorizer(x)

In [10]:
# Transform the texts, then densify the result into a regular array.
tfidf_matrix = vectorizer.vectorizeTexts(x)
vectorized_x = tfidf_matrix.toarray()

In [11]:
# Map every category name to an integer id, in order of first appearance.
yint = data["Category"].unique()
y_label = {category: idx for idx, category in enumerate(yint)}
y_label

{'Data Science': 0,
 'HR': 1,
 'Advocate': 2,
 'Arts': 3,
 'Web Designing': 4,
 'Mechanical Engineer': 5,
 'Sales': 6,
 'Health and fitness': 7,
 'Civil Engineer': 8,
 'Java Developer': 9,
 'Business Analyst': 10,
 'SAP Developer': 11,
 'Automation Testing': 12,
 'Electrical Engineering': 13,
 'Operations Manager': 14,
 'Python Developer': 15,
 'DevOps Engineer': 16,
 'Network Security Engineer': 17,
 'PMO': 18,
 'Database': 19,
 'Hadoop': 20,
 'ETL Developer': 21,
 'DotNet Developer': 22,
 'Blockchain': 23,
 'Testing': 24}

In [12]:
# Translate each category name into its integer id.
y_encoded = np.asarray([y_label[y_sample] for y_sample in y])

In [13]:
# Sanity check: one encoded label per sample.
y_encoded.shape

(962,)

In [14]:
# Sanity check: (number of samples, number of TF-IDF features).
vectorized_x.shape


(962, 7000)

In [15]:
class ResumeDataset(Dataset):
    """Map-style dataset pairing feature vectors with their encoded labels.

    Args:
        x_vectorized: Indexable, sized collection of feature vectors, one per
            sample.
        y_encoded: Indexable, sized collection of labels aligned with
            ``x_vectorized``.

    Raises:
        ValueError: If the two collections do not have the same length.
    """

    def __init__(self,x_vectorized,y_encoded):
        # Fail early: a mismatch would otherwise only show up as an IndexError
        # (or silently ignored samples) while iterating over a loader.
        if len(x_vectorized) != len(y_encoded):
            raise ValueError(
                f"x_vectorized has {len(x_vectorized)} samples but "
                f"y_encoded has {len(y_encoded)} labels"
            )

        self.x_vectorized = x_vectorized
        self.y_encoded = y_encoded


    def __len__(self):
        """Return the number of samples."""
        return len(self.x_vectorized)


    def __getitem__(self,index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_vectorized[index],self.y_encoded[index]

In [16]:
# Wrap features and labels in a dataset, then inspect its size and one sample.
dataset = ResumeDataset(x_vectorized=vectorized_x, y_encoded=y_encoded)
print("Length of our dataset is", len(dataset))
print(dataset[1])


Length of our dataset is 962
(array([0., 0., 0., ..., 0., 0., 0.]), 0)


In [17]:
# Split sample positions (not the data itself) into 75% train / 25% test.
all_indices = list(range(len(dataset)))
train_indices, test_indices = train_test_split(
    all_indices,
    test_size=0.25,
    random_state=42,
)

In [18]:
# Number of samples in the train and test partitions, respectively.
print(len(train_indices))
print(len(test_indices))

721
241


In [19]:
# Each sampler restricts a loader to its index list; SubsetRandomSampler
# yields those indices in random order, without replacement.
# NOTE(review): no torch seed is set in the visible cells, so the batch order
# is expected to differ between runs - confirm whether that is intended.
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

In [20]:
BATCH_SIZE = 128


def _make_loader(sampler):
    """Build a loader over the shared dataset restricted to `sampler`'s indices."""
    return torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, sampler=sampler)


# Both loaders read from the same dataset; the samplers pick disjoint indices.
train_loader = _make_loader(train_sampler)
validation_loader = _make_loader(test_sampler)

In [21]:
class DenseNetwork(nn.Module):
    """Fully connected classifier: three hidden ReLU layers with dropout.

    Args:
        input_dim: Size of each input feature vector. Defaults to 7000.
        num_classes: Number of output classes. Defaults to 25.
        hidden_dims: Sizes of the three hidden layers, in order. Defaults to
            ``(5000, 3000, 1000)``.

    Raises:
        ValueError: If ``hidden_dims`` does not contain exactly three sizes.

    Note:
        ``forward`` returns log-probabilities (``log_softmax`` over ``dim=1``),
        which is the input ``nn.NLLLoss`` expects. ``nn.CrossEntropyLoss``
        applies ``log_softmax`` internally as well.
    """

    def __init__(self,input_dim=7000,num_classes=25,hidden_dims=(5000,3000,1000)):
        super(DenseNetwork,self).__init__()
        # Unpacking raises ValueError when hidden_dims is not of length three.
        hidden1,hidden2,hidden3 = hidden_dims

        self.fc1 = nn.Linear(input_dim,hidden1)
        self.drop1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(hidden1,hidden2)
        self.drop2 = nn.Dropout(0.4)
        self.fc3 = nn.Linear(hidden2,hidden3)
        self.drop3 = nn.Dropout(0.3)
        self.prediction = nn.Linear(hidden3,num_classes)

    def forward(self,x):
        """Return per-class log-probabilities for a batch of feature vectors.

        Args:
            x: Tensor whose last dimension is ``input_dim``; it is cast to
                ``torch.float`` before the first layer.

        Returns:
            Tensor of log-probabilities, normalized over ``dim=1``.
        """
        # Inputs may arrive in another dtype (e.g. float64), so cast first.
        x = F.relu(self.fc1(x.to(torch.float)))
        x = self.drop1(x)
        x = F.relu(self.fc2(x))
        x = self.drop2(x)
        x = F.relu(self.fc3(x))
        x = self.drop3(x)
        x = F.log_softmax(self.prediction(x),dim=1)

        return x

In [22]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [23]:
# Instantiate the network with its default arguments and move it to `device`.
model = DenseNetwork().to(device)

In [24]:
# The network's forward pass already ends in F.log_softmax, so the matching
# loss is the negative log-likelihood. nn.CrossEntropyLoss expects raw logits
# and would apply log_softmax a second time (same value, redundant work).
criterion = nn.NLLLoss()
optimizer = optim.Adamax(model.parameters(),lr=1e-3)

In [25]:
EPOCHS = 10
TRAIN_LOSSES = []
TRAIN_ACCURACIES = []

for epoch in range(1,EPOCHS+1):
    # Make sure dropout is active, even when this cell is re-run after the
    # model has been switched to evaluation mode.
    model.train()

    epoch_loss = 0.0   # sum of the per-batch mean losses of this epoch
    epoch_true = 0     # number of correctly classified training samples
    epoch_total = 0    # number of training samples seen in this epoch
    for data_,target_ in train_loader:
        data_ = data_.to(device)
        target_ = target_.to(device)

        # Cleaning optimizer cache.
        optimizer.zero_grad()

        # Forward propagation
        outputs = model(data_)

        # Computing loss & backward propagation
        loss = criterion(outputs,target_)
        loss.backward()

        # Applying gradients
        optimizer.step()

        epoch_loss += loss.item()

        # Predicted class = index of the highest score per sample.
        _,pred = torch.max(outputs,dim=1)
        epoch_true = epoch_true + torch.sum(pred == target_).item()

        epoch_total += target_.size(0)

    TRAIN_LOSSES.append(epoch_loss)
    TRAIN_ACCURACIES.append(100 * epoch_true / epoch_total)

    print(f"Epoch {epoch}/{EPOCHS} finished: train_loss = {epoch_loss}, train_accuracy = {TRAIN_ACCURACIES[epoch-1]}")

Epoch 1/10 finished: train_loss = 18.99882197380066, train_accuracy = 9.153952843273231
Epoch 2/10 finished: train_loss = 17.033807277679443, train_accuracy = 12.89875173370319
Epoch 3/10 finished: train_loss = 13.452606201171875, train_accuracy = 47.156726768377254
Epoch 4/10 finished: train_loss = 9.325356245040894, train_accuracy = 65.3259361997226
Epoch 5/10 finished: train_loss = 5.637182295322418, train_accuracy = 84.60471567267683
Epoch 6/10 finished: train_loss = 3.058367758989334, train_accuracy = 89.18169209431345
Epoch 7/10 finished: train_loss = 1.9039355516433716, train_accuracy = 91.67822468793342
Epoch 8/10 finished: train_loss = 1.36634761095047, train_accuracy = 96.11650485436893
Epoch 9/10 finished: train_loss = 0.8673368692398071, train_accuracy = 98.47434119278779
Epoch 10/10 finished: train_loss = 0.41291163116693497, train_accuracy = 99.30651872399446


In [26]:
test_true = 0                    # number of correctly classified test samples
test_total = len(test_sampler)   # number of held-out samples
test_loss = 0.0                  # sum of the per-batch mean losses

# Switch to evaluation mode so the dropout layers are disabled; otherwise the
# reported metrics are computed on randomly perturbed activations.
model.eval()
with torch.no_grad():
    for data_,target_ in validation_loader:
        data_,target_ = data_.to(device),target_.to(device)

        outputs = model(data_)

        loss = criterion(outputs,target_).item()

        # Predicted class = index of the highest score per sample.
        _,pred = torch.max(outputs,dim=1)

        test_true += torch.sum(pred==target_).item()
        test_loss += loss


print(f"Validation finished: Accuracy = {round(100 * test_true / test_total,2)}%, Loss = {test_loss}")

Validation finished: Accuracy = 99.17%, Loss = 0.23698172718286514
