### Introduction


**Problem Statement:** Make a classifier which takes in a job description and gives the department name for it.
*   Use a neural network model
*   Make use of pre-trained word embeddings (example: Word2Vec, GloVe, etc.)
*   Calculate the accuracy on a test set (data not used to train the model)

**Problem Solving Approach:** 
_Provide a brief description of steps you followed for solving this problem_
1. Create a Dataframe by taking the required columns from the given data.
2. Create a word-count (bag-of-words) matrix by performing various operations like tokenizing, stop-word removal, stemming etc. using nltk.
3. Remove the rows having null values.
4. Split the prepared data into train, validation and test set.
5. Train the model using train and validation data.
6. Finally, test the result.

### Part I: Text Preprocessing

_Include all text preprocessing steps like processing of JSON and CSV files & data cleaning in this part._

Import the necessary packages in the cell below

In [None]:
import pandas as pd
import json
import numpy as np
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 

In [None]:
# function to filter the words
def get_filter_words(example_sent):
    """Tokenize a sentence and return the stems of its distinct content words.

    The text is lower-cased and tokenized with nltk's ``word_tokenize``.
    Tokens that are not purely alphabetic, or that are English stop words,
    are discarded; repeated words are collapsed; every remaining word is
    Porter-stemmed.

    Args:
        example_sent: The text to process.

    Returns:
        A list holding one stem per distinct surviving word. The order
        follows set iteration and is therefore arbitrary. De-duplication
        happens before stemming, so the same stem can occur more than once
        when different words share it.
    """
    english_stops = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    tokens = word_tokenize(example_sent.lower())

    # Keep alphabetic, non-stop-word tokens; the set drops repeated words.
    distinct_words = {
        token
        for token in tokens
        if token.isalpha() and token not in english_stops
    }

    return [stemmer.stem(word) for word in distinct_words]


In [None]:
from collections import Counter

# Mapping of document id -> department name.
depart_doc=pd.read_csv('data/document_departments.csv')

# One [department, description] record per document.
li=[]
m=depart_doc["Document ID"]

for doc_id, department in zip(m, depart_doc["Department"]):
    st='data/docs/'+str(doc_id)+".json"

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(st, encoding="utf-8") as f:
        dat = json.load(f)

    des=dat["jd_information"]["description"]
    # An empty description carries no text to classify: record it as missing
    # so that dropna() below removes the row.
    if des=="":
        des=None
    li.append([department, des])


# NOTE: the "ID" column holds the department name, not the document id.
data=pd.DataFrame(columns=["ID","Description"],data=li)

# Keep only the documents that actually have a description.
dataset=data.dropna()

# Stem every description (one list of stems per document) and collect the
# vocabulary over all documents.
rows=[get_filter_words(text) for text in dataset["Description"]]
columns=set()
for stems in rows:
    columns.update(stems)

# Count the stems of each document once instead of rescanning every document
# for every vocabulary entry. A Counter returns 0 for absent stems.
stem_counts=[Counter(stems) for stems in rows]

# Sorted vocabulary -> the column order of the feature table (and therefore of
# the model inputs) is the same on every run instead of following set order.
dictn={wrd: [counts[wrd] for counts in stem_counts] for wrd in sorted(columns)}


df=pd.DataFrame(data=dictn)
# Integer class label per row; must stay the LAST column because the
# modelling code treats the last column as the target.
df["Category"],_=pd.factorize(dataset["ID"])

# Written under data/ because the modelling cell reads
# 'data/spot_mod_data.csv'.
df.to_csv(path_or_buf="data/spot_mod_data.csv",index=False)


### Part II: Exploratory Data Analysis

_Include EDA steps like finding distribution of Departments in this part, you may also use plots for EDA._

In [None]:
# Summary statistics per column; the method has to be called, otherwise only
# the bound-method object is displayed.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage; the method has to be
# called, otherwise only the bound-method object is displayed.
df.info()

In [None]:
# Number of documents per department code. size() yields one number per
# category instead of repeating the same count in every vocabulary column.
df.groupby("Category").size()

### Part III: Modelling & Evaluation

_Include all model preparation & evaluation steps in this part._

In [None]:
# Reload the word-count table; its last column is the class label.
dataset=pd.read_csv('data/spot_mod_data.csv')

# drop the columns having count<4
# (the label column is excluded from the sums via iloc[:, :-1])
q1=dataset.iloc[:,:-1].sum(axis=0)
q2=list(q1[q1<4].index)
ds=dataset.drop(columns=q2)

# save the data into csv to get train, validate, test files
# Layout by row position: [0, j) validation | [j, test_start) train |
# [test_start, n_rows) test, with the test set being the last 20% of the rows
# and the validation set 20% of that size.
# NOTE(review): rows are split in file order without shuffling - confirm the
# departments are not grouped in the input.
n_rows=len(ds)
i=int(0.2*n_rows)
j=int(0.2*i)
# Explicit bounds instead of negative slicing: with i == 0, ds[j:-i] would be
# empty and ds[-i:] would be the whole table.
test_start=n_rows-i
ds.iloc[j:test_start].to_csv(path_or_buf="train.csv",index=False)
ds.iloc[:j].to_csv(path_or_buf="validate.csv",index=False)
ds.iloc[test_start:].to_csv(path_or_buf="test.csv",index=False)


In [None]:
import torch
from torch import nn
import torch.utils.data as data
from torch import optim
import numpy as np

#create a class to get the tensor data from csv
class my_points(data.Dataset):
    """Dataset serving the rows of a CSV file as (features, label) tensors.

    Every column except the last one is treated as a feature; the last column
    is the label.
    """

    def __init__(self, filename):
        """Read ``filename`` with pandas and split it into features and label."""
        table = pd.read_csv(filename).values
        features, labels = table[:, :-1], table[:, -1:]
        self.data = features
        # Sliced with -1: so the label stays two-dimensional (one column).
        self.target = labels
        self.n_samples = len(features)

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return self.n_samples

    def __getitem__(self, index):
        """Return row ``index`` as a ``(features, label)`` pair of tensors."""
        point = torch.Tensor(self.data[index])
        label = torch.Tensor(self.target[index])
        return point, label
    
# create the dataloader.
train= my_points('train.csv')
test=my_points('test.csv')
validate=my_points('validate.csv')

batch_size = 20
# Reshuffle the training rows every epoch so SGD does not see the same batches
# in file order each time. The evaluation loaders keep file order: their
# results do not depend on it.
trainloader = data.DataLoader(train,batch_size=batch_size,shuffle=True,num_workers=0)
testloader = data.DataLoader(test,batch_size=batch_size,num_workers=0)
validloader = data.DataLoader(validate,batch_size=batch_size,num_workers=0)


# define a model
# Input width = number of feature columns in the training data, so the network
# keeps matching the vocabulary when the data (and thus the set of retained
# word columns) changes.
n_features = train.data.shape[1]
# Number of output classes (department codes); the evaluation code sizes its
# per-class counters with the same number.
n_classes = 27

# Two hidden ReLU layers with dropout; LogSoftmax output pairs with NLLLoss.
model = nn.Sequential(nn.Linear(n_features, 512),
                      nn.ReLU(),
                      nn.Dropout(0.2),
                      nn.Linear(512, 256),
                      nn.ReLU(),
                      nn.Dropout(0.2),
                      nn.Linear(256, n_classes),
                      nn.LogSoftmax(dim=1)
                      )

criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.03)


n_epochs=1000
# np.inf (lower case): the np.Inf alias was removed in NumPy 2.0.
valid_loss_min = np.inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0

    # train the model
    model.train() # prep model for training (enables dropout)
    for datas, target in trainloader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(datas)
        # NLLLoss needs int64 class indices of shape (batch,); the dataset
        # yields float labels of shape (batch, 1).
        target = target.long()
        target=target.squeeze(1)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss (loss is a batch mean, so weight it by
        # the batch size to get a per-sample sum)
        train_loss += loss.item()*datas.size(0)


    # validate the model
    model.eval() # prep model for evaluation (disables dropout)
    # No gradients are needed for validation; skipping autograd bookkeeping
    # saves memory and time without changing the computed loss.
    with torch.no_grad():
        for data1, target in validloader:
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data1)
            # calculate the loss
            target = target.long()
            target=target.squeeze(1)
            loss = criterion(output, target)
            # update running validation loss
            valid_loss += loss.item()*data1.size(0)

    # print training/validation statistics
    # calculate average loss over an epoch
    train_loss = train_loss/len(trainloader.dataset)
    valid_loss = valid_loss/len(validloader.dataset)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1,
        train_loss,
        valid_loss
        ))

    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        # The whole module is saved (not just a state_dict) because the
        # evaluation code uses the object returned by torch.load directly.
        torch.save(model, 's_model.pt')
        valid_loss_min = valid_loss


In [None]:
#load the model
# The checkpoint holds the whole pickled module (it was written with
# torch.save(model, ...)), so full unpickling must be requested explicitly:
# from PyTorch 2.6 on, torch.load defaults to weights_only=True and rejects
# such files.
# NOTE: unpickling can execute arbitrary code - only load checkpoints that
# were produced locally by this notebook.
model=torch.load('s_model.pt', weights_only=False)

# Test the model

test_loss=0.0

# Per-class counters, indexed by class id: correct predictions / samples seen.
matched=[0]*27
total=[0]*27
with torch.no_grad():
    model.eval()
    for dat,target in testloader:

        output=model(dat)
        # predicted class = index of the largest log-probability
        _,pred=torch.max(output,1)
        # int64 class indices of shape (batch,), as NLLLoss requires
        target = target.long()
        target=target.squeeze(1)
        loss=criterion(output,target)
        # loss is a batch mean; weight by batch size to sum per sample
        test_loss+=loss.item()*dat.size(0)

        for xx, guess in zip(target.tolist(), pred.tolist()):
            if xx==guess:
                matched[xx]+=1
            total[xx]+=1

    test_loss=test_loss/len(test)

print("Test Loss",test_loss)

print("Overall Accuracy:",(sum(matched)/sum(total))*100)

**Results Summary:**
_Provide a brief summary of results obtained like model accuracy & other insights based on EDA & your interpretations_

1. More than 400 rows have a null value, i.e., no description is present.
2. Overall Accuracy is approx 65% 