In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,Bidirectional, LSTM, Dropout, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
from tensorflow import random
from numpy.random import seed
random.set_seed(2)
seed(1)
import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
#Configuration parameters

#Add header in the roles-responsibility-corpus
headers = ["Description"]

#Number of rows to be selected from corpus
nrows=1000 #for testing set 100, but actually it should be 1000

#corpus file path
corpus_file_path = '/content/drive/MyDrive/nlp-job-generator/app/main/resources/data/jd_roles_responsibility_corpus_v1.0.csv'
       
#all_headlines = pd.read_csv("test_res.csv", names = headers, sep='\t',nrows= 1000)       
all_headlines = pd.read_csv(corpus_file_path, names = headers, sep = '\t',nrows = nrows)       

#Print lenght of all headlines from corpus
len(all_headlines)

1000

In [None]:
all_headlines

In [7]:
all_headlines.shape

(1000, 1)

In [9]:
#List description text from corpus
corpus_header = "Description"
description_texts = list(set(all_headlines[corpus_header]))
print ("Description Text:\n")
print (description_texts)

Description Text:



In [10]:
#Print description text length
len(description_texts)

978

In [None]:
#Clear the corpus text by removing any punctuation marks etc
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [x for x in description_texts]
corpus[:100]

In [None]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
# inp_sequences[:10]
inp_sequences

In [13]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
print(max_sequence_len)

1000


In [14]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

lstm_model = create_model(max_sequence_len, total_words)

lstm_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 999, 10)           43360     
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 4336)              437936    
Total params: 525,696
Trainable params: 525,696
Non-trainable params: 0
_________________________________________________________________


In [16]:
lstm_model.fit(predictors, label, epochs=50, verbose=5)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f5e702d9090>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
from keras.models import load_model
model_path = '/content/drive/MyDrive/nlp-job-generator/app/main/resources/models/lstm/lstm_model.h5'
lstm_model.save(model_path, overwrite=True, include_optimizer=True)  # creates a HDF5 file 'my_model.h5'

In [18]:
!pip install pydantic

Collecting pydantic
[?25l  Downloading https://files.pythonhosted.org/packages/2b/a3/0ffdb6c63f45f10d19b8e8b32670b22ed089cafb29732f6bf8ce518821fb/pydantic-1.8.1-cp37-cp37m-manylinux2014_x86_64.whl (10.1MB)
[K     |████████████████████████████████| 10.1MB 22.4MB/s 
Installing collected packages: pydantic
Successfully installed pydantic-1.8.1


In [19]:
from pydantic import BaseModel

class LstmJdGenerator(BaseModel):
  seedText: str
  nextWords: int

In [20]:
#Define function to generate text based on seed_text, number of words in the text, 
#trained model and max sequence length
def generateText(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text.title()

In [21]:
#Generate and print text based on sample inputs
job_description = generateText("Strong Organization & Project Management Skills", 110, lstm_model, max_sequence_len)
print(job_description)


Strong Organization & Project Management Skills And Or Commercial Full Time Time Collaborative Environment Company Company And Support To Support The Data Scientist To Be Responsible For The Data Acquisition And The Data Acquisition And Analytics Of The Data Scientist Will Be Starting On The Data Science Team You Will Be Part Of The Data Science Team And Supports Enterprise Data Science And Analytics Statistics Data Science Data Science Or Data Science And Analytics Capabilities In The Ability To Collect Organize And Disseminate Significant Amounts Of Information From Attention To Detail And Accuracy Ability To Analyze And Prioritize Technical Problems And Data Architecture To Generate Data Sources And Support Of 3Rd Party And Or Model Data


In [22]:
#Install colabcode and fastapi
!pip install colabcode
!pip install fastapi

Collecting colabcode
  Downloading https://files.pythonhosted.org/packages/33/f4/5c69125dd58eb86b1c668bf2a449120750b984d03bbcfd583ffcf3bfef48/colabcode-0.2.0-py3-none-any.whl
Collecting nest-asyncio==1.4.3
  Downloading https://files.pythonhosted.org/packages/5c/33/10805a3359f56ac4f3b520e64b9d5e6a288d87be95777b8023c64cba60f1/nest_asyncio-1.4.3-py3-none-any.whl
Collecting jupyterlab==3.0.7
[?25l  Downloading https://files.pythonhosted.org/packages/90/27/149c258b8e80552ba1ad35636eca308776a284cb151cb8fcfff70adfbd0a/jupyterlab-3.0.7-py3-none-any.whl (8.3MB)
[K     |████████████████████████████████| 8.3MB 13.7MB/s 
[?25hCollecting uvicorn==0.13.1
[?25l  Downloading https://files.pythonhosted.org/packages/ef/67/546c35e9fffb585ea0608ba3bdcafe17ae402e304367203d0b08d6c23051/uvicorn-0.13.1-py3-none-any.whl (45kB)
[K     |████████████████████████████████| 51kB 7.5MB/s 
[?25hCollecting pyngrok>=5.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/6b/4e/a2fe095bbe17cf26424c4abcd2

Collecting fastapi
[?25l  Downloading https://files.pythonhosted.org/packages/9f/33/1b643f650688ad368983bbaf3b0658438038ea84d775dd37393d826c3833/fastapi-0.63.0-py3-none-any.whl (50kB)
[K     |████████████████████████████████| 51kB 6.6MB/s 
[?25hCollecting starlette==0.13.6
[?25l  Downloading https://files.pythonhosted.org/packages/c5/a4/c9e228d7d47044ce4c83ba002f28ff479e542455f0499198a3f77c94f564/starlette-0.13.6-py3-none-any.whl (59kB)
[K     |████████████████████████████████| 61kB 7.9MB/s 
Installing collected packages: starlette, fastapi
Successfully installed fastapi-0.63.0 starlette-0.13.6


In [18]:
#import libraries for loading saved model, fast api, colabcode
import tensorflow as tf
from tensorflow import keras
from colabcode import ColabCode
from fastapi import FastAPI
from keras.models import load_model

import logging
from fastapi import FastAPI

app=FastAPI(title="NlpJdGeneratorAPI", description="NLP based LSTM model Job Description Generator")

#initializing logging
my_logger = logging.getLogger()
my_logger.setLevel(logging.DEBUG)
logging.basicConfig(level=logging.DEBUG, filename='logs.log')

#Initalize lstml model to load and model file path
lstm_model_loaded = None
model_path = '/content/drive/MyDrive/nlp-job-generator/app/main/resources/models/lstm/lstm_model.h5'
max_sequence_length = 1000

@app.on_event("startup")
#Returns a compiled model identical to the saved after trained
def load_saved_model():
  global lstm_model_loaded
  lstm_model_loaded = tf.keras.models.load_model(model_path)

@app.post("/api")
async def getJobDescription(inputData:LstmJdGenerator):
  try:
    print(inputData)
    my_logger.debug("inputData:", inputData)

    prediction = generateText(inputData.seedText, inputData.nextWords, lstm_model_loaded, max_sequence_length)
    my_logger.debug("prediction:", prediction)
    print(prediction)

    return {"job_description" : prediction}
  except:
    my_logger.error("Someting went wrong!")
    return {"prediction": "error"}

In [None]:
from colabcode import ColabCode
from fastapi import FastAPI
cc = ColabCode(port=1200, code=False, authtoken="1qhOBp2p5qxw80yQipxR0JHwMbl_5choeHxGXkp6HVXZ66hTh")
cc.run_app(app=app)