In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
from tensorflow import random
from numpy.random import seed
random.set_seed(2)
seed(1)
import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
#Configuration parameters

#Add header in the roles-responsibility-corpus
headers = ["Description"]

#Number of rows to be selected from corpus    
nrows=1000 #for testing set 100, but actually it should be 1000

#corpus file path
corpus_file_path = '/content/drive/MyDrive/nlp-job-generator/app/main/resources/data/jd_roles_responsibility_corpus_v0.9.csv'
   
all_headlines = pd.read_csv(corpus_file_path, names = headers, sep = '\t',nrows = nrows)       

#Print lenght of all headlines from corpus
len(all_headlines)

1000

In [4]:
all_headlines

Unnamed: 0,Description
0,"Working within an agile environment, the Senio..."
1,Define & implement Policies & SOPs
2,"Monitor operations, develop and report quality..."
3,", Data Services"
4,Data Dictionary
...,...
995,Expert in building data processing systems wit...
996,Proficient in designing efficient and robust E...
997,Proficiency in PySpark
998,Good knowledge of SQL and MS Excel


In [5]:
#List description text from corpus
corpus_header = "Description"
description_texts = list(set(all_headlines[corpus_header]))
print ("Description Text:\n")
print (description_texts)

Description Text:



In [6]:
len(description_texts)

978

In [7]:
#Clear the corpus text by removing any punctuation marks etc
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [x for x in description_texts]
corpus[:100]

['Marketing analytics experience a huge plus!, ',
 'Conduct data analysis and analytics using multiple techniques and approaches - Descriptive, Diagnostic, Predictive and/or Prescriptive analytics.Communicate effectively with both technical and nontechnical stakeholders.Own deliverables from ideation to operating in production, i.e., ‚ÄúIf you build it, you run it.‚Äù, ',
 'Bachelors Degree and 5+ years of work experience',
 'Research and develop Deep learning architectures and algorithms',
 'Collaborate with regional POCs to understand KPI and data requirements',
 'Strong interpersonal skills and ability to interact seamlessly with all levels of management',
 'Excellent oral and written communication skills with the ability to manage multiple projects simultaneously',
 'Maintaining a list of CU system users according to a pre-identified number of allocated CU system licenses for each eComm application',
 'Develop and lead projects that produce substantive, grounded casework, ',
 'Docu

In [8]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
# inp_sequences[:10]
inp_sequences

[[185, 19],
 [185, 19, 8],
 [185, 19, 8, 9],
 [185, 19, 8, 9, 1599],
 [185, 19, 8, 9, 1599, 126],
 [276, 3],
 [276, 3, 20],
 [276, 3, 20, 1],
 [276, 3, 20, 1, 19],
 [276, 3, 20, 1, 19, 52],
 [276, 3, 20, 1, 19, 52, 115],
 [276, 3, 20, 1, 19, 52, 115, 92],
 [276, 3, 20, 1, 19, 52, 115, 92, 1],
 [276, 3, 20, 1, 19, 52, 115, 92, 1, 437],
 [276, 3, 20, 1, 19, 52, 115, 92, 1, 437, 622],
 [276, 3, 20, 1, 19, 52, 115, 92, 1, 437, 622, 2286],
 [276, 3, 20, 1, 19, 52, 115, 92, 1, 437, 622, 2286, 129],
 [276, 3, 20, 1, 19, 52, 115, 92, 1, 437, 622, 2286, 129, 1],
 [276, 3, 20, 1, 19, 52, 115, 92, 1, 437, 622, 2286, 129, 1, 10],
 [276, 3, 20, 1, 19, 52, 115, 92, 1, 437, 622, 2286, 129, 1, 10, 1600],
 [276, 3, 20, 1, 19, 52, 115, 92, 1, 437, 622, 2286, 129, 1, 10, 1600, 19],
 [276,
  3,
  20,
  1,
  19,
  52,
  115,
  92,
  1,
  437,
  622,
  2286,
  129,
  1,
  10,
  1600,
  19,
  224],
 [276,
  3,
  20,
  1,
  19,
  52,
  115,
  92,
  1,
  437,
  622,
  2286,
  129,
  1,
  10,
  1600,
  19,
  22

In [10]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
print(max_sequence_len)

1000


In [11]:
from keras.layers import Activation, Embedding, Masking, Dense, SimpleRNN, Dropout
from keras.models import Sequential
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    model.add(SimpleRNN(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

rnn_model = create_model(max_sequence_len, total_words)
rnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 999, 10)           43360     
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 100)               11100     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 4336)              437936    
Total params: 492,396
Trainable params: 492,396
Non-trainable params: 0
_________________________________________________________________


In [None]:
rnn_model.fit(predictors, label, epochs=10, verbose=5)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25


In [12]:
from keras.models import load_model
model_path = '/content/drive/MyDrive/nlp-job-generator/app/main/resources/models/rnn/rnn_model.h5'
rnn_model.save(model_path, overwrite=True, include_optimizer=True)  # creates a HDF5 file 'my_model.h5'

In [14]:
!pip install pydantic

Collecting pydantic
[?25l  Downloading https://files.pythonhosted.org/packages/2b/a3/0ffdb6c63f45f10d19b8e8b32670b22ed089cafb29732f6bf8ce518821fb/pydantic-1.8.1-cp37-cp37m-manylinux2014_x86_64.whl (10.1MB)
[K     |████████████████████████████████| 10.1MB 8.0MB/s 
Installing collected packages: pydantic
Successfully installed pydantic-1.8.1


In [15]:
from pydantic import BaseModel

class requestObject(BaseModel):
  seedText: str
  nextWords: int

In [16]:
def generateText(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [17]:
print (generateText("Monitor operations", 100, rnn_model, max_sequence_len))


Monitor Operations Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Stunning Stunning Standing Improvement Improvement World'S Must Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Accelerator Symbolic Symbolic Symbolic Symbolic World'S Verbal World'S Symbolic Speaking Symbolic Solid Institute Facilitating Symbolic Strives Allowed Crime Possession Allowed Dtr Supports Symbolic Designers Fortune‚Äôs Institute Exceptional Specialists Strives Monitoring Symbolic 123 Platforms Typically Groups Beyond Symbolic Scorecard Spec World'S Protocols Verbal Different Outcomes Charts Geo Made Linc‚Äôs Changing Paced Omniture Picture Offices Receipt Recommendationsdrives Speaking Suite Monitoring Architect Complex Omniture Appreciation Optoro'S Adobe Degree Exhibits Sponsored Observational Countries Matters Drinks Perceive Whether Hypothesis Symbolic Oracle Id Nyc Normal Emerging


In [None]:
#Install colabcode and fastapi
!pip install colabcode
!pip install fastapi

In [20]:
#import libraries for loading saved model, fast api, colabcode
import tensorflow as tf
from tensorflow import keras
from colabcode import ColabCode
from fastapi import FastAPI
from keras.models import load_model

import logging
from fastapi import FastAPI

app=FastAPI(title="NlpJdGeneratorAPI", description="NLP based RNN model Job Description Generator")

#initializing logging
my_logger = logging.getLogger()
my_logger.setLevel(logging.DEBUG)
logging.basicConfig(level=logging.DEBUG, filename='rnn_logs.log')

#Initalize lstml model to load and model file path
lstm_model_loaded = None
rnn_model_path = '/content/drive/MyDrive/nlp-job-generator/app/main/resources/models/rnn/rnn_model.h5'
max_sequence_length = 1000

@app.on_event("startup")
#Returns a compiled model identical to the saved after trained
def load_saved_model():
  global rnn_model_loaded
  rnn_model_loaded = tf.keras.models.load_model(rnn_model_path)

@app.post("/api")
async def getJobDescription(request:requestObject):
  try:
    print(request)
    my_logger.debug("request:", request)

    prediction = generateText(request.seedText, request.nextWords, rnn_model_loaded, max_sequence_length)
    my_logger.debug("prediction:", prediction)
    print(prediction)

    return {"job_description" : prediction}
  except:
    my_logger.error("Someting went wrong!")
    return {"prediction": "error"}

In [None]:
from colabcode import ColabCode
from fastapi import FastAPI
cc = ColabCode(port=1200, code=False, authtoken="1qhOBp2p5qxw80yQipxR0JHwMbl_5choeHxGXkp6HVXZ66hTh")
cc.run_app(app=app)

Public URL: NgrokTunnel: "http://9a4c0182b439.ngrok.io" -> "http://localhost:1200"


INFO:     Started server process [58]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:1200 (Press CTRL+C to quit)


INFO:     2409:4071:210e:60da:7106:94b7:c69c:3347:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     2409:4071:210e:60da:7106:94b7:c69c:3347:0 - "GET /openapi.json HTTP/1.1" 200 OK
INFO:     182.79.221.199:0 - "GET /docs HTTP/1.1" 200 OK
seedText='Monitor operations' nextWords=100


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.7/logging/__init__.py", line 1025, in emit
    msg = self.format(record)
  File "/usr/lib/python3.7/logging/__init__.py", line 869, in format
    return fmt.format(record)
  File "/usr/lib/python3.7/logging/__init__.py", line 608, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.7/logging/__init__.py", line 369, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 845, in launch_instance
    app.start()
  File "/usr/local/lib/pytho

Monitor Operations Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Stunning Stunning Standing Improvement Improvement World'S Must Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Accelerator Symbolic Symbolic Symbolic Symbolic World'S Verbal World'S Symbolic Speaking Symbolic Solid Institute Facilitating Symbolic Strives Allowed Crime Possession Allowed Dtr Supports Symbolic Designers Fortune‚Äôs Institute Exceptional Specialists Strives Monitoring Symbolic 123 Platforms Typically Groups Beyond Symbolic Scorecard Spec World'S Protocols Verbal Different Outcomes Charts Geo Made Linc‚Äôs Changing Paced Omniture Picture Offices Receipt Recommendationsdrives Speaking Suite Monitoring Architect Complex Omniture Appreciation Optoro'S Adobe Degree Exhibits Sponsored Observational Countries Matters Drinks Perceive Whether Hypothesis Symbolic Oracle Id Nyc Normal Emerging
INFO:     2409:4071:210e:60da:7106:94b7:c69c:3347:0 - "POST /api HTTP/1.1" 200 OK


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.7/logging/__init__.py", line 1025, in emit
    msg = self.format(record)
  File "/usr/lib/python3.7/logging/__init__.py", line 869, in format
    return fmt.format(record)
  File "/usr/lib/python3.7/logging/__init__.py", line 608, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.7/logging/__init__.py", line 369, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 845, in launch_instance
    app.start()
  File "/usr/local/lib/pytho

seedText='Monitor operations' nextWords=100


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.7/logging/__init__.py", line 1025, in emit
    msg = self.format(record)
  File "/usr/lib/python3.7/logging/__init__.py", line 869, in format
    return fmt.format(record)
  File "/usr/lib/python3.7/logging/__init__.py", line 608, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.7/logging/__init__.py", line 369, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 845, in launch_instance
    app.start()
  File "/usr/local/lib/pytho

Monitor Operations Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Stunning Stunning Standing Improvement Improvement World'S Must Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Accelerator Symbolic Symbolic Symbolic Symbolic World'S Verbal World'S Symbolic Speaking Symbolic Solid Institute Facilitating Symbolic Strives Allowed Crime Possession Allowed Dtr Supports Symbolic Designers Fortune‚Äôs Institute Exceptional Specialists Strives Monitoring Symbolic 123 Platforms Typically Groups Beyond Symbolic Scorecard Spec World'S Protocols Verbal Different Outcomes Charts Geo Made Linc‚Äôs Changing Paced Omniture Picture Offices Receipt Recommendationsdrives Speaking Suite Monitoring Architect Complex Omniture Appreciation Optoro'S Adobe Degree Exhibits Sponsored Observational Countries Matters Drinks Perceive Whether Hypothesis Symbolic Oracle Id Nyc Normal Emerging
INFO:     2409:4071:210e:60da:7106:94b7:c69c:3347:0 - "POST /api HTTP/1.1" 200 OK


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.7/logging/__init__.py", line 1025, in emit
    msg = self.format(record)
  File "/usr/lib/python3.7/logging/__init__.py", line 869, in format
    return fmt.format(record)
  File "/usr/lib/python3.7/logging/__init__.py", line 608, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.7/logging/__init__.py", line 369, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 845, in launch_instance
    app.start()
  File "/usr/local/lib/pytho