In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
from tensorflow import random
from numpy.random import seed
random.set_seed(2)
seed(1)
import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
#Configuration parameters

#Add header in the company-corpus
headers = ["Description"]

#Number of rows to be selected from corpus    
nrows=1000 #for testing set 100, but actually it should be 1000

#corpus file path
corpus_file_path = '/content/drive/MyDrive/nlp-job-generator/app/main/resources/data/jd_company_corpus_v1.0.csv'
   
all_headlines = pd.read_csv(corpus_file_path, names = headers, sep = '\t',nrows = nrows)       

#Print lenght of all headlines from corpus
len(all_headlines)

1000

In [6]:
all_headlines

Unnamed: 0,Description
0,Learn more about Splunk careers and how you ca...
1,The Data Scientist role involves working on al...
2,Experience using one or more advanced analytic...
3,Experience with one or more data storage and m...
4,Demonstrates the ability to transform ambiguou...
...,...
995,Partner with your product and development peer...
996,Manage testing and debugging of analytics/tagg...
997,Assist in reporting and ad-hoc analysis of dat...
998,"Assist peer groups in the development of, and ..."


In [7]:
#List description text from corpus
corpus_header = "Description"
description_texts = list(set(all_headlines[corpus_header]))
print ("Description Text:\n")
print (description_texts)

Description Text:

['Must have a minimum of 5 years‚Äô experience with large-scale data manipulation, analytic tools, and data visualization', 'MS in Applied Mathematics, Statistics, or Computer Science - PhD desired', 'Strong research interest and experience with design of experiments, randomized control trials, and inference, particularly aspects of high throughput testing such as multiple hypothesis testing, sequential testing, robustness, data mining of experiments, ', 'Experience with clickstream tools including Adobe Analytics / Omniture, Google Analytics or Optimizely', 'Masters Degree in Statistics, Marketing Analytics, or other Quantitative fields is highly preferable, ', '2+ years of experience with one or more scripting or scientific languages, including Python, R, C++ or Java', 'Common NLP techniques, such as, ', 'Working knowledge of Tableau ‚Äì a plus', 'SQL', 'Experience to analyze data to identify deliverables, gaps, and inconsistencies]"', 'Retrieve and analyze data fr

In [8]:
len(description_texts)

955

In [None]:
#Clear the corpus text by removing any punctuation marks etc
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [x for x in description_texts]
corpus[:100]

In [None]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
# inp_sequences[:10]
inp_sequences

In [11]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
print(max_sequence_len)

1129


In [13]:
from keras.layers import Activation, Embedding, Masking, Dense, SimpleRNN, Dropout
from keras.models import Sequential
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    model.add(SimpleRNN(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

rnn_company_model = create_model(max_sequence_len, total_words)
rnn_company_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1128, 10)          45870     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 100)               11100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 4587)              463287    
Total params: 520,257
Trainable params: 520,257
Non-trainable params: 0
_________________________________________________________________


In [14]:
rnn_company_model.fit(predictors, label, epochs=25, verbose=5)


Epoch 1/25


KeyboardInterrupt: ignored

In [None]:
from keras.models import load_model
model_path = '/content/drive/MyDrive/nlp-job-generator/app/main/resources/models/rnn/rnn_company_model.h5'
rnn_model.save(model_path, overwrite=True, include_optimizer=True)  # creates a HDF5 file 'my_model.h5'

In [None]:
!pip install pydantic

Collecting pydantic
[?25l  Downloading https://files.pythonhosted.org/packages/2b/a3/0ffdb6c63f45f10d19b8e8b32670b22ed089cafb29732f6bf8ce518821fb/pydantic-1.8.1-cp37-cp37m-manylinux2014_x86_64.whl (10.1MB)
[K     |████████████████████████████████| 10.1MB 8.0MB/s 
Installing collected packages: pydantic
Successfully installed pydantic-1.8.1


In [None]:
from pydantic import BaseModel

class requestObject(BaseModel):
  seedText: str
  nextWords: int

In [None]:
def generateText(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
print (generateText("Monitor operations", 100, rnn_model, max_sequence_len))


Monitor Operations Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Stunning Stunning Standing Improvement Improvement World'S Must Symbolic Symbolic Symbolic Symbolic Symbolic Symbolic Accelerator Symbolic Symbolic Symbolic Symbolic World'S Verbal World'S Symbolic Speaking Symbolic Solid Institute Facilitating Symbolic Strives Allowed Crime Possession Allowed Dtr Supports Symbolic Designers Fortune‚Äôs Institute Exceptional Specialists Strives Monitoring Symbolic 123 Platforms Typically Groups Beyond Symbolic Scorecard Spec World'S Protocols Verbal Different Outcomes Charts Geo Made Linc‚Äôs Changing Paced Omniture Picture Offices Receipt Recommendationsdrives Speaking Suite Monitoring Architect Complex Omniture Appreciation Optoro'S Adobe Degree Exhibits Sponsored Observational Countries Matters Drinks Perceive Whether Hypothesis Symbolic Oracle Id Nyc Normal Emerging


In [None]:
#Install colabcode and fastapi
!pip install colabcode
!pip install fastapi

In [None]:
#import libraries for loading saved model, fast api, colabcode
import tensorflow as tf
from tensorflow import keras
from colabcode import ColabCode
from fastapi import FastAPI
from keras.models import load_model

import logging
from fastapi import FastAPI

app=FastAPI(title="NlpJdGeneratorAPI", description="NLP based RNN model Job Description Generator")

#initializing logging
my_logger = logging.getLogger()
my_logger.setLevel(logging.DEBUG)
logging.basicConfig(level=logging.DEBUG, filename='rnn_logs.log')

#Initalize lstml model to load and model file path
rnn_company_model = None
rnn_company_model_path = '/content/drive/MyDrive/nlp-job-generator/app/main/resources/models/rnn/rnn_company_model.h5'
max_sequence_length = 1000

@app.on_event("startup")
#Returns a compiled model identical to the saved after trained
def load_saved_model():
  global rnn_company_model
  rnn_company_model = tf.keras.models.load_model(rnn_model_path)

@app.post("/api")
async def getJobDescription(request:requestObject):
  try:
    print(request)
    my_logger.debug("request:", request)

    prediction = generateText(request.seedText, request.nextWords, rnn_company_model, max_sequence_length)
    my_logger.debug("prediction:", prediction)
    print(prediction)

    return {"job_description" : prediction}
  except:
    my_logger.error("Someting went wrong!")
    return {"prediction": "error"}

In [None]:
from colabcode import ColabCode
from fastapi import FastAPI
cc = ColabCode(port=1200, code=False, authtoken="1qhOBp2p5qxw80yQipxR0JHwMbl_5choeHxGXkp6HVXZ66hTh")
cc.run_app(app=app)