In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
from tensorflow import random
from numpy.random import seed
random.set_seed(2)
seed(1)
import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
from google.colab import files
from google.colab import drive
drive.mount('/content/gdrive/')
headers = ["Description"]
       
all_headlines = pd.read_csv("/content/gdrive/My Drive/jd_role_res_corpus_V1.csv", names = headers, sep='\t',nrows= 100)       
#all_headlines = pd.read_csv("test_res.csv", names = headers, sep='\t',nrows=100)
#all_headlines = pd.read_csv("clean_desc.csv", sep='\t',nrows=100)

#clean_desc.csv
# # all_headlines.extend(list(article_df.Description.values))
# all_headlines = [h for h in all_headlines if h != "Unknown"]
len(all_headlines)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


100

In [4]:
all_headlines

Unnamed: 0,Description
0,"Working within an agile environment, the Senio..."
1,Define & implement Policies & SOPs
2,"Monitor operations, develop and report quality..."
3,", Data Services"
4,Data Dictionary
...,...
95,Demonstrated ability to participate in require...
96,Excellent written and oral communication skills
97,","
98,"THE FOLLOWING IS DESIRED, BUT NOT REQUIRED TO ..."


In [5]:
texts = list(set(all_headlines['Description']))
print (texts)

['Attention to detail', ', Promotes and behaves in a fashion to support GeoDigital‚Äôs Cultural Pillars', 'Monitor operations, develop and report quality metrics to key stakeholders', 'Ensure data integrity across several data sources, reports, and dashboards', ', Work Environment, Works in a fast paced office environment with multiple priorities and competing demands; potential set-backs in project completion due to internal or external issues, resourcing and re-allocation.Comp Science, Math, Engineering) or related experience', 'Build and maintain data pipelines', 'Planimetric collection - Using CAD software to create vectors for mapping purposes', 'Data Lineage', 'Identify and recommend appropriate measures to manage and remediate vulnerabilities and reduce potential impacts on information resources to a level acceptable to the senior management of the company', 'Good knowledge of Python and/or R', 'Successful experience working in a fast-paced start-up work environment a plus', 'Ba

In [6]:
len(texts)

99

In [7]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [x for x in texts]
corpus[:100]

['Attention to detail',
 ', Promotes and behaves in a fashion to support GeoDigital‚Äôs Cultural Pillars',
 'Monitor operations, develop and report quality metrics to key stakeholders',
 'Ensure data integrity across several data sources, reports, and dashboards',
 ', Work Environment, Works in a fast paced office environment with multiple priorities and competing demands; potential set-backs in project completion due to internal or external issues, resourcing and re-allocation.Comp Science, Math, Engineering) or related experience',
 'Build and maintain data pipelines',
 'Planimetric collection - Using CAD software to create vectors for mapping purposes',
 'Data Lineage',
 'Identify and recommend appropriate measures to manage and remediate vulnerabilities and reduce potential impacts on information resources to a level acceptable to the senior management of the company',
 'Good knowledge of Python and/or R',
 'Successful experience working in a fast-paced start-up work environment a 

In [13]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
# inp_sequences[:10]
inp_sequences

[[217, 2],
 [217, 2, 218],
 [386, 1],
 [386, 1, 387],
 [386, 1, 387, 6],
 [386, 1, 387, 6, 11],
 [386, 1, 387, 6, 11, 388],
 [386, 1, 387, 6, 11, 388, 2],
 [386, 1, 387, 6, 11, 388, 2, 56],
 [386, 1, 387, 6, 11, 388, 2, 56, 389],
 [386, 1, 387, 6, 11, 388, 2, 56, 389, 390],
 [386, 1, 387, 6, 11, 388, 2, 56, 389, 390, 391],
 [392, 40],
 [392, 40, 34],
 [392, 40, 34, 1],
 [392, 40, 34, 1, 137],
 [392, 40, 34, 1, 137, 19],
 [392, 40, 34, 1, 137, 19, 21],
 [392, 40, 34, 1, 137, 19, 21, 2],
 [392, 40, 34, 1, 137, 19, 21, 2, 13],
 [392, 40, 34, 1, 137, 19, 21, 2, 13, 219],
 [96, 3],
 [96, 3, 393],
 [96, 3, 393, 220],
 [96, 3, 393, 220, 394],
 [96, 3, 393, 220, 394, 3],
 [96, 3, 393, 220, 394, 3, 41],
 [96, 3, 393, 220, 394, 3, 41, 57],
 [96, 3, 393, 220, 394, 3, 41, 57, 1],
 [96, 3, 393, 220, 394, 3, 41, 57, 1, 138],
 [16, 25],
 [16, 25, 221],
 [16, 25, 221, 6],
 [16, 25, 221, 6, 11],
 [16, 25, 221, 6, 11, 139],
 [16, 25, 221, 6, 11, 139, 140],
 [16, 25, 221, 6, 11, 139, 140, 222],
 [16, 25,

In [14]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [17]:
from keras.layers import Activation, Masking, Dense, SimpleRNN
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Dropout
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    model.add(SimpleRNN(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 660, 10)           9970      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 100)               11100     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 997)               100697    
Total params: 121,767
Trainable params: 121,767
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.fit(predictors, label, epochs=50, verbose=5)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f926c886e90>

In [19]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [20]:
print (generate_text("Monitor operations", 100, model, max_sequence_len))


Monitor Operations Experience In A Latest In Geography Similarly Identify Two The Following Identify You'Ll Be Doing In A Quality And Or Within Systems Organization To Create Vectors For The Ta Or Or Usma Opportunity In A Next Roadmap Proposing What And Technology Opportunities For P G For The Ta Or Or Broadly Work Or Other Available The Product To Be Considered For The Following Four Or Usma Issues Of The City Of Customers Served Approximately 10 Week Summer Internship From May June To August 2019 Was App Snowflake Dbt And Tableauensure Compliance With Others Leads August 2019 Cadd Add Summer Internship


In [23]:
from sklearn.feature_extraction.text import CountVectorizer

def Convert(string):
    li = list(string.split(" "))
    return li
  
# Driver code  
docs = generate_text("Monitor operations", 50, model, max_sequence_len)  
docs1 = Convert(docs)

print(type(docs1))
docs2 = texts[:3]
print(type(docs2))

print(docs1)
print(docs2)

<class 'list'>
<class 'list'>
['Monitor', 'Operations', 'Experience', 'In', 'A', 'Latest', 'In', 'Geography', 'Similarly', 'Identify', 'Two', 'The', 'Following', 'Identify', "You'Ll", 'Be', 'Doing', 'In', 'A', 'Quality', 'And', 'Or', 'Within', 'Systems', 'Organization', 'To', 'Create', 'Vectors', 'For', 'The', 'Ta', 'Or', 'Or', 'Usma', 'Opportunity', 'In', 'A', 'Next', 'Roadmap', 'Proposing', 'What', 'And', 'Technology', 'Opportunities', 'For', 'P', 'G', 'For', 'The', 'Ta', 'Or', 'Or']
['Attention to detail', ', Promotes and behaves in a fashion to support GeoDigital‚Äôs Cultural Pillars', 'Monitor operations, develop and report quality metrics to key stakeholders']


In [24]:
def word2vec(word):
    from collections import Counter
    from math import sqrt

    # count the characters in word
    cw = Counter(word)
    # precomputes a set of the different characters
    sw = set(cw)
    # precomputes the "length" of the word vector
    lw = sqrt(sum(c*c for c in cw.values()))

    # return a tuple
    return cw, sw, lw

def cosdis(v1, v2):
    # which characters are common to the two words?
    common = v1[1].intersection(v2[1])
    # by definition of cosine distance we have
    return sum(v1[0][ch]*v2[0][ch] for ch in common)/v1[2]/v2[2]


threshold = 0.80     # if needed
for key in docs1:
    for word in docs2:
        try:
            # print(key)
            # print(word)
            res = cosdis(word2vec(word), word2vec(key))
            # print(res)
            print("The cosine similarity between : {} and : {} is: {}".format(word, key, res*100))
            # if res > threshold:
            #     print("Found a word with cosine distance > 80 : {} with original word: {}".format(word, key))
        except IndexError:
            pass

The cosine similarity between : Attention to detail and : Monitor is: 61.904761904761905
The cosine similarity between : , Promotes and behaves in a fashion to support GeoDigital‚Äôs Cultural Pillars and : Monitor is: 48.82403965335685
The cosine similarity between : Monitor operations, develop and report quality metrics to key stakeholders and : Monitor is: 62.13443569605167
The cosine similarity between : Attention to detail and : Operations is: 63.245553203367585
The cosine similarity between : , Promotes and behaves in a fashion to support GeoDigital‚Äôs Cultural Pillars and : Operations is: 67.08203932499369
The cosine similarity between : Monitor operations, develop and report quality metrics to key stakeholders and : Operations is: 74.87722607556702
The cosine similarity between : Attention to detail and : Experience is: 35.714285714285715
The cosine similarity between : , Promotes and behaves in a fashion to support GeoDigital‚Äôs Cultural Pillars and : Experience is: 32.829957