## **Job Skill Extraction for all Job Market Data in Project**
- This program extracts job skills from all job posts across all job market platforms (LinkedIn, Workopolis, Indeed and Glassdoor).
- This serves as a demonstration of how the job skills are extracted from job posts before using job_skill_extraction_model_prediction.py

In [None]:
!pip install contractions
!pip install bert-for-tf2
!pip install tqdm
!pip install tensorflow_hub



In [None]:
import pandas as pd 
import numpy as np

# Upgrade from TF1 to TF2 for bert implementation
# tensorflow_version 2.8.0
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow import keras
from tensorflow.keras.layers import Input

### Read Job Posts

In [None]:
# Toggle for running on Google Colab. When True, Google Drive is mounted at
# /content/drive so the project folder configured in the next cell is reachable.
colab_used = True
if colab_used:
    # Imported lazily because google.colab only exists inside a Colab runtime.
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder of the project: the Drive location on Colab, the current
# working directory (empty prefix) otherwise.
input_path_colab = "/content/drive/MyDrive/SFU/CMPT733/project/"
input_path_lab = ""

input_path = input_path_colab if colab_used else input_path_lab

# Folder (relative to input_path) holding the scraped job posts.
input_file_folder = "job_market_data/linkedin/"

# CSV of job posts processed by this run.
input_file = "linkedIn_records_2022-03-05 185945.csv"

# Location and file name of the trained weights loaded further below.
# Paths are concatenated with "+", so every folder must end with "/".
input_model_folder = "model/"
input_model = "job_skill_extraction_bert.h5"

### Pre-processing Job Posts

In [None]:
!pip install contractions



In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
import contractions
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

!python -m spacy download en_core_web_lg

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 248 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
import spacy
from spacy import displacy
from collections import Counter
from spacy.matcher import Matcher
from spacy.util import filter_spans
from spacy.tokens import Span
import en_core_web_lg
nlp = en_core_web_lg.load()

In [None]:
def data_preparation_preprocessing(df):
    """Normalise the ``description`` column of the job posts before POS extraction.

    Most cleaning has already been done during web scraping; this step only:
      1. expands contractions and lower-cases the text,
      2. drops every character that is not a word character, whitespace,
         a hyphen or one of ``. ! ? ; , '``,
      3. turns each run of newlines into a single ``.``,
      4. rewrites the scraping artefacts ``andor`` -> `` and or `` and
         ``amp`` (left over from ``&amp;``) -> `` and ``.

    Args:
        df: DataFrame with a ``description`` column. Values are passed through
            ``str()``, so missing values become the literal text ``"nan"``.

    Returns:
        The same DataFrame object; ``description`` is overwritten in place.
    """
    # The standard-library `re` handles every pattern used here, so the
    # third-party `regex` package is not required.
    import re

    def _clean(text):
        text = contractions.fix(str(text)).lower()
        text = re.sub(r'[^-\w\d\s\n\.\!\?\;\,\']+', '', text)
        text = re.sub(r'\n+', '.', text)
        # These two used to go through Series.str.replace without regex=True.
        # pandas >= 2.0 then treats the pattern as a literal string, so the
        # word-boundary patterns silently never matched.
        text = re.sub(r'\bandor\b', ' and or ', text)
        text = re.sub(r'\bamp\b', ' and ', text)
        return text

    df['description'] = df['description'].apply(_clean)
    return df

In [None]:
def data_preparation_postprocessing(df):
    """Tidy the extracted phrases in the ``text`` column.

    Steps, in order:
      1. two rounds of trimming whitespace, ``.``, ``,`` and ``-`` from both ends,
      2. two rounds of: drop one leading ``a`` / ``an`` / ``this`` / ``the``
         (checked in that order), drop a trailing `` a`` / `` an``, remove
         characters outside ``string.printable``, replace ``.`` with a space.

    Args:
        df: DataFrame with a string ``text`` column.

    Returns:
        A cleaned copy of ``df``. The input is left untouched, so passing a
        slice of another frame no longer raises ``SettingWithCopyWarning``.
    """
    import string

    printable = set(string.printable)

    def _trim_edges(series):
        # Same strip chain as before, expressed once.
        return (series.str.strip().str.strip(".").str.strip()
                      .str.strip(",").str.strip().str.strip("-").str.strip())

    def _clean(text):
        # Leading articles are removed one after another, so "this the x"
        # becomes "x" within a single pass.
        for article in ("a ", "an ", "this ", "the "):
            if text.startswith(article):
                text = text[len(article):]
        if text.endswith(" a"):
            text = text[:-2]
        if text.endswith(" an"):
            text = text[:-3]
        text = ''.join(ch for ch in text if ch in printable)
        return text.replace(".", ' ')

    df = df.copy()
    text = _trim_edges(_trim_edges(df['text']))
    # The original ran the clean-up block twice; keep both passes.
    df['text'] = text.apply(_clean).apply(_clean)
    return df

### Extraction of Identified POS Patterns

In [None]:

# Words/phrases that describe the *amount or kind* of experience rather than a
# skill. Any extracted phrase containing one of them is discarded downstream.
experience_qualifiers = [
    'previous', 'prior', 'following', 'recent', 'the above', 'past',
    'proven', 'demonstrable', 'demonstrated', 'relevant', 'significant', 'practical',
    'essential', 'equivalent', 'desirable', 'required', 'considerable', 'similar',
    'working', 'specific', 'qualified', 'direct', 'hands on', 'handson',
    'strong', 'solid', 'good', 'substantial', 'excellent', 'the right', 'valuable', 'invaluable',
    'some', 'any', 'none', 'much', 'extensive', 'no', 'more',
    'your', 'their',
    'years', 'months',
    'uk',
]

# Phrases that are dropped when the extracted text equals one of them exactly.
stopwords = ['a', 'an', '*', '**', '•', 'this', 'the', ':', 'Skills']

# Whole-word alternation over every qualifier, for use with Series.str.contains.
experience_qualifier_pattern = r'\b(?:' + "|".join(experience_qualifiers) + r')\b'

# Trigger words around which the extractors look for skill phrases.
EXP_TERMS = [
    'experience', 'experienced', 'expertise', 'expert',
    'familiar', 'familiarity',
    'ability', 'able',
    'required', 'is required',
    'knowledge', 'understanding',
]

# Noun chunks that END with a keyword, e.g. "project management experience".
def extract_noun_phrase(doc, label_list=None):
    """Yield ``(LABEL, start, end)`` for multi-token noun chunks ending in a label.

    ``start`` is the index of the chunk's first token and ``end`` the index of
    its last token (the keyword itself), so a half-open span built from the
    pair covers the words to the left of the keyword.

    Args:
        doc: parsed document exposing ``noun_chunks``.
        label_list: keywords to look for; defaults to ``['experience']``.
    """
    if label_list is None:
        label_list = ['experience']
    for label in label_list:
        wanted = label.lower()
        for chunk in doc.noun_chunks:
            # Single-token chunks carry nothing besides the keyword itself.
            if chunk[-1].lower_ != wanted or len(chunk) <= 1:
                continue
            yield label.upper(), chunk[0].i, chunk[-1].i
# Keyword -> preposition to its right (e.g. in/with) -> object of that preposition;
# the whole left subtree of the object is extracted.
def extract_adp(doc, label_list=None):
    """Yield ``(LABEL, start, end)`` for prepositional objects attached to a label.

    For each token equal to a label (case-insensitive), every right-hand child
    with dependency ``prep`` is followed to its ``pobj`` children. The span runs
    from the object's left edge up to and including the object.

    Args:
        doc: iterable of parsed tokens.
        label_list: keywords to look for; defaults to ``['experience']``.
    """
    if label_list is None:
        label_list = ['experience']
    for label in label_list:
        wanted = label.lower()
        tag = label.upper()
        for tok in doc:
            if tok.lower_ != wanted:
                continue
            for prep in tok.rights:
                if prep.dep_ != 'prep':
                    continue
                for obj in prep.children:
                    if obj.dep_ == 'pobj':
                        yield tag, obj.left_edge.i, obj.i + 1
# "<keyword> in/with/using <noun chunk>": the noun chunk directly after a
# keyword + adposition pair.
def extract_adp_2(doc, label_list=None):
    """Yield ``(LABEL, start, end)`` for noun chunks preceded by ``<label> <ADP>``.

    A chunk qualifies when the token two positions before it equals the label
    (case-insensitive) and the token just before it is tagged ``ADP``.

    Args:
        doc: parsed document supporting indexing and ``noun_chunks``.
        label_list: keywords to look for; defaults to ``['experience']``.
    """
    labels = ['experience'] if label_list is None else label_list
    for label in labels:
        for chunk in doc.noun_chunks:
            first = chunk[0].i
            # Need two tokens of left context: the keyword and the adposition.
            if first < 2:
                continue
            if doc[first - 2].lower_ != label.lower():
                continue
            if doc[first - 1].pos_ != 'ADP':
                continue
            yield label.upper(), first, first + len(chunk)
def get_conjugations(tok):
    """Yield ``tok`` followed by every token reachable through ``conj`` children.

    Traversal uses a stack, so the most recently discovered conjunct is
    yielded first.
    """
    pending = [tok]
    while pending:
        current = pending.pop()
        yield current
        pending.extend(child for child in current.children if child.dep_ == 'conj')
def get_left_span(tok, label='', include=True):
    """Grow a span leftwards from ``tok`` over content-word neighbours.

    Starting at ``tok``, the start index moves left while the preceding token
    is tagged NOUN, PROPN, ADJ, X or VERB, never going past ``tok.left_edge``.

    Args:
        tok: anchor token (needs ``i``, ``left_edge`` and ``doc``).
        label: value returned unchanged as the first tuple element.
        include: when True the span end is ``tok.i + 1`` (anchor included),
            otherwise ``tok.i``.

    Returns:
        ``(label, start, end)``.
    """
    expandable = ('NOUN', 'PROPN', 'ADJ', 'X', 'VERB')
    start = tok.i
    while start > tok.left_edge.i and tok.doc[start - 1].pos_ in expandable:
        start -= 1
    end = tok.i + 1 if include else tok.i
    return label, start, end
# Like extract_adp, but also expands conjunctions ("experience with python and sql").
def extract_adp_conj(doc, label_list=None):
    """Yield ``(LABEL, start, end)`` for prepositional objects and their conjuncts.

    Every token whose lower-cased text is in ``EXP_TERMS`` is followed through
    its right-hand ``prep`` children to their ``pobj`` children. Each object and
    each of its conjuncts (``get_conjugations``) is widened with
    ``get_left_span``.

    Note that matching uses ``EXP_TERMS``, not ``label_list``: the labels only
    name the yielded spans, so the same spans are emitted once per label, in
    label order.

    Args:
        doc: iterable of parsed tokens.
        label_list: labels to emit; defaults to ``['experience']``. The default
            is created per call instead of sharing one mutable list.
    """
    if label_list is None:
        label_list = ['experience']

    def _objects():
        for tok in doc:
            if tok.lower_ not in EXP_TERMS:
                continue
            for child in tok.rights:
                if child.dep_ != 'prep':
                    continue
                for obj in child.children:
                    if obj.dep_ == 'pobj':
                        yield obj

    # The spans do not depend on the label, so the parse is walked once (on
    # first use) instead of once per label.
    spans = None
    for label in label_list:
        if spans is None:
            spans = [get_left_span(conj)[1:]
                     for obj in _objects()
                     for conj in get_conjugations(obj)]
        for start, end in spans:
            yield label.upper(), start, end
# Keyword -> verb -> (adposition ->) noun, e.g. "experience dealing with business clients".
def extract_verb_maybeadj_noun(doc, label_list=None):
    """Yield ``(LABEL, start, end)`` for objects of a clause attached to a keyword.

    Every token whose lower-cased text is in ``EXP_TERMS`` is followed through
    its right-hand ``acl`` children. Below each such clause two shapes are used:
    ``prep`` -> ``pobj`` and a direct ``dobj``. Each object and its conjuncts
    (``get_conjugations``) are widened with ``get_left_span``.

    Matching uses ``EXP_TERMS``; ``label_list`` only names the yielded spans,
    so the same spans are emitted once per label, in label order.

    Args:
        doc: iterable of parsed tokens.
        label_list: labels to emit; defaults to ``['experience']``.
    """
    if label_list is None:
        label_list = ['experience']

    def _object_heads():
        for tok in doc:
            if tok.lower_ not in EXP_TERMS:
                continue
            for clause in tok.rights:
                if clause.dep_ != 'acl':
                    continue
                for gc in clause.children:
                    if gc.dep_ == 'prep':
                        for ggc in gc.children:
                            if ggc.dep_ == 'pobj':
                                yield ggc
                    elif gc.dep_ == 'dobj':
                        yield gc

    # The spans are label-independent: walk the parse once (on first use)
    # rather than once per label.
    spans = None
    for label in label_list:
        if spans is None:
            spans = [get_left_span(conj)[1:]
                     for head in _object_heads()
                     for conj in get_conjugations(head)]
        for start, end in spans:
            yield label.upper(), start, end




In [None]:
def extract_df(df, *extractors, n_max=None, **kwargs):
    """Run the extractors over ``df.description`` and join the hits to the posts.

    Args:
        df: DataFrame with a ``description`` column.
        *extractors: callables ``extractor(doc, **kwargs)`` yielding
            ``(label, start, end)``.
        n_max: number of leading rows to process; all rows when None.
        **kwargs: forwarded to every extractor.

    Returns:
        One row per extracted span with columns ``text, docidx, start, end,
        label, sent_start, sent_end`` followed by the columns of ``df`` for the
        originating post.
    """
    if n_max is None:
        n_max = len(df)
    descriptions = df['description'].iloc[:n_max]
    ent_df = pd.DataFrame(list(get_extractions(descriptions, *extractors, **kwargs)),
                          columns=['text', 'docidx', 'start', 'end', 'label', 'sent_start', 'sent_end'])
    # docidx is the POSITION of the post in `descriptions`, not its index
    # label, so join against a positionally indexed view. With the default
    # RangeIndex this is identical to the previous behaviour; with any other
    # index it no longer attaches the wrong post (or NaN) to a span.
    positional = df.reset_index(drop=True)
    return ent_df.merge(positional, how='left', left_on='docidx', right_index=True)
def get_extractions(examples, *extractors, **kwargs):
    """Yield one tuple per non-overlapping span found in ``examples``.

    Each text is parsed with the module-level ``nlp`` pipeline (NER disabled),
    every extractor is run on the parsed doc, and overlapping candidates are
    resolved with ``filter_spans``.

    Yields:
        ``(text, docidx, start, end, label, sent_start, sent_end)`` where
        ``docidx`` is the position of the text within ``examples``.
    """
    parsed = nlp.pipe(examples, batch_size=100, disable=['ner'])
    for idx, doc in enumerate(parsed):
        candidates = []
        for extractor in extractors:
            for label, start, end in extractor(doc, **kwargs):
                candidates.append(Span(doc, start, end, label))
        for ent in filter_spans(candidates):
            sentence = ent.root.sent
            yield (ent.text, idx, ent.start, ent.end, ent.label_,
                   sentence.start, sentence.end)


### Loading the model building functions

Tokenization

In [None]:
class PaddingInputExample:
    """Placeholder example used to pad a dataset to a multiple of the batch size.

    Eval/predict on a TPU needs a fixed batch size, so the number of examples
    has to be a multiple of it. Dropping the last batch instead would lose part
    of the output. A dedicated class is used rather than ``None`` because
    treating ``None`` as a padding batch could cause silent errors.
    """

In [None]:
class InputExample:
    """One training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example.

        Args:
            guid: unique id of the example.
            text_a: untokenized text of the first sequence; the only text
                needed for single-sequence tasks.
            text_b: (optional) untokenized text of the second sequence, for
                sequence-pair tasks only.
            label: (optional) label of the example; expected for train/dev
                examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

def create_tokenizer_from_hub_module():
    """Build a BERT ``FullTokenizer`` from the vocab and casing info of the Hub module.

    Loads the uncased BERT-base layer from TF Hub (frozen) and reads its
    vocabulary file path and lower-casing flag from the resolved object.
    """
    tokenizer_cls = bert.bert_tokenization.FullTokenizer
    hub_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=False,
    )
    assets = hub_layer.resolved_object
    vocab_path = assets.vocab_file.asset_path.numpy()
    lower_case = assets.do_lower_case.numpy()
    return tokenizer_cls(vocab_path, lower_case)

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Turn one example into fixed-length BERT inputs.

    Args:
        tokenizer: object with ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.
        example: an ``InputExample`` or a ``PaddingInputExample``.
        max_seq_length: length of every returned list, including ``[CLS]``/``[SEP]``.

    Returns:
        ``(input_ids, input_mask, segment_ids, label)``. A padding example
        produces all-zero lists and label 0.
    """
    if isinstance(example, PaddingInputExample):
        return [0] * max_seq_length, [0] * max_seq_length, [0] * max_seq_length, 0

    # Two slots are reserved for the [CLS] and [SEP] markers.
    word_pieces = tokenizer.tokenize(example.text_a)[: max_seq_length - 2]
    tokens = ["[CLS]", *word_pieces, "[SEP]"]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Mask: 1 for real tokens, 0 for padding; only real tokens are attended to.
    padding = [0] * (max_seq_length - len(input_ids))
    input_mask = [1] * len(input_ids) + padding
    # Single-sequence task: every position belongs to segment 0.
    segment_ids = [0] * len(tokens) + padding
    input_ids.extend(padding)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a list of examples into stacked BERT input arrays.

    Args:
        tokenizer: tokenizer passed through to ``convert_single_example``.
        examples: iterable of ``InputExample`` / ``PaddingInputExample``.
        max_seq_length: sequence length of every row.

    Returns:
        ``(input_ids, input_masks, segment_ids, labels)`` as numpy arrays;
        ``labels`` is reshaped to a single column.
    """
    # `tqdm_notebook` was used here without ever being imported, which raises
    # NameError on a fresh kernel. Import the progress bar locally; it is
    # purely cosmetic, so fall back to plain iteration when tqdm is missing.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in progress(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels).reshape(-1, 1),
    )

def convert_text_to_examples(texts, labels):
    """Create one ``InputExample`` per ``(text, label)`` pair.

    Args:
        texts: iterable whose items are either a string or a sequence/array of
            string parts (e.g. the rows of an ``(n, 1)`` array).
        labels: iterable of labels, paired positionally with ``texts``.

    Returns:
        List of ``InputExample`` with ``guid=None`` and ``text_b=None``.
    """
    examples = []
    for text, label in zip(texts, labels):
        # Previously `" ".join(str(text))` joined the individual CHARACTERS of
        # the item's string form ("scrum" -> "s c r u m", plus the brackets and
        # quotes of an array repr), so BERT received single-character tokens.
        # np.ravel makes a bare string, a list of words and a 1-element array
        # all look like a flat sequence of parts to join.
        text_a = " ".join(str(part) for part in np.ravel(text))
        examples.append(
            InputExample(guid=None, text_a=text_a, text_b=None, label=label)
        )
    return examples

Embedding

In [None]:
class BertModel(object):
    """Holds the TF Hub BERT layer, its tokenizer and the input-building helpers."""

    def __init__(self):
        """Load the uncased BERT-base layer from TF Hub and build its tokenizer."""
        # Fixed sequence length (including [CLS] and [SEP]) of every model input.
        self.max_len = 18
        bert_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
        FullTokenizer = bert.bert_tokenization.FullTokenizer

        self.bert_module = hub.KerasLayer(bert_path, trainable=True)

        self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()

        self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()

        self.tokenizer = FullTokenizer(self.vocab_file, self.do_lower_case)

    def get_masks(self, tokens, max_seq_length):
        """Attention mask: 1 per token, then 0-padding up to ``max_seq_length``."""
        return [1] * len(tokens) + [0] * (max_seq_length - len(tokens))

    def get_segments(self, tokens, max_seq_length):
        """Segments: 0 for the first sequence, 1 for the second"""
        segments = []
        current_segment_id = 0
        for token in tokens:
            segments.append(current_segment_id)
            # Everything after the first [SEP] belongs to the second segment.
            if token == "[SEP]":
                current_segment_id = 1
        return segments + [0] * (max_seq_length - len(tokens))

    def get_ids(self, tokens, tokenizer, max_seq_length):
        """Token ids from Tokenizer vocab"""
        token_ids = tokenizer.convert_tokens_to_ids(tokens,)
        input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
        return input_ids

    def create_single_input(self, sentence, maxlen):
        """Tokenize one sentence into ``(ids, masks, segments)``.

        The word pieces are truncated to ``maxlen``, wrapped in
        ``[CLS]``/``[SEP]`` and padded to ``self.max_len``.
        """
        stokens = self.tokenizer.tokenize(sentence)

        stokens = stokens[:maxlen]

        stokens = ["[CLS]"] + stokens + ["[SEP]"]

        ids = self.get_ids(stokens, self.tokenizer, self.max_len)
        masks = self.get_masks(stokens, self.max_len)
        segments = self.get_segments(stokens, self.max_len)

        return ids, masks, segments

    def create_input_array(self, sentences):
        """Build the three int32 input arrays for a batch of sentences.

        Returns:
            ``[input_ids, input_masks, input_segments]``, one row per sentence.
        """
        # `tqdm_notebook` was referenced without being imported anywhere, which
        # fails with NameError on a fresh kernel. The progress bar is cosmetic,
        # so degrade to plain iteration if tqdm is unavailable.
        try:
            from tqdm.auto import tqdm as progress
        except ImportError:
            def progress(iterable, **_kwargs):
                return iterable

        input_ids, input_masks, input_segments = [], [], []

        for sentence in progress(sentences, position=0, leave=True):
            # Reserve two positions for [CLS] and [SEP].
            ids, masks, segments = self.create_single_input(sentence, self.max_len - 2)

            input_ids.append(ids)
            input_masks.append(masks)
            input_segments.append(segments)

        tensor = [np.asarray(input_ids, dtype=np.int32),
                  np.asarray(input_masks, dtype=np.int32),
                  np.asarray(input_segments, dtype=np.int32)]
        return tensor

In [None]:
class PreprocessingBertData():
    """Thin helpers that turn raw sentences/labels into model-ready arrays."""

    def prepare_data_x(self, train_sentences):
        """Return the BERT input arrays built by the global ``bert_model_obj``."""
        return bert_model_obj.create_input_array(train_sentences)

    def prepare_data_y(self, train_labels):
        """Return the labels as a numpy array, in their original order."""
        return np.array([label for label in train_labels])

Model Building

In [None]:
class NetworkModel():
    """Keras classifier on top of the BERT layer held by the global ``bert_model_obj``."""

    def __init__(self):
        self.model = None
        # Training / evaluation data are attached by the caller before
        # model_train() / model_test(). Initialised here so a missing
        # assignment gives a clear error instead of an AttributeError.
        self.train_data = None      # expected: [input_ids, input_masks, segment_ids]
        self.train_labels = None
        self.test_data = None       # expected: [input_ids, input_masks, segment_ids]
        self.test_labels = None

    def bert_model(self, max_seq_length):
        """Build and compile the network; the result is stored in ``self.model``.

        Args:
            max_seq_length: length of the three integer input sequences.
        """
        in_id = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_ids")
        in_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_masks")
        in_segment = Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

        bert_inputs = [in_id, in_mask, in_segment]

        pooled_output, sequence_output = bert_model_obj.bert_module(bert_inputs)

        # Average the per-token embeddings, then a small dense stack.
        x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
        x = tf.keras.layers.Dense(128, activation='relu')(x)
        x = tf.keras.layers.Dense(64, activation='relu')(x)
        x = tf.keras.layers.Dense(32, activation='relu')(x)
        x = tf.keras.layers.Dense(24, activation='tanh')(x)
        x = tf.keras.layers.Dense(12, activation='relu')(x)
        # NOTE(review): softplus is unbounded above, so outputs can exceed 1
        # even though they are trained with binary cross-entropy. Kept as-is
        # because the saved weights were trained with this head.
        pred = tf.keras.layers.Dense(1, activation='softplus')(x)
        # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)
        self.model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
        self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        self.model.summary()

    def model_train(self, batch_size, num_epoch):
        """Fit ``self.model`` on ``self.train_data`` / ``self.train_labels``.

        Raises:
            ValueError: if the training data or labels have not been set.
        """
        if self.train_data is None or self.train_labels is None:
            raise ValueError("Set train_data and train_labels before calling model_train().")

        print("Fitting to Model")

        self.model.fit(self.train_data, self.train_labels, epochs=num_epoch,
                       batch_size=batch_size, validation_split=0.2, shuffle=True)

        print("Model Training Complete.")

    def model_test(self):
        """Evaluate ``self.model`` on ``self.test_data`` / ``self.test_labels``.

        Raises:
            ValueError: if the test data or labels have not been set.
        """
        if self.test_data is None or self.test_labels is None:
            raise ValueError("Set test_data and test_labels before calling model_test().")

        print("Model Testing on Unseen Data")

        self.model.evaluate(self.test_data, self.test_labels)

        print("Model Testing Complete.")

    def save_model(self, model_folder, model_name):
        """Save the model as ``<model_folder>/<model_name>.h5``.

        ``model_folder`` used to be ignored, so the file was always written to
        the current working directory.
        """
        import os

        target = os.path.join(model_folder or "", model_name + ".h5")
        self.model.save(target, overwrite=True, include_optimizer=True)
        print("Model saved to Model folder.")

    def get_model(self):
        """Return the underlying Keras model (None until ``bert_model`` is called)."""
        return self.model

    def load_weights(self, path):
        """Load weights from ``path`` into the already built model."""
        self.model.load_weights(path)
        print("Model weights loaded.")

### Loading the Trained Model

In [None]:
# Build and Load Model
# The network is first rebuilt in code, then the trained weights are loaded
# into it from the .h5 file.


model = None

# Global used by NetworkModel.bert_model and PreprocessingBertData; constructing
# it loads the BERT layer from TF Hub.
bert_model_obj = BertModel()

# `model` is the NetworkModel wrapper; the Keras model is model.get_model().
model = NetworkModel()
model.bert_model(bert_model_obj.max_len)

# Load existing model
model.load_weights(input_path+input_model_folder+input_model)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 18)]         0           []                               
                                                                                                  
 input_masks (InputLayer)       [(None, 18)]         0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 18)]         0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_ids[0][0]',              
                                 (None, 18, 768)]                 'input_masks[0][0]',        

  super(Adam, self).__init__(name, **kwargs)


Model weights loaded.


### Putting Everything Together

In [None]:
# Read job posts
df = pd.read_csv(input_path+input_file_folder+input_file)

# Pre-processing
source_df = data_preparation_preprocessing(df)

# Extract tokens
extract_exps = [extract_noun_phrase, extract_adp, extract_adp_2, extract_adp_conj, extract_verb_maybeadj_noun]
df_ents = extract_df(source_df, *extract_exps, n_max=source_df.shape[0], label_list=EXP_TERMS)
df_ents_valid = df_ents[(~df_ents.text.str.lower().str.contains(experience_qualifier_pattern)) & # Not a qualifier
                     ~df_ents.text.isin(stopwords)]

# Explicit copy: the frame is modified below, and writing to a slice of
# df_ents_valid is what triggered the SettingWithCopyWarning messages.
input_df = df_ents_valid[["text", "label", "job_title"]].copy()

# Post-processing
input_df = data_preparation_postprocessing(input_df)

# Group text - just in case of any duplication in entries
consolidated_df = input_df[["text", "job_title"]].drop_duplicates().copy()

# Only the extracted phrase is model input. Converting the whole frame also
# fed the job title (and the array brackets) to the tokenizer.
input_text = consolidated_df["text"].to_numpy()[:, np.newaxis]

# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format (labels are dummies; this is inference only)
input_examples = convert_text_to_examples(input_text, np.zeros((len(input_text),1)))

# Convert to features; the length must match the one the model was built with
(input_ids, input_masks, segment_ids, labels
) = convert_examples_to_features(tokenizer,
                                 input_examples,
                                 max_seq_length = bert_model_obj.max_len)

# Extract job skills
save_preds = model.get_model().predict([input_ids, input_masks, segment_ids] )

# One column per phrase, single row of scores (so `predictions.T` lists phrase -> score).
# Building this through dict(zip(...)) collapsed phrases that occur under
# several job titles into one entry, keeping only the last score.
predictions = pd.DataFrame(save_preds.reshape(1, -1), columns=consolidated_df["text"].to_numpy())



  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Converting examples to features:   0%|          | 0/4025 [00:00<?, ?it/s]

Unnamed: 0,0
activities maintain complete,1.057839
program management teams,1.141757
people management,0.983358
work,0.308582
complex technical information,0.856499
...,...
electrical utility safety rules,1.152985
ontario occupational health,1.075840
safe performance,0.858716
safe operation,0.844824


In [None]:
# Show every row instead of the truncated default view.
# Note: this changes the display option for the rest of the session.
pd.options.display.max_rows = None

# `predictions` has one column per phrase; transpose to list phrase -> score.
predictions.T


Unnamed: 0,0
activities maintain complete,1.057839
program management teams,1.141757
people management,0.983358
work,0.308582
complex technical information,0.856499
concise manner,0.745324
variety,0.514437
deep,0.364997
software development lifecycle,1.362817
scrum,1.290231


In [None]:
# Convert to desired format for saving
# Attach the raw model output to each (text, job_title) row.
consolidated_df['prediction_prob'] = save_preds
# A phrase counts as a skill when its score reaches 0.5.
# NOTE(review): the captured output shows scores above 1, so
# 'prediction_prob' is not a true probability - confirm the threshold is intended.
consolidated_df['predicted_skill'] = (consolidated_df['prediction_prob'] >= 0.5).astype('int8')

consolidated_df

Unnamed: 0,text,job_title,predicted_skill,predicted_prob,prediction_prob
0,activities maintain complete,Assistant Human Resources Manager - (17638),1,1.057839,1.057839
1,program management teams,"Software Development Manager, RDS Custom",1,1.141757,1.141757
2,people management,"Software Development Manager, RDS Custom",1,0.983358,0.983358
3,work,"Software Development Manager, RDS Custom",0,0.454558,0.454558
6,complex technical information,"Software Development Manager, RDS Custom",1,0.856499,0.856499
7,concise manner,"Software Development Manager, RDS Custom",1,0.745324,0.745324
8,variety,"Software Development Manager, RDS Custom",0,0.430031,0.430031
9,deep,"Software Development Manager, RDS Custom",1,0.831816,0.831816
10,software development lifecycle,"Software Development Manager, RDS Custom",1,1.362817,1.362817
11,scrum,"Software Development Manager, RDS Custom",1,1.373351,1.373351


### Other References

In [None]:
def data_process(chunks):
    '''Join the first element of every tuple in each chunk into one phrase.

    Args:
        chunks: sequence of rows, each a sequence of tuples whose first
            element is a string token (rows may have different lengths).
            # presumably (word, tag) pairs from a chunker - confirm with caller

    Returns:
        pandas Series named ``phrase`` with one space-separated phrase per row.
    '''
    df = pd.DataFrame(chunks)

    def _is_missing(cell):
        # Shorter rows are padded by pandas with None (or NaN).
        return cell is None or (isinstance(cell, float) and cell != cell)

    # Padding cells are skipped directly. The previous approach filled them
    # with the placeholder 'X' and then removed every 'X' from the phrase,
    # which also ate the letter from real tokens ("XML" -> "ML", "UX" -> "U").
    phrases = [
        ' '.join(cell[0] for cell in row if not _is_missing(cell)).strip()
        for row in df.values
    ]
    df['phrase'] = phrases
    # Returns the phrase for every row (no sampling is applied).
    return df.phrase