### 1. Preparation

 - Get connection to database
 - Extract approved skill from Mongo

In [90]:
import sys
import importlib
sys.path.insert(0, ".")

import sql_functions as sf
from sql_functions import *
importlib.reload(sf)

<module 'sql_functions' from 'C:\\Users\\KeikoGolden\\ComputationalLiguistics\\Annotaion\\bn_nb_s\\PythonTocheckModel\\.\\sql_functions.py'>

In [91]:
import yaml
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from psycopg2.extras import execute_values

with open("config.yml") as f:
    config = yaml.safe_load(f)

In [92]:
conn = psycopg2.connect(
        database=config['db']['name'], 
        user=config['db']['user'], 
        host=config['db']['host'], 
        password=config['db']['password']
    )

cur = conn.cursor()

In [93]:
import pandas as pd
import requests
from typing import List


class GetSkills:

    endpoint = "https://prodazure-ml-api.skyhive.io/data-preparation" \
                    "/entity/skill"
    skills = requests.get(endpoint).json()


    def filter_skills(self, lang="en") -> List[str]:
        """Load approved skills from a list of dictionaries

        Args:
            skills: a list of individual skill dictionaries

        Returns:
            selected_skills: a list of skill strings
        """
        selected_skills: List[str] = []
        #selected_defs = []
        for skill_dict in self.skills:
            approved = skill_dict.get("isApproved", False)
            if approved:
                val = skill_dict.get("title", None)
                if val is not None:
                    skill: str = val.get(lang, None)
                    #defin = skill_dict.get("definition", None).get("en")
                    #if skill is not None and defin is not None:
                    if skill is not None:
                        selected_skills.append(skill)
                        #selected_defs.append(defin)
        #return selected_skills, selected_defs
        return selected_skills

    #Get APPROVED skills and synonyms
    def get_skills_and_synonyms(self):
        selected_skills: List[str] = []
        selected_defs = []
        synonym_coll = []
        ids = []
        for skill_dict in self.skills:
            approved = skill_dict.get("isApproved", False)
            if approved:
                val = skill_dict.get("title", None)
                if val is not None:
                    skill: str = val.get('en', None)
                    defin = skill_dict.get("definition", None).get("en")
                    if skill is not None and defin is not None:
                        selected_skills.append(skill)
                        selected_defs.append(defin)
                        skill_id = skill_dict['id']
                        ids.append(skill_id)
                        synonyms = skill_dict['synonyms']
                        synonym_coll.append(synonyms)
        return pd.DataFrame({
            'id': ids,
            'skill': selected_skills,
            'synonyms': synonym_coll,
            'definition': selected_defs
        })

In [94]:
# Get APPROVED skills from Mongo.
approved_df = GetSkills().get_skills_and_synonyms()
approved_df

Unnamed: 0,id,skill,synonyms,definition
0,5ec5dd60bd86b3287e0f992d,.NET Assemblies,,Defined by Microsoft for use in recent version...
1,5b1b16e8a0ad036875697504,.NET Framework,".net; .NET, .Net",.NET Framework is a software framework develop...
2,5ec5ddb0bd86b3287e0f9938,.NET Reflector,,".NET Reflector is a class browser, decompiler ..."
3,5ec5ddb0ec223f743f884251,.NET Remoting,,.NET Remoting is a Microsoft application progr...
4,5ca66a3e06c206377e3c20eb,2 Way Radio,,A two-way radio is a radio that can both trans...
...,...,...,...,...
12445,5ec5e6c14d0455767ecf299b,Zone File,,A Domain Name System zone file is a text file ...
12446,5ebc13084d679d1f836681af,Zoning,Rezoning; Land Zoning,Zoning is a method of urban planning in which ...
12447,5a9cd81dfc0da84a0c1c28ac,Zoology,,Zoology is the branch of biology that studies ...
12448,5ec5e6c1bd86b3287e0ff00b,Zotero,,Zotero is a free and open-source reference man...


### 2. Use approved skill, create list of syntax patterns

 - Create syntax patterns using spacy.
 - Extract unique patterns to create a list of syntax patterns.

In [95]:
import spacy
import pandas as pd
nlp = spacy.load('en_core_web_lg')

In [96]:
def create_syntax_pattern(skill):
    doc = nlp(skill)
    pattern = []
    for token in doc:
        pattern.append(token.pos_ )
    return pattern

In [97]:
returned_pattern = []
for skill_title in approved_df.skill.tolist():
    results = create_syntax_pattern(skill_title)
    returned_pattern.append('-'.join(results))      

In [98]:
pattern_df = pd.DataFrame({'pattern':returned_pattern})
pattern_df

Unnamed: 0,pattern
0,PUNCT-NOUN
1,X-VERB
2,PUNCT-NOUN
3,VERB-NOUN
4,NUM-NOUN-NOUN
...,...
12445,PROPN-PROPN
12446,NOUN
12447,NOUN
12448,PROPN


In [99]:
unique_patterns = pattern_df['pattern'].unique()
unique_patterns

array(['PUNCT-NOUN', 'X-VERB', 'VERB-NOUN', 'NUM-NOUN-NOUN', 'NUM-NOUN',
       'PROPN-PROPN', 'PROPN-PROPN-PROPN-PROPN', 'PROPN-PROPN-PROPN',
       'PROPN-NOUN', 'NUM-PROPN-PROPN', 'PROPN',
       'PROPN-PROPN-PUNCT-NOUN-PUNCT', 'ADJ-NOUN', 'NOUN-PROPN',
       'ADJ-NOUN-PUNCT-NOUN-NOUN', 'ADJ-PROPN-NOUN', 'X-PUNCT',
       'NUM-PROPN-PROPN-PROPN', 'DET-PROPN-PROPN-PROPN-PUNCT-PROPN-PUNCT',
       'NOUN-SYM-NOUN-NOUN', 'PROPN-VERB', 'ADV', 'PROPN-ADP-NOUN',
       'NOUN', 'ADJ-PROPN', 'PROPN-PROPN-PROPN-NUM',
       'PROPN-SYM-PROPN-PROPN', 'NOUN-NOUN', 'PROPN-CCONJ-PROPN',
       'NOUN-PUNCT-VERB-NOUN', 'NOUN-ADJ', 'NOUN-PROPN-PROPN',
       'PROPN-PROPN-ADP-NOUN', 'PROPN-PROPN-PROPN-PROPN-NOUN',
       'PROPN-ADJ-NOUN', 'PROPN-PUNCT-PROPN', 'VERB',
       'PROPN-PROPN-PROPN-PROPN-PROPN',
       'ADJ-PROPN-PROPN-PUNCT-PROPN-PUNCT', 'PROPN-PROPN-PROPN-NOUN',
       'VERB-ADJ-NOUN', 'VERB-PROPN-PROPN-PROPN-PROPN-CCONJ-PROPN-NOUN',
       'VERB-PROPN', 'PROPN-PROPN-NOUN-NOUN', 'PROPN-P

# 3. Find unmatched syntax pattern skills in unapproved_client.parguet

- Using skills from unapproved_client.parquet, create syntax pattern.
- Match the syntax pattern against the one created at 2.
- If no match, store the skill in the non_sensical_skill list.

In [100]:
client_skill_df = pd.read_parquet("C:/Users/KeikoGolden/ComputationalLiguistics/KE218/unapproved_client.parquet")
client_skill_df

Unnamed: 0,skill,serial_id,_t,type,tool,client
11,Strategy,5a99f8eafc0da84a0c17cac1,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
12,Physical Health Assessment,5a9bb429fc0da84a0c19462e,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
13,Recruitment,5a99c346fc0da84a0c16c281,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
14,Physically Fit Req,5a99e8e5fc0da84a0c16ecea,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
15,Wiring,5a9c39e5fc0da84a0c1a7fa7,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
...,...,...,...,...,...,...
52935,Project Work Plan Development,62d98bb61e31a01dd064382e,SkillToolDictionaryEntity,2,0.0,[]
52938,Pharmacy Sme,62d98c321e31a01dd0643b6d,SkillToolDictionaryEntity,2,0.0,[]
52943,Rapid7,62d9b1531e31a01dd064c130,SkillToolDictionaryEntity,2,0.0,[]
57525,Ab&c Regulations,6304f924dd3b4e9df8f3140c,SkillToolDictionaryEntity,2,0.0,[]


In [101]:
non_sensical_skills = []

for skill in client_skill_df.skill.tolist():
    results = create_syntax_pattern(skill) # Call the function create_syntax_pattern.
    returned_pattern2 = '-'.join(results)  # Using returned value, create syntax patterns
    if returned_pattern2 not in unique_patterns: # check syntax patterns agasint the list of unique patterns created at 2.
        non_sensical_skills.append(skill) # when the unmatched syntax pattern is detected, store the skill in the non_sensical_skills list

In [102]:
non_sensical_skills

['Serving Food and Beverage',
 'Receiving Food and Beverage Orders',
 'Administering Tests and Exams',
 'Multi-tasking',
 'Model-view-controller',
 '.NET Framework 2',
 'Administering Local Anesthetics',
 'Advertising and Promotions Co-ordinator',
 'Angular 6',
 "Commercial Driver's License",
 'HR-Software',
 'Mac and PC platforms',
 'Unreal 3',
 'Angular 7',
 'Persian (Farsi) Language',
 'It Support Technician',
 'Vp & Director of Corporate Development',
 'Executive Director & Co-owner',
 'Math and Science Teacher in High',
 'Sales Consultant - Part-time',
 'Telephone Interviewer/Market Research',
 'Teaching Assistant Dept. of Physics and Astronomy',
 "Therapeutic Services Manager - Shepherd's Care Foundation",
 'Public Health Inspector/Health Promotion Officer',
 'Product Quality Lead - Global Product and Marketing',
 'Inside Sales Rep & Web Content Administrator',
 'Lead Contact When Booking and Negotiating Large Food and Beverage Functions',
 'Executive Assistant to the Senior Vice

In [14]:
len(non_sensical_skills)

4892

# 4 Find gramatically incorrect titled skills

- Check the skill titles in unapproved_client.parquet agasint the grammar checker.

In [27]:
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')

In [18]:
def find_incorrect_grammar(skill_title):
    
    gramatically_incorrect_skills = []
    matches = tool.check(skill_title)
    
    for match in matches:
        gramatically_incorrect_skills.append(match)
    
    return gramatically_incorrect_skills

In [19]:
# point check to see if the function above is working.
results = find_incorrect_grammar('I are an English teacher.')
results

[Match({'ruleId': 'PERS_PRONOUN_AGREEMENT', 'message': 'Did you mean “am” or “ate”?', 'replacements': ['am', 'ate'], 'offsetInContext': 2, 'context': 'I are an English teacher.', 'offset': 2, 'errorLength': 3, 'category': 'GRAMMAR', 'ruleIssueType': 'grammar', 'sentence': 'I are an English teacher.'})]

In [24]:
gramatically_incorrect = []
skills = []
for skill in client_skill_df.skill.tolist():
    match = find_incorrect_grammar(skill)
    if match:
        gramatically_incorrect.append(match)
        skills.append(skill)

In [25]:
skills

['Platemaking',
 'iStudio Publisher',
 'Lucidpress',
 'Xara Page & Layout Designer',
 'Fatpaint',
 'Scenari (software)',
 'Revit',
 'Pageplus',
 'Pagestream',
 'Xara Designer Pro X',
 'LibreCAD',
 'Nanodevice Simulation',
 'Mould-making',
 'Electrology',
 'Dethatching',
 'Laser Shearography',
 'Callisthenics Instruction',
 'Biomicroscope Operation',
 'Watercolour Painting',
 'Ophthalmoscope Operation',
 'Rolfing',
 'Dogsledding',
 'Multi-tasking',
 'Cpr Level C',
 'Aeroelasticity',
 'Ui Design',
 'Scooptram Operation',
 'Fortran Programming Language',
 'NASA OpenVSP',
 'Cics',
 'Orthopaedic Surgery',
 'Sql (Ansi)',
 'Coldfusion 8',
 'Ekg Technician',
 'RStudio Computer Program',
 'Crypto CurrencyAuto Trading Bot',
 'Testlink System Software',
 'Creative Problem Solving',
 'Axure Rp Software',
 'Operationalisation',
 'Advertising and Promotions Co-ordinator',
 'Complex Problem Solving',
 'User Datagram Protocol',
 'ClearCode',
 'Google Adwords',
 'Project Management Professional (Pmp)',

In [26]:
gramatically_incorrect

[[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['Plate making'], 'offsetInContext': 0, 'context': 'Platemaking', 'offset': 0, 'errorLength': 11, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'Platemaking'})],
 [Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['studio'], 'offsetInContext': 0, 'context': 'iStudio Publisher', 'offset': 0, 'errorLength': 7, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'iStudio Publisher'})],
 [Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['Lucid press'], 'offsetInContext': 0, 'context': 'Lucidpress', 'offset': 0, 'errorLength': 10, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'Lucidpress'})],
 [Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['Sara', 'Kara', 'Lara', 'M

# 5. Print out non-sensical titles and gramatically-incorrect titles

In [29]:
non_sensical_df = pd.DataFrame({'non_sensical_skill': non_sensical_skills})
non_sensical_df

Unnamed: 0,non_sensical_skill
0,Serving Food and Beverage
1,Receiving Food and Beverage Orders
2,Administering Tests and Exams
3,Multi-tasking
4,Model-view-controller
...,...
4887,Project and Portfolio Management (Ppm)
4888,"Hcfa, UB, Provider, Adjustment, Cod"
4889,"Ocr, Num, Crit, Repair"
4890,"Mail Room, Paper Claims"


In [30]:
incorrect_grammar_df = pd.DataFrame({'gramatically_incorrect_title':skills, 'reason':gramatically_incorrect})
incorrect_grammar_df

Unnamed: 0,gramatically_incorrect_title,reason
0,Platemaking,"[Offset 0, length 11, Rule ID: MORFOLOGIK_RULE..."
1,iStudio Publisher,"[Offset 0, length 7, Rule ID: MORFOLOGIK_RULE_..."
2,Lucidpress,"[Offset 0, length 10, Rule ID: MORFOLOGIK_RULE..."
3,Xara Page & Layout Designer,"[Offset 0, length 4, Rule ID: MORFOLOGIK_RULE_..."
4,Fatpaint,"[Offset 0, length 8, Rule ID: MORFOLOGIK_RULE_..."
...,...,...
6935,"Ocr, Num, Crit, Repair","[Offset 0, length 3, Rule ID: MORFOLOGIK_RULE_..."
6936,Optmizly,"[Offset 0, length 8, Rule ID: MORFOLOGIK_RULE_..."
6937,Restassured,"[Offset 0, length 11, Rule ID: MORFOLOGIK_RULE..."
6938,Pharmacy Sme,"[Offset 9, length 3, Rule ID: MORFOLOGIK_RULE_..."


In [31]:
import os
writer = pd.ExcelWriter('KE218_non_sensical_skills.xlsx', engine='xlsxwriter')
non_sensical_df.to_excel(writer, sheet_name="non_sensical", index=False)
incorrect_grammar_df.to_excel(writer, sheet_name="gramatically_incorrect", index=False)

In [32]:
writer.save()
writer.close()

  warn("Calling close() on already closed file.")


# 6. Extract laws

### Performance level
#####  1. Regex
#####  2. ML model
#####  3. Syntax pattern

In [103]:
command = """
select skills.skill,  
entity_types.entity_type as classification,
entity_types.id as classification_id
from entities
join entity_types on entities.entity_id = entity_types.id
join skills on skills.id = entities.skill_id
where entity_types.entity_type = 'law'
"""

entity_df = pd.read_sql(command, conn)
entity_df 



Unnamed: 0,skill,classification,classification_id
0,Federal Tort Claims Act,law,2
1,European Medical Device Regulation,law,2
2,FDA GMP,law,2
3,National Environmental Policy Act (NEPA),law,2
4,Resource Conservation and Recovery Act,law,2
...,...,...,...
66,DAWIA Level 3,law,2
67,Bank Secrecy Act,law,2
68,Code of Federal Regulations,law,2
69,Occupational Safety and Health Act,law,2


##### use syhtax pattern method - turned out not grate (same as ML results)

In [123]:
import spacy
import pandas as pd
nlp = spacy.load('en_core_web_lg')

In [124]:
def create_syntax_pattern(skill):
    doc = nlp(skill)
    pattern = []
    for token in doc:
        pattern.append(token.pos_ )
    return pattern

In [125]:
returned_pattern_law = []
for skill_title in approved_df.skill.tolist():
    results = create_syntax_pattern(skill_title)
    returned_pattern_law.append('-'.join(results)) 

In [126]:
law_pattern_df = pd.DataFrame({'pattern':returned_pattern_law})
law_pattern_df

Unnamed: 0,pattern
0,PUNCT-NOUN
1,X-VERB
2,PUNCT-NOUN
3,VERB-NOUN
4,NUM-NOUN-NOUN
...,...
12445,PROPN-PROPN
12446,NOUN
12447,NOUN
12448,PROPN


In [127]:
unique_law_patterns = law_pattern_df['pattern'].unique()
unique_law_patterns

array(['PUNCT-NOUN', 'X-VERB', 'VERB-NOUN', 'NUM-NOUN-NOUN', 'NUM-NOUN',
       'PROPN-PROPN', 'PROPN-PROPN-PROPN-PROPN', 'PROPN-PROPN-PROPN',
       'PROPN-NOUN', 'NUM-PROPN-PROPN', 'PROPN',
       'PROPN-PROPN-PUNCT-NOUN-PUNCT', 'ADJ-NOUN', 'NOUN-PROPN',
       'ADJ-NOUN-PUNCT-NOUN-NOUN', 'ADJ-PROPN-NOUN', 'X-PUNCT',
       'NUM-PROPN-PROPN-PROPN', 'DET-PROPN-PROPN-PROPN-PUNCT-PROPN-PUNCT',
       'NOUN-SYM-NOUN-NOUN', 'PROPN-VERB', 'ADV', 'PROPN-ADP-NOUN',
       'NOUN', 'ADJ-PROPN', 'PROPN-PROPN-PROPN-NUM',
       'PROPN-SYM-PROPN-PROPN', 'NOUN-NOUN', 'PROPN-CCONJ-PROPN',
       'NOUN-PUNCT-VERB-NOUN', 'NOUN-ADJ', 'NOUN-PROPN-PROPN',
       'PROPN-PROPN-ADP-NOUN', 'PROPN-PROPN-PROPN-PROPN-NOUN',
       'PROPN-ADJ-NOUN', 'PROPN-PUNCT-PROPN', 'VERB',
       'PROPN-PROPN-PROPN-PROPN-PROPN',
       'ADJ-PROPN-PROPN-PUNCT-PROPN-PUNCT', 'PROPN-PROPN-PROPN-NOUN',
       'VERB-ADJ-NOUN', 'VERB-PROPN-PROPN-PROPN-PROPN-CCONJ-PROPN-NOUN',
       'VERB-PROPN', 'PROPN-PROPN-NOUN-NOUN', 'PROPN-P

In [128]:
client_skill_df = pd.read_parquet("C:/Users/KeikoGolden/ComputationalLiguistics/KE218/unapproved_client.parquet")
client_skill_df

Unnamed: 0,skill,serial_id,_t,type,tool,client
11,Strategy,5a99f8eafc0da84a0c17cac1,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
12,Physical Health Assessment,5a9bb429fc0da84a0c19462e,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
13,Recruitment,5a99c346fc0da84a0c16c281,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
14,Physically Fit Req,5a99e8e5fc0da84a0c16ecea,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
15,Wiring,5a9c39e5fc0da84a0c1a7fa7,SkillToolDictionaryEntity,2,0.0,"[{'ClientComplexity': {'ComplexityName': 'L3',..."
...,...,...,...,...,...,...
52935,Project Work Plan Development,62d98bb61e31a01dd064382e,SkillToolDictionaryEntity,2,0.0,[]
52938,Pharmacy Sme,62d98c321e31a01dd0643b6d,SkillToolDictionaryEntity,2,0.0,[]
52943,Rapid7,62d9b1531e31a01dd064c130,SkillToolDictionaryEntity,2,0.0,[]
57525,Ab&c Regulations,6304f924dd3b4e9df8f3140c,SkillToolDictionaryEntity,2,0.0,[]


In [129]:
laws = []

for skill in client_skill_df.skill.tolist():
    results = create_syntax_pattern(skill) # Call the function create_syntax_pattern.
    returned_pattern2 = '-'.join(results)  # Using returned value, create syntax patterns
    if returned_pattern2 in unique_law_patterns: # check syntax patterns agasint the list of unique patterns created at 2.
        laws.append(skill) # when the unmatched syntax pattern is detected, store the skill in the non_sensical_skills list

In [113]:
laws

['Strategy',
 'Physical Health Assessment',
 'Recruitment',
 'Physically Fit Req',
 'Wiring',
 'Smokestack Repair',
 'Construction Estimating',
 'Document Preparation',
 'Driving',
 'Continuous Improvement',
 'Inventory Reporting',
 'Social Media',
 'Design',
 'Ambulatory Monitoring',
 'Health and Safety',
 'MS Office',
 'Financial Report Preparation',
 'Costume',
 'Problem-solving',
 'Market Studies',
 'Platemaking',
 'Customer Assistance',
 'Collections',
 'Tour Operation',
 'Food and Beverage Cleaning',
 'Light Duty Painting',
 'Chemical Process Technology',
 'Computer Software Development',
 'Software Design',
 'System Architecture Development',
 'Articular Manipulation',
 'Security Monitoring Software Implementation',
 'Elevator Construction',
 'Security Checks',
 'Landscaping (Commercial)',
 'Landscaping Equipment (Commercial)',
 'Software Development (Web Based)',
 'Railway Equipment Maintenance',
 'Baton Twirling',
 'Electrical Repair',
 'Drywall Application',
 'Paperhanging',


In [115]:
df = pd.DataFrame({'laws': laws})
df.to_excel('extracted_laws_by_pos_pattern.xlsx')

##### Use Regex - this tunred out to be better.

In [87]:
import regex as re

#regex = r'''(?i)(\sact$|[^f]\slaw|regulation|rights|directive|policy|statute|registration|amendment|agreement|standard)$'''
regex = r'''(?i)(\sact$|law|regulation|rights|directive|policy|statute|registration|amendment|agreement|standard)$'''

regex_matched_laws = []

for skill in client_skill_df.skill.tolist():
    match = re.findall(regex, skill)
    if match:
        regex_matched_laws.append(skill)


In [88]:
regex_matched_laws

['Compliance and Regulation',
 'Bachelors of Law',
 'Bachelor of Commerce and Bachelor of Law',
 'Immigration Law',
 'Saudi Labor Law',
 'Economic Policy',
 'Employment Law',
 'Comparative Law',
 'Group Policy',
 'International Business Law',
 'Public Policy',
 'Insurance Policy',
 'Social Services Policy',
 'Pre-law',
 'Philosophy, Public Policy',
 'Certified Paralegal Specializing in Family Law',
 'Corporate, Contract and Intellectual Property Law',
 'Prl Law',
 'Laboral Law',
 'Java Standard',
 'Medicaid Policy',
 'Medical Policy',
 'Medicare Dsh Claims and Regulation',
 'State Regulation',
 'Healthcare Law',
 'Azure Policy',
 'Stark Law',
 'Regulation',
 'Kmp313-dedicated Sensors & Act',
 'Anti-corruption Policy',
 'Rights',
 'Directive',
 'Vat Directive',
 'Safety Policy',
 'Media Policy',
 'Policy',
 'Cybersecurity Policy',
 'Exlhoist Standard',
 'Deregulation',
 'Registration',
 'Discount Policy',
 'Commercial Policy',
 'Standard',
 'Social Media Policy',
 'Lfu Agreement',
 'Len

# 7. Extract credentials

### Performance level
#####  1. ML Model - Extracted counts are more than regex however, 10% of it cinludes non credential skills
#####  2. Syntax pettern
#####  3. Regex 

In [117]:
returned_pattern_certificate = []
for skill_title in approved_df.skill.tolist():
    results = create_syntax_pattern(skill_title)
    returned_pattern_certificate.append('-'.join(results)) 

In [118]:
certificate_pattern_df = pd.DataFrame({'pattern':returned_pattern_certificate})
certificate_pattern_df

Unnamed: 0,pattern
0,PUNCT-NOUN
1,X-VERB
2,PUNCT-NOUN
3,VERB-NOUN
4,NUM-NOUN-NOUN
...,...
12445,PROPN-PROPN
12446,NOUN
12447,NOUN
12448,PROPN


In [119]:
unique_certificate_patterns = certificate_pattern_df['pattern'].unique()
unique_certificate_patterns

array(['PUNCT-NOUN', 'X-VERB', 'VERB-NOUN', 'NUM-NOUN-NOUN', 'NUM-NOUN',
       'PROPN-PROPN', 'PROPN-PROPN-PROPN-PROPN', 'PROPN-PROPN-PROPN',
       'PROPN-NOUN', 'NUM-PROPN-PROPN', 'PROPN',
       'PROPN-PROPN-PUNCT-NOUN-PUNCT', 'ADJ-NOUN', 'NOUN-PROPN',
       'ADJ-NOUN-PUNCT-NOUN-NOUN', 'ADJ-PROPN-NOUN', 'X-PUNCT',
       'NUM-PROPN-PROPN-PROPN', 'DET-PROPN-PROPN-PROPN-PUNCT-PROPN-PUNCT',
       'NOUN-SYM-NOUN-NOUN', 'PROPN-VERB', 'ADV', 'PROPN-ADP-NOUN',
       'NOUN', 'ADJ-PROPN', 'PROPN-PROPN-PROPN-NUM',
       'PROPN-SYM-PROPN-PROPN', 'NOUN-NOUN', 'PROPN-CCONJ-PROPN',
       'NOUN-PUNCT-VERB-NOUN', 'NOUN-ADJ', 'NOUN-PROPN-PROPN',
       'PROPN-PROPN-ADP-NOUN', 'PROPN-PROPN-PROPN-PROPN-NOUN',
       'PROPN-ADJ-NOUN', 'PROPN-PUNCT-PROPN', 'VERB',
       'PROPN-PROPN-PROPN-PROPN-PROPN',
       'ADJ-PROPN-PROPN-PUNCT-PROPN-PUNCT', 'PROPN-PROPN-PROPN-NOUN',
       'VERB-ADJ-NOUN', 'VERB-PROPN-PROPN-PROPN-PROPN-CCONJ-PROPN-NOUN',
       'VERB-PROPN', 'PROPN-PROPN-NOUN-NOUN', 'PROPN-P

In [120]:
certificates = []

for skill in client_skill_df.skill.tolist():
    results = create_syntax_pattern(skill) # Call the function create_syntax_pattern.
    returned_pattern2 = '-'.join(results)  # Using returned value, create syntax patterns
    if returned_pattern2 in unique_certificate_patterns: # check syntax patterns agasint the list of unique patterns created at 2.
        certificates.append(skill)

In [121]:
certificates

['Strategy',
 'Physical Health Assessment',
 'Recruitment',
 'Physically Fit Req',
 'Wiring',
 'Smokestack Repair',
 'Construction Estimating',
 'Document Preparation',
 'Driving',
 'Continuous Improvement',
 'Inventory Reporting',
 'Social Media',
 'Design',
 'Ambulatory Monitoring',
 'Health and Safety',
 'MS Office',
 'Financial Report Preparation',
 'Costume',
 'Problem-solving',
 'Market Studies',
 'Platemaking',
 'Customer Assistance',
 'Collections',
 'Tour Operation',
 'Food and Beverage Cleaning',
 'Light Duty Painting',
 'Chemical Process Technology',
 'Computer Software Development',
 'Software Design',
 'System Architecture Development',
 'Articular Manipulation',
 'Security Monitoring Software Implementation',
 'Elevator Construction',
 'Security Checks',
 'Landscaping (Commercial)',
 'Landscaping Equipment (Commercial)',
 'Software Development (Web Based)',
 'Railway Equipment Maintenance',
 'Baton Twirling',
 'Electrical Repair',
 'Drywall Application',
 'Paperhanging',


In [122]:
df = pd.DataFrame({'certificate': certificates})
df.to_excel('extracted_certificate_by_pos_pattern.xlsx')