# ‘Inclusive Job Ads project’ - Tech/Non-tech tagging
Author: Shravankumar Hiregoudar

## Import the libraries

In [1]:
import pandas as pd
import itertools
import re
from nltk.corpus import stopwords
%matplotlib inline

## Import the technical keywords list and the job Excel file

## Creating the technical word list

In [2]:
def getAllKeywords(techTagOp):
    """
        :getAllKeywords: Get all the keywords from technical/non-technical job descriptions.

        :param techTagOp: Dataframe holding only tech or only non-tech jobs; its
                          'Text Description' column is read (missing values are
                          treated as empty text).
        :type techTagOp: pandas.core.frame.DataFrame

        :returns: Set of unique keywords with special characters stripped, with
                  English stopwords and purely numeric tokens removed, and with
                  only one spelling kept per case-insensitive word.
    """
    # Build the stop-word lookup once; calling stopwords.words() per token is
    # needlessly slow, and a set gives O(1) membership tests.
    stop_words = {word.casefold() for word in stopwords.words('english')}
    special_chars = re.compile('[^a-zA-Z0-9]+')

    # De-duplicate the descriptions, then tokenize each one on whitespace.
    descriptions = set(techTagOp['Text Description'].fillna(""))
    raw_tokens = set(itertools.chain.from_iterable(text.split() for text in descriptions))

    # Iterate in sorted order so the surviving spelling of a word that appears
    # in several casings does not depend on set iteration order.
    keyword_by_fold = {}
    for token in sorted(raw_tokens):
        word = special_chars.sub('', token)
        # Skip tokens that were only special characters, and pure numbers.
        if not word or word.isdigit():
            continue
        folded = word.casefold()
        # Compare case-insensitively so capitalised stopwords ("The") are dropped too.
        if folded in stop_words:
            continue
        keyword_by_fold.setdefault(folded, word)

    return set(keyword_by_fold.values())

## Compare and flag

In [3]:
# Convert a long string into a list of strings
def stringToList(string):
    """
        :stringToList: Split a text value into tokens with special characters removed.

        :param string: Text to tokenize; may be None/NaN. Non-string values are
                       converted with str() before splitting.

        :returns: List of tokens split on any whitespace (spaces, tabs, newlines),
                  each stripped of non-alphanumeric characters. An empty list is
                  returned for a missing value.
    """
    # Always return a list so callers get a consistent type for missing values.
    if pd.isnull(string):
        return []
    # split() without arguments breaks on every kind of whitespace; splitting on
    # " " only would glue words separated by a newline or tab into one token.
    return [re.sub('[^a-zA-Z0-9]+', '', token) for token in str(string).split()]

In [4]:
# After the loop, mappedTokens is a list of sets with one entry per job ad, i.e. len(list) == number of jobs.
# Example: [ set(), {'ARM','Agile','Agilebased'}, ... ]
# means the first job ad contains no tech words and the second job ad contains the tech words 'ARM', 'Agile' and 'Agilebased'.

def mapTokens(job_ads_attribute, tech_words_list):
    """
        :mapTokens: For every job ad, find which technical words occur in the given attribute.

        :param job_ads_attribute: Values of one job-ad column, e.g. 'Text Description' or 'Job Order: External Job Title'
        :type job_ads_attribute: pandas.core.series.Series

        :param tech_words_list: The technical words to look for
        :type tech_words_list: list

        :returns: List with one set per job ad, holding the words of 'tech_words_list'
                  that appear among the tokens of that ad's attribute value
    """
    # Build the lookup set once instead of once per job ad.
    tech_words = set(tech_words_list)
    # Iterate over the values themselves: indexing the Series with a positional
    # counter is a label lookup and breaks when the index is not 0..n-1.
    return [set(stringToList(value)) & tech_words for value in job_ads_attribute]

In [5]:
def tech_tag(inputFile, jobDescriptionCol, mappedTokensText, mappedTokensTitle):
    """
        :tech_tag: Tag each job ad as 'Tech' or 'Non-Tech' based on its description or, failing that, its title

        :param inputFile: Path of the CSV file holding the job ads
        :type inputFile: str

        :param jobDescriptionCol: Name of the job description column in the CSV file
        :type jobDescriptionCol: str

        :param mappedTokensText: Per job ad, the technical words found in the job description
        :type mappedTokensText: list

        :param mappedTokensTitle: Per job ad, the technical words found in the job title
        :type mappedTokensTitle: list

        :returns: job_ads dataframe with additional column 'Tech_Flag'
    """
    # The dummy job descriptions/ templates have this sentence in common
    dummyJD = 'Avoid using a laundry list of technologies and/or skills'

    job_ads = pd.read_csv(inputFile)

    flags = []
    for i, description in enumerate(job_ads[jobDescriptionCol]):
        # A description is usable when it is present and is not the dummy template.
        # str() guards against non-text cells (e.g. numbers) in the column.
        has_real_description = not pd.isnull(description) and dummyJD not in str(description)
        # Fall back to the words matched in the job title when the description is unusable.
        tokens = mappedTokensText[i] if has_real_description else mappedTokensTitle[i]
        flags.append('Tech' if len(tokens) else 'Non-Tech')

    # Assign in one step so the column exists even when the file has no rows.
    job_ads['Tech_Flag'] = flags

    return job_ads

# Consolidate everything

In [6]:
def create_tech_flags(input_file, job_title_col_name, job_description_col,
                      tech_words_file='technical_flag_reference_tech_words.xlsx'):
    """
        :create_tech_flags: Run the whole pipeline: load the technical words, match them
                            against the job ads and tag every ad as 'Tech' or 'Non-Tech'

        :param input_file: Path of the CSV file holding the job ads
        :type input_file: str

        :param job_title_col_name: Name of the job title column in the CSV file
        :type job_title_col_name: str

        :param job_description_col: Name of the job description column in the CSV file
        :type job_description_col: str

        :param tech_words_file: Path of the Excel file whose 'Words' column lists the technical words
        :type tech_words_file: str

        :returns: The job ads dataframe returned by tech_tag, i.e. with the 'Tech_Flag' column added
    """
    # Rows with missing values are dropped before the 'Words' column is read.
    tech_words = pd.read_excel(tech_words_file).dropna()
    tech_words_list = tech_words['Words'].tolist()

    input_data = pd.read_csv(input_file)

    # What words in 'tech_words_list' are mapped with the job description
    mappedTokenText = mapTokens(input_data[job_description_col], tech_words_list)

    # What words in 'tech_words_list' are mapped with the job title
    mappedTokenTitle = mapTokens(input_data[job_title_col_name], tech_words_list)

    return tech_tag(input_file, job_description_col, mappedTokenText, mappedTokenTitle)
    

In [7]:
# Tag the sample job ads, naming the CSV columns that hold the title and the description.
input_file_dir = 'sample_job_descriptions.csv'
job_title_col_name, job_description_col = 'Job Title', 'Job Description'
result = create_tech_flags(
    input_file=input_file_dir,
    job_title_col_name=job_title_col_name,
    job_description_col=job_description_col,
)

In [8]:
# Number of job ads for each 'Tech_Flag' value
result['Tech_Flag'].value_counts()

Tech        173
Non-Tech     26
Name: Tech_Flag, dtype: int64

In [9]:
# Number of job ads for each ('Tech_Flag', 'Job Title') combination
result.groupby(['Tech_Flag', 'Job Title'])['Job Title'].count()

Tech_Flag  Job Title                                 
Non-Tech   2018 - NGI Leader                             1
           2018 - Sales Executive                        1
           2018 Life Sciences Engagement Manager         1
           2018 Project Manager - Delivery Leadership    1
           2019 Analyst - August                         1
                                                        ..
Tech       Bilingual IT Support Analyst                  1
           Billing Analyst                               1
           Billing Supervisor                            1
           Boeing - BA (Mike)                            1
           Boeing - BI Dev (Marci Farrell)               1
Name: Job Title, Length: 199, dtype: int64