In [92]:
import pandas as pd
import spacy
from tqdm import tqdm

In [93]:
df = pd.read_csv('all_gpt_labelled_data.csv')
df.head()

Unnamed: 0,job,skills,url
0,SummaryThe Database Developer is part of the C...,"['C#', '.Net', 'database performance', 'C#', '...",https://www.linkedin.com/jobs/view/database-de...
1,"Come Work with Us!At RBC, our culture is deepl...","['data science', 'RBC Elements', 'Statistical'...",https://ca.linkedin.com/jobs/view/data-scienti...
2,"Job IntroductionYou’ll be in an exciting, stim...","['identity', 'SQL', 'NoSQL', 'Javascript', 'Gi...",https://uk.linkedin.com/jobs/view/software-eng...
3,The Information Technology (IT) Department aim...,"['Cybersecurity', 'design', 'Cybersecurity', '...",https://ca.linkedin.com/jobs/view/director-of-...
4,Are you an Analytics professional experienced ...,"['Analytics', 'Insurance', 'Strategy', 'Analyt...",https://www.linkedin.com/jobs/view/manager-ai-...


## Getting filtered skills

In [94]:
files = ['Data/skill_ashwin_filtered.txt',
        'Data/skill_jaskirat_filtered.txt',
        'Data/skill_sai_filtered.txt',
        'Data/skill_jinhong_filtered.txt',
        'Data/skill_rakesh_filtered.txt']


skill_set = set()
for file in files:
    file = open(file)
    skill_set.update(file.read().split('\n'))
len(skill_set)

1912

In [95]:
skill_set

{'statistics',
 'ajax',
 'visual concepts',
 'virtualitics',
 'hci',
 'techinsights',
 'sas',
 'multivariate regression',
 'network security',
 'tactical',
 'security software',
 'c++',
 'pl',
 'erp',
 'analytical ability',
 'predictive analytics',
 'product management',
 'matrix-management',
 'linux',
 'github prs',
 'iphone',
 'web platform',
 'os installations',
 'mat',
 'troubleshooting',
 'reporting',
 'devops',
 'data quality',
 'photoshop',
 'remote access',
 'monitoring',
 'digital forensics',
 'lp analyst',
 'sql/ssis developer',
 'flux',
 'sip',
 'mac',
 'software engineering',
 'data manipulation',
 'ux/ui',
 'apache camel',
 'drupal',
 'mern technologies',
 'problem identification',
 'sfmc',
 'transportation',
 'database design',
 'leadership principles',
 'strategic growth',
 'biomedical nlp',
 'critical thinking',
 'power',
 'keras',
 'implementation',
 'backend engineer',
 'prototypes',
 'flex',
 'salary',
 'transit',
 'collaborative spirit',
 'database mysql',
 'power p

### filtering skills

In [96]:
def filter_skills(skills):
    skill_list = [i.strip()[1:-1] for i in skills[1:-1].split(',')]
    return ', '.join([i for i in set(skill_list) if i.lower() in skill_set])

filter_skills(df['skills'][0])

'teamwork, Project management, SQL, CEMCO, C#, database performance, close vision, T-SQL, distance vision, LLC, .Net, .NET'

In [97]:
df['filtered_skills'] = df['skills'].apply(filter_skills)
df[['skills','filtered_skills']].sample(10)

Unnamed: 0,skills,filtered_skills
920,"['JavaScript', 'React', 'HTML5', 'CSS3', 'Java...","HTML5, React, CSS3, JavaScript"
787,"['Mobile apps', 'cloud-based', 'data mining', ...","data mining, cloud-based, Typesc, statistical ..."
893,"['DBA', 'Oracle', 'PL', 'Ksh']","PL, Oracle, DBA, Ksh"
1039,"['Strategic advisory', 'leadership', 'tailored...","Jr. DBA, Sql, telecom technology, Strategic ad..."
1125,"['QA', 'REST APIs', 'microservices', 'C#', 'Go...","C#, REST APIs, Java, microservices, Go"
1117,"['national strength', 'RPA', 'Power Automate',...","NLP, Power Automate, Microsoft Power, RPA"
794,"['Healthcare', 'billing', 'administrative over...","data science, Docker, PostgreSQL, Python"
1163,"['Cloud Engineer', 'AWS', 'Azure', 'Cloud Serv...","Cloud Budget, Cloud Administration, Azure, bus..."
1106,"['AWS', 'Terraform', 'scripting', 'Jenkins', '...","JavaScript, Terraform, IT, SonarQube, Angular,..."
609,"['Apple', 'Apple', 'user experience']",


In [98]:
df.head()

Unnamed: 0,job,skills,url,filtered_skills
0,SummaryThe Database Developer is part of the C...,"['C#', '.Net', 'database performance', 'C#', '...",https://www.linkedin.com/jobs/view/database-de...,"teamwork, Project management, SQL, CEMCO, C#, ..."
1,"Come Work with Us!At RBC, our culture is deepl...","['data science', 'RBC Elements', 'Statistical'...",https://ca.linkedin.com/jobs/view/data-scienti...,"Data Science, Statistical, data science, Mathe..."
2,"Job IntroductionYou’ll be in an exciting, stim...","['identity', 'SQL', 'NoSQL', 'Javascript', 'Gi...",https://uk.linkedin.com/jobs/view/software-eng...,"Javascript, NoSQL, SQL, Git, Jenkins"
3,The Information Technology (IT) Department aim...,"['Cybersecurity', 'design', 'Cybersecurity', '...",https://ca.linkedin.com/jobs/view/director-of-...,"design, ISO27001, Cybersecurity"
4,Are you an Analytics professional experienced ...,"['Analytics', 'Insurance', 'Strategy', 'Analyt...",https://www.linkedin.com/jobs/view/manager-ai-...,"AI, machine learning, fleet, Consulting, Analy..."


In [99]:
df.to_csv('Data/GPT-3_skills_filtered_data.csv')

### Splitting job_descriptions into smaller segments to avoid token limit violation for our models

In [100]:
nlp = spacy.load('en_core_web_sm')

def sentence_segmentation(job_desc):
    """
    This function return a list of sentences for a job description
    """
    doc = nlp(job_desc)
    return [sent for sent in doc.sents]


def get_paras(sents, skills_list, limit = 128):
    '''
    Converts a list of sentences to a list of paragraphs (so that we could limit the number of API calls to some extent)
    '''
    num_token = 0
    paragraphs = []
    sub_skills = []
    para = ''
    for sent in sents:
        sent = sent.text
        num_token += len(sent.split())
        if num_token >= limit:
            paragraphs.append(para)
            sub_skills.append(', '.join([i for i in skills_list if i in para]))
            para = ''
            num_token = 0
        para += sent
    paragraphs.append(para)
    sub_skills.append(', '.join([i for i in skills_list if i in para]))
    return paragraphs, sub_skills

In [101]:
paras_list = []
skills_list = []
job_id = []
i = 0
for job, skills in tqdm(zip(df['job'], df['filtered_skills'])):
    sents = sentence_segmentation(job)
    skills_list_temp = [i.strip() for i in skills.split(',')]
    paras, skill = get_paras(sents, skills_list_temp, limit = 128)
    paras_list.extend(paras)
    skills_list.extend(skill)
    job_id.extend([i for j in range(len(skill))])
    i+=1

1842it [02:25, 12.70it/s]


In [102]:
df_split = pd.DataFrame({'job_id': job_id, 'job_segment':paras_list, 'skills':skills_list})
df_split

Unnamed: 0,job_id,job_segment,skills
0,0,SummaryThe Database Developer is part of the C...,"CEMCO, C#, database performance, .Net"
1,0,Perform database security administration.Ensur...,"C#, .Net"
2,0,"Education And/Or Experience, Certifications, R...","teamwork, Project management, SQL, T-SQL"
3,0,Physical DemandsWhile performing the duties of...,"close vision, distance vision"
4,0,"CEMCO, LLC, is the premier manufacturer of ste...","CEMCO, LLC"
...,...,...,...
6686,1840,Sr. Full Stack DeveloperWelcome to Beaker & Wr...,
6687,1840,We need a full-stack developer who is a self-s...,
6688,1840,The desired developer will head development an...,"HTML, Wordpress, Python"
6689,1841,"Become part of one of a large, high-performing...",CI/CD


In [103]:
df.to_csv('Data/GPT-3_data_split_into_128_word_paragraphs.csv')