# To create one-hot encodings of technologies in job descriptions

In [1]:
import psycopg2
from sqlalchemy import create_engine
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import os
from datetime import date

In [2]:
os.getcwd()

'C:\\Users\\lundr\\DataScienceJobs\\Cleaning'

# Access data from SQL Database
* Create engine
* Query data
* Create a list of job description texts from pandas df read in
* Clean data

In [25]:
# create engine to read from SQL
PASSWORD = pd.read_pickle('C:/Users/lundr/DataScienceJobs/data/SQL_password.pkl').iloc[0,0]
engine = create_engine('postgresql://postgres:'+PASSWORD+'@dsj-1.c9mo6xd9bf9d.us-west-2.rds.amazonaws.com:5432/')


target = pd.read_sql(''' SELECT *FROM all_data WHERE salary_average > 0 AND salary_type ='yearly' ''', engine)


In [26]:
job_title_selections = pd.DataFrame([ 1 if ('data scientist' in str(target['job_title'][x]).lower()) else (0) for x in range(len(target['job_title']))])
job_title_selections.columns = ['selection']
target = target.join(job_title_selections)

In [27]:
target_filtered = target[target['selection'] == 1]
target_filtered

Unnamed: 0,level_0,index,job_title,ref_code,company,description,salary,salary_low,salary_high,currency,...,jobtype,posted_date,extraction_date,country,region,url,train_test_label,id,language,selection
1,1,,Data Scientist,02/10/19_1570002973,"Reading-£65,000 - Churchill Frank",Job DescriptionRole & Responsibilities develop...,,,,,...,permanent,2019-10-24,,UK,South East,https://job-openings.monster.co.uk/data-scient...,train,2,en,1
6,6,,Senior Data Scientist,000_106,WHITEHAT ANALYTICS LIMITED,ABOUT WHITEHAT ANALYTICSWhitehat Analytics is ...,,,,,...,permanent,2019-10-23,,UK,London,https://job-openings.monster.co.uk/senior-data...,test,8,en,1
8,8,,Data Scientist,bbbh17654_1571757159,Allen Recruitment Consulting,Data Scientist12-month contract based in Londo...,,,,,...,others,2019-10-24,,UK,London,https://job-openings.monster.co.uk/data-scient...,train,11,en,1
10,10,,Data Scientist/Specialist,,BBVA Compass,['Company: Compass Bank dba BBVA Compass\nLoca...,,,,,...,,2019-11-01,,USA,Texas,https://www.indeed.com/rc/clk?jk=6e09d268f3900...,train,13,en,1
15,15,26.0,DATA SCIENTIST (m/w),58966/MM,Harnham,hamburg - ...,€65000 - €85000 per annum + BENEFITS,65000.0,85000.0,€,...,Permanent,,2019-11-14,Germany,Hamburg,https://www.harnham.com/job/data-scientist-m-w...,train,18,de,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20517,20517,,Data Scientist,50204581_1573139056,Lorien Resourcing Limited,Data ScientistAs one of Vodafone key strategic...,,,,,...,others,2019-11-14,2019-11-19,UK,London,https://job-openings.monster.co.uk/data-scient...,train,20359,en,1
20518,20518,,Senior Data Scientist,bbbh20628_1572265654,Talent International,"Data ScientistSalary: up to £69,300Location: L...","£45,000.00 - £69,300.00 per year",45000.0,69300.0,£,...,permanent,2019-11-18,2019-11-19,UK,London,https://job-openings.monster.co.uk/senior-data...,test,20367,en,1
20519,20519,,Senior Data Scientist,j10788,DataTech Search and Selection,Senior Data Scientist - Salary negotiable - Ty...,,,,,...,permanent,2019-11-19,2019-11-19,UK,North East,https://job-openings.monster.co.uk/senior-data...,train,20372,en,1
20520,20520,,Data Scientist,2657-4282,A1 People Limited,Position: Data ScientistLocation: LondonSalar...,"£25,000.00 - £30,000.00 per year",25000.0,30000.0,£,...,permanent,2019-11-13,2019-11-19,UK,London,https://job-openings.monster.co.uk/data-scient...,train,20374,en,1


In [28]:
text = pd.DataFrame(target_filtered['description'])

In [29]:
text

Unnamed: 0,description
1,Job DescriptionRole & Responsibilities develop...
6,ABOUT WHITEHAT ANALYTICSWhitehat Analytics is ...
8,Data Scientist12-month contract based in Londo...
10,['Company: Compass Bank dba BBVA Compass\nLoca...
15,hamburg - ...
...,...
20517,Data ScientistAs one of Vodafone key strategic...
20518,"Data ScientistSalary: up to £69,300Location: L..."
20519,Senior Data Scientist - Salary negotiable - Ty...
20520,Position: Data ScientistLocation: LondonSalar...


In [30]:
texts = [x.lower() for x in text['description']]

In [31]:
for i in range(len(texts)):
    a = texts[i].replace("["," ")
    a = a.replace("\n"," ")
    a = a.replace("]"," ")
    a = a.replace("."," ")
    a = a.replace(","," ")
    a = a.replace(":"," ")
    a = a.replace(";"," ")
    a = a.replace('"'," ")
    a = a.replace('('," ")
    a = a.replace(')'," ")
    a = a.replace('\\'," ")
    a = a.replace('/'," ")
    texts[i]=a

# Read in tech dictionary and reorganise for encoding

In [15]:
os.chdir('..')

In [32]:
tech_dict =  pd.read_pickle('Pickles/broad_tech_dictionary.pickle')
tech_dict.keys()

dict_keys(['front_end-technologies', 'databases', 'quality_assurance-qa', 'game_development', 'software-infrastructure-devops', 'web_design', 'product_management', 'development_methodologies', 'software_architecture', 'fundamental_programming_concepts', 'programming_paradigms', 'data-science', 'tools', 'roles_in_software_development', 'embedded', 'cloud_computing', 'cyber_security', 'general_terms', 'back_end-technologies', 'mobile'])

* select categories of tech glossary to use for encoding and create a list of their corresponding tech terms

In [33]:

area = ['front_end-technologies', 'databases', 'software-infrastructure-devops','data-science','software_architecture', 'web_design','tools','cyber_security','cloud_computing','back_end-technologies', 'mobile']

tech_list=[]

for i in area:
    for j in range(len(tech_dict[i])):
        tech_list.append(tech_dict[i][j])


* make all terms lowercase make sure only unique value in list

In [34]:
important_terms =list(set([x.lower() for x in tech_list]))


* Create a dictionary to group terms which actually refer to the same thing

In [35]:
d = { ' bi ':' business intelligence ', ' ai ':' artifical intelligence ', ' databases ':' database ',' db ':' database ',' aws ':' amazon web services '}

# Create the one hot encoding by technology
* search for terms within description, creating a new column in the df which lists matched terms
* explode these into one-hot encodings for technolgies


In [36]:

dj = pd.DataFrame({'T':texts})

def get_imp_terms(input_string):
    result = [ d.get(x,x) for x in important_terms if x in input_string]
    return list(set(result))

dj['iR']=dj['T'].map(get_imp_terms)
dj['iR'] = [x for x in dj['iR']]

mlb = MultiLabelBinarizer(classes = important_terms)
dk = pd.DataFrame(mlb.fit_transform(dj['iR']), columns=important_terms)
dl = dj.join(dk)

dl


Unnamed: 0,T,iR,sap,gui,jface,protobuf,accelerate framework,micro-cap,saltstack,spring integration,...,justinmind,gtest,mesos,ocaml,debian,ksql,apt,carthage,cfengine,ca plex
0,job descriptionrole & responsibilities develop...,"[ matplotlib , pandas , python , machine le...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,about whitehat analyticswhitehat analytics is ...,"[ linux , seaborn , flask , python , api ,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,data scientist12-month contract based in londo...,"[ scikit-learn , pandas , agile , python , ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,'company compass bank dba bbva compass nloca...,"[ r , database , c , , compass , access ]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,hamburg - ...,"[ deep learning , big data , im , apis , i...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2754,data scientistas one of vodafone key strategic...,"[ scipy , deep learning , dnn , big data , ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2755,data scientistsalary up to £69 300location l...,"[ business intelligence , agile , python , ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2756,senior data scientist - salary negotiable - ty...,"[ r , sas , sql , ]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2757,position data scientistlocation londonsalar...,"[ r , quick , python , tableau , , sql ]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


* join target variable

In [38]:
target_filtered = target_filtered.drop(columns ='description')

In [39]:
out = dl.join(target_filtered)

# pickle out the transformed data for use elsewhere

In [40]:
out.to_pickle('data/tech_encoded_data_data_scientist_all'+str(date.today())+".pkl")