# To create one-hot encodings of technologies in job descriptions

In [246]:
import psycopg2
from sqlalchemy import create_engine
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import os
from datetime import date

In [247]:
os.getcwd()

'C:\\Users\\lundr\\DataScienceJobs'

# Access data from SQL Database
* Create engine
* Query data
* Create a list of job description texts from pandas df read in
* Clean data

In [296]:
# create engine to read from SQL
PASSWORD = pd.read_pickle('C:/Users/lundr/DataScienceJobs/data/SQL_password.pkl').iloc[0,0]
engine = create_engine('postgresql://postgres:'+PASSWORD+'@dsj-1.c9mo6xd9bf9d.us-west-2.rds.amazonaws.com:5432/')


target = pd.read_sql(''' SELECT *FROM all_data WHERE salary_average > 0 AND salary_type ='yearly' ''', engine)


In [314]:
job_title_selections = pd.DataFrame([ 1 if ('data scientist' in str(target['job_title'][x]).lower()) else (0) for x in range(len(target['job_title']))])
job_title_selections.columns = ['selection']
target = target.join(job_title_selections)

In [316]:
target_filtered = target[target['selection'] == 1]
target_filtered

Unnamed: 0,level_0,index,job_title,ref_code,company,description,salary,salary_low,salary_high,currency,...,jobtype,posted_date,extraction_date,country,region,url,train_test_label,id,language,selection
3,15,26.0,DATA SCIENTIST (m/w),58966/MM,Harnham,hamburg - ...,€65000 - €85000 per annum + BENEFITS,65000.0,85000.0,€,...,Permanent,,2019-11-14,Germany,Hamburg,https://www.harnham.com/job/data-scientist-m-w...,train,18,de,1
7,23,0.0,Senior Data Scientist,Vacancy #53029,Harnham,up to london the company th...,£80000 - £110000 per annum,80000.0,110000.0,£,...,Permanent,,2019-11-14,UK,London,/job/senior-data-scientist-in-london-jid-20612,train,26,en,1
19,58,1237.0,"Bioinformatics Data Scientist $120,000 - $130,...",0000,Harnham,the company by combining advanced ...,US$120000 - US$140000 per year,120000.0,140000.0,$,...,Permanent,,2019-11-14,USA,New York,/job/bioinformatics-data-scientist-in-new-york...,train,64,en,1
20,61,1314.0,Senior Data Scientist,69544,Harnham,nyc - base salary bonus ...,US$160000 - US$190000 per annum + Bonus + Comp...,160000.0,190000.0,$,...,Permanent,,2019-11-14,USA,New York,/job/senior-data-scientist-luxury-lifestyle-in...,test,67,en,1
27,77,,Senior Data Scientist,bbbh20317_1571061556,Talent International,"Role: Senior Data ScientistSalary: £45,000 - £...","£45,000.00 - £75,000.00 per year",45000.0,75000.0,£,...,permanent,2019-11-11,,UK,London,https://job-openings.monster.co.uk/senior-data...,train,83,en,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6483,20408,,Data Scientist,,Piper Companies,['Piper Companies is currently looking for a D...,"$130,000 - $175,000 a year",130000.0,175000.0,$,...,,2019-11-19,2019-11-19,USA,Virginia,https://www.indeed.com/rc/clk?jk=31ec178b837ec...,train,19390,en,1
6514,20503,,Senior Data Scientist,,"Amsterdam - upto €100,000 - JWCG LTD",A Major international Retailer is looking for ...,"€70,000.00 - €100,000.00 per year",70000.0,100000.0,€,...,permanent,2019-11-18,2019-11-19,Netherlands,North Holland,https://job-openings.monster.co.uk/senior-data...,train,20233,en,1
6522,20515,,Data Scientist,cn/rd/ds,IF RECRUITMENT LTD,ROLE: DATA SCIENTISTLOCATIO...,"€45,000.00 - €65,000.00 per yearBonus & Shares",45000.0,65000.0,€,...,permanent,2019-11-19,2019-11-19,Netherlands,South Holland,https://job-openings.monster.co.uk/data-scient...,train,20355,en,1
6523,20518,,Senior Data Scientist,bbbh20628_1572265654,Talent International,"Data ScientistSalary: up to £69,300Location: L...","£45,000.00 - £69,300.00 per year",45000.0,69300.0,£,...,permanent,2019-11-18,2019-11-19,UK,London,https://job-openings.monster.co.uk/senior-data...,test,20367,en,1


In [320]:
text = pd.DataFrame(target_filtered['description'])

In [321]:
text

Unnamed: 0,description
3,hamburg - ...
7,up to london the company th...
19,the company by combining advanced ...
20,nyc - base salary bonus ...
27,"Role: Senior Data ScientistSalary: £45,000 - £..."
...,...
6483,['Piper Companies is currently looking for a D...
6514,A Major international Retailer is looking for ...
6522,ROLE: DATA SCIENTISTLOCATIO...
6523,"Data ScientistSalary: up to £69,300Location: L..."


In [322]:
texts = [x.lower() for x in text['description']]

In [323]:
for i in range(len(texts)):
    a = texts[i].replace("["," ")
    a = a.replace("\n"," ")
    a = a.replace("]"," ")
    a = a.replace("."," ")
    a = a.replace(","," ")
    a = a.replace(":"," ")
    a = a.replace(";"," ")
    a = a.replace('"'," ")
    a = a.replace('('," ")
    a = a.replace(')'," ")
    a = a.replace('\\'," ")
    a = a.replace('/'," ")
    texts[i]=a

# Read in tech dictionary and reorganise for encoding

In [324]:
tech_dict =  pd.read_pickle('Pickles/broad_tech_dictionary.pickle')
tech_dict.keys()

dict_keys(['front_end-technologies', 'databases', 'quality_assurance-qa', 'game_development', 'software-infrastructure-devops', 'web_design', 'product_management', 'development_methodologies', 'software_architecture', 'fundamental_programming_concepts', 'programming_paradigms', 'data-science', 'tools', 'roles_in_software_development', 'embedded', 'cloud_computing', 'cyber_security', 'general_terms', 'back_end-technologies', 'mobile'])

* select categories of tech glossary to use for encoding and create a list of their corresponding tech terms

In [325]:

area = ['front_end-technologies', 'databases', 'software-infrastructure-devops','data-science','software_architecture', 'web_design','tools','cyber_security','cloud_computing','back_end-technologies', 'mobile']

tech_list=[]

for i in area:
    for j in range(len(tech_dict[i])):
        tech_list.append(tech_dict[i][j])


* make all terms lowercase make sure only unique value in list

In [326]:
important_terms =list(set([x.lower() for x in tech_list]))


* Create a dictionary to group terms which actually refer to the same thing

In [327]:
d = { ' bi ':' business intelligence ', ' ai ':' artifical intelligence ', ' databases ':' database ',' db ':' database ',' aws ':' amazon web services '}

# Create the one hot encoding by technology
* search for terms within description, creating a new column in the df which lists matched terms
* explode these into one-hot encodings for technolgies


In [328]:

dj = pd.DataFrame({'T':texts})

def get_imp_terms(input_string):
    result = [ d.get(x,x) for x in important_terms if x in input_string]
    return list(set(result))

dj['iR']=dj['T'].map(get_imp_terms)
dj['iR'] = [unique_values(x) for x in dj['iR']]

mlb = MultiLabelBinarizer(classes = important_terms)
dk = pd.DataFrame(mlb.fit_transform(dj['iR']), columns=important_terms)
dl = dj.join(dk)

dl


Unnamed: 0,T,iR,cross-platform mobile (mostly javascript),activemq,slf4j,xampp,spark,loki,docker swarm,nhibernate,...,amazon kinesis,network address translation,information security,amazon ec2,neo4j,mockup,markdown,final cut pro,metal,clr
0,hamburg - ...,"[ big data , spark , natural language proces...",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,up to london the company th...,"[ deep learning , machine learning , python ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,the company by combining advanced ...,"[ data warehousing , shell , database , r ,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,nyc - base salary bonus ...,"[ spark , mllib , java , hive , mxnet , s...",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,role senior data scientistsalary £45 000 - £...,"[ tableau , agile , , business intelligenc...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823,'piper companies is currently looking for a d...,"[ big data , spark , amazon web services , ...",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
824,a major international retailer is looking for ...,"[ keras , deep learning , c , c++ , , te...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
825,role data scientistlocatio...,"[ r , amazon web services , scikit-learn , ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
826,data scientistsalary up to £69 300location l...,"[ tableau , agile , , business intelligenc...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


* join target variable

In [329]:
target_filtered = target_filtered.drop(columns ='description')

In [330]:
out = dl.join(target_filtered)

# pickle out the transformed data for use elsewhere

In [331]:
out.to_pickle('data/tech_encoded_data_data_scientist'+str(date.today())+".pkl")