In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from sklearn.metrics import (roc_auc_score, roc_curve, auc, accuracy_score, classification_report, 
confusion_matrix, accuracy_score)

from sklearn.model_selection import (cross_val_score, cross_val_predict, train_test_split, 
StratifiedKFold, GridSearchCV)

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier

from sklearn.svm import SVC

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.pipeline import Pipeline

from textblob import TextBlob, Word

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

In [2]:
import nltk

# Fetch every NLTK resource the notebook uses so it runs on a fresh machine.
nltk.download('punkt')      # tokenizer models used by word_tokenize
# NOTE(review): recent NLTK releases load 'punkt_tab' instead of 'punkt' for
# word_tokenize; downloading both keeps old and new versions working.
nltk.download('punkt_tab')
nltk.download('stopwords')  # required by stopwords.words('english') below
nltk.download('wordnet')    # required by WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/mbp15/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mbp15/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Purpose of this model: use regression on the job-description text to predict
# job salary and surface the key features (skills) that drive it, so that a
# monetary value can be associated with each specific skill. The top skills
# are then selected manually from the results and fed to a random forest
# regression to estimate the value of each one (Question 2, part 4).

# The first CSV column holds the row labels, so it becomes the index.
df = pd.read_csv('./seek3.csv', index_col=0)

In [4]:
# Preview the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,job_title,job_location,job_salary,advertiser,search_category,url,job_post_date,job_desc,salary_desc,super,payt2,pay_total2
1,Data Science Delivery Lead,Sydney,165000.0,Intellify,Data Science,www.seek.com.au/job/38267034?type=standout&sea...,5 Feb 2019,About Intellify Intellify is a data science an...,"$150,000 - $180,000 package",1.0,165000.0,165000.0
3,Machine Learning Engineer / Scientist - Optimi...,Sydney,160000.0,Infopeople,Data Science,www.seek.com.au/job/38178979?type=standard&sea...,23 Jan 2019,This role is a newly created role to work with...,Circa upto 160k based on skills,1.0,160000.0,160000.0
4,Data Analytics & Data Science Team Leader,Sydney,115046.5,AUSTRALIAN FEDERAL POLICE,Data Science,www.seek.com.au/job/38194317?type=standout&sea...,25 Jan 2019,The mission of the AFP is to provide dynamic a...,"$110,172.00 - $119,921.00",1.0,115046.5,115046.5
8,Senior Data Science (Loyalty) :: $170K + Super...,Sydney,170000.0,Correlate Resources,Data Science,www.seek.com.au/job/38153321?type=standout&sea...,21 Jan 2019,Senior Data Scientist (Loyalty) :: $170K + Sup...,$170K+Super+ Bonus,1.1,170000.0,187000.0
9,Senior Data Scientist,Sydney,140000.0,Preacta Recruitment,Data Science,www.seek.com.au/job/38265180?type=standard&sea...,5 Feb 2019,Currently working for one of Australia’s leadi...,"$120,000 to $160,000",1.0,140000.0,140000.0
11,Data Scientist,Sydney,100000.0,Talent – Winner ‘Seek Large Recruitment Agency...,Data Science,www.seek.com.au/job/38112245?type=standout&sea...,15 Jan 2019,Data Scientist - Cognitive Science & AIJoin th...,$80000.00 - $120000 per annum,1.0,100000.0,100000.0
15,Data Scientist,Sydney,145000.0,Precision Sourcing,Data Science,www.seek.com.au/job/38140287?type=standout&sea...,18 Jan 2019,The CompanyThis is one of the most exciting co...,$130k - $160k p.a.,1.0,145000.0,145000.0
18,Senior Business Intelligence Consultant,Sydney,85000.0,AI Australia,Data Science,www.seek.com.au/job/38225977?type=standout&sea...,30 Jan 2019,As a leading Australian owned Artificial Intel...,"$75,000 - $95,000",1.0,85000.0,85000.0
27,Research Scientist - Blockchain and Software ...,Sydney,167500.0,CSIRO,Data Science,www.seek.com.au/job/38200815?type=standout&sea...,25 Jan 2019,Apply your expertise in computer science/soft...,$134K - $201K + up to 15.4% super,1.1,167500.0,184250.0
31,Fisheries Economist - Pacific Islands Fisherie...,Sydney,75074.5,Employment Office,Data Science,www.seek.com.au/job/38064522?type=standard&sea...,10 Jan 2019,About the OpportunityFFA has an exciting oppor...,"USD $64,018-$86,131 + accommodation & relocation!",1.0,75074.5,75074.5


In [5]:
# Keep only the title, advertiser, description text and total pay; every
# other column is discarded before modelling.
df = df.drop(
    columns=[
        'job_location',
        'url',
        'search_category',
        'job_post_date',
        'job_salary',
        'salary_desc',
        'super',
        'payt2',
    ]
)

In [6]:
# Preview the first rows of the trimmed frame instead of rendering all of it.
df.head(10)

Unnamed: 0,job_title,advertiser,job_desc,pay_total2
1,Data Science Delivery Lead,Intellify,About Intellify Intellify is a data science an...,165000.0
3,Machine Learning Engineer / Scientist - Optimi...,Infopeople,This role is a newly created role to work with...,160000.0
4,Data Analytics & Data Science Team Leader,AUSTRALIAN FEDERAL POLICE,The mission of the AFP is to provide dynamic a...,115046.5
8,Senior Data Science (Loyalty) :: $170K + Super...,Correlate Resources,Senior Data Scientist (Loyalty) :: $170K + Sup...,187000.0
9,Senior Data Scientist,Preacta Recruitment,Currently working for one of Australia’s leadi...,140000.0
11,Data Scientist,Talent – Winner ‘Seek Large Recruitment Agency...,Data Scientist - Cognitive Science & AIJoin th...,100000.0
15,Data Scientist,Precision Sourcing,The CompanyThis is one of the most exciting co...,145000.0
18,Senior Business Intelligence Consultant,AI Australia,As a leading Australian owned Artificial Intel...,85000.0
27,Research Scientist - Blockchain and Software ...,CSIRO,Apply your expertise in computer science/soft...,184250.0
31,Fisheries Economist - Pacific Islands Fisherie...,Employment Office,About the OpportunityFFA has an exciting oppor...,75074.5


In [7]:
# Delete punctuation and symbols from the job descriptions in a single pass.
# Each character is removed outright (not replaced by a space), so "and/or"
# becomes "andor" and "you'll" becomes "youll"; the extra stop words added
# later in the notebook contain those joined forms.
# str.translate treats every character literally, which avoids the
# regex-vs-literal ambiguity of Series.str.replace (its `regex` default changed
# in pandas 2.0) for metacharacters such as "$", "+", "(", ")" and ".".
CHARS_TO_STRIP = ",$-’/+&'@;%•!#:()."
df['job_desc'] = df['job_desc'].str.translate(str.maketrans('', '', CHARS_TO_STRIP))

In [8]:
# Model input: the job-description text column.
X = df['job_desc']

In [9]:
# Regression target: the pay_total2 column.
y = df['pay_total2']

In [10]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [11]:
# Base English stop-word list from NLTK. It is passed to TfidfVectorizer via
# stop_words=stop and extended in place further down the notebook.
stop = stopwords.words('english')

In [12]:
class LemmaTokenizer:
    """Callable tokenizer that splits text with NLTK and lemmatizes each token.

    Instances are passed as ``tokenizer=`` to ``TfidfVectorizer``.
    """

    def __init__(self):
        # A single lemmatizer is created once and reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmas for the tokens found in ``articles``."""
        lemmas = []
        for token in word_tokenize(articles):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [None]:
# Tune the random-forest hyperparameters on TF-IDF features of the job text.
# The vectoriser is fitted once on X_train; GridSearchCV then cross-validates
# the forest on the resulting matrix and refits the best setting (refit=True).
model2 = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer(),
                                  ngram_range=(1, 2),
                                  stop_words=stop,
                                  max_features=150,
                                  max_df=0.5,
                                  sublinear_tf=True)),
        # random_state pins the forests so the selected best_params_ are
        # repeatable from one run to the next.
        ('gs', GridSearchCV(RandomForestRegressor(random_state=42),
                            param_grid={'n_estimators': [520, 535, 550],
                                        'max_depth': [15, 20, 25]},
                            cv=5,
                            refit=True)),
    ]
)

model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)

# Score of the refit best estimator on the held-out rows.
print(model2.score(X_test, y_test))

tfidf_step = model2.named_steps['tfidf']
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
get_names = getattr(tfidf_step, 'get_feature_names_out', None) or tfidf_step.get_feature_names
print("Number of features:", len(get_names()))

print(model2.named_steps['gs'].best_params_)


In [18]:
def RandomForest():
    """Fit a TF-IDF + random-forest pipeline and print its diagnostics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``stop`` at call time, so any stop words added before the call are
    honoured. Prints, in order: the pipeline's score on the test split, the
    number of TF-IDF features, and the 60 features with the highest forest
    importance. Returns ``None``.
    """
    model2 = Pipeline(
        steps=[
            ('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer(),
                                      ngram_range=(1, 2),
                                      stop_words=stop,
                                      max_features=150,
                                      max_df=0.2,
                                      sublinear_tf=True)),
            ('rfr', RandomForestRegressor(n_estimators=550,
                                          max_depth=25,
                                          random_state=42)),
        ]
    )

    model2.fit(X_train, y_train)

    print(model2.score(X_test, y_test))

    vectorizer = model2.named_steps['tfidf']
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = list(get_names())
    print("Number of features:", len(feature_names))

    # One row per TF-IDF feature, ranked once by the forest's importance.
    features = pd.DataFrame({
        'feature': feature_names,
        'importance': model2.named_steps['rfr'].feature_importances_,
    }).sort_values('importance', ascending=False)
    print(features.head(60))
    

In [19]:
# Add extra tokens to the stop-word list so the TF-IDF step ignores them.
# This mutates `stop` in place: RandomForest() reads the global `stop` when it
# is called, so it sees these additions. Re-running this cell appends the same
# entries again (duplicates).
# NOTE(review): the leading entries ("'d", 'doe', 'ha', 'wa', ...) look like
# tokenizer/lemmatizer output for words such as "does"/"has"/"was" -- confirm.
stop.extend(["'d", "'ll", "'re", "'s", "'ve", 'could', 'doe', 'ha', 'might', 'must', "n't", 'need', 'sha', 
             'wa', 'wo', 'would', 'business', 'analyst', 'test', 'change', 'program', 'government', 'engineer',
             'contract', 'account', 'month', 'documentation', 'detail', 'implementation', 'deliver', 'following',
             'sale', 'identify', 'multiple', 'delivering', '02', 'plan', 'document', 'experienced', 'current', 'end',
             'growth', 'financial', 'service', 'button', 'demonstrated', 'cbd', 'nsw', '·', 'big', 'enterprise',
             'user', 'diverse', 'cv', 'testing', 'finance', 'problem', 'issue', 'skills', 'experience', 'future',
             'create', 'risk', 'engagement', 'policy', 'andor', 'experience', 'working', 'discussion', 'improve',
             'software', 'please', 'apply', 'meet', 'model', 'standard', 'skill', 'impact', 'initiative', 
             'sound', '5', 'confidential', 'qualification', 'intelligence', 'focus', 'training', 'understand',
             'essential', 'seeking', 'currently', 'operational', 'employment', '–', "''", 'outcome', 'administration',
             'super', 'case', 'advice', 'platform', 'send', 'best', 'framework', 'control', '2', 'ideal', 'review',
             'insurance', '3', 'effective', 'resource', 'analyse', 'implement', 'insight', 'department', 'attention',
             'assist', 'number', 'via', 'engineering', 'analytical', 'click', 'leader', 'call', 'operation',
             'infrastructure', 'strategic', 'operation', 'ability', 'work', 'use', 'challenge', 'relevant', 'source',
             'exposure', 'digital', 'available', 'capability', 'success', 'etc', 'minimum', 'data', 'analysis',
             'considered', 'international', 'maintain', 'expertise', 'internal', 'external', 'recruitment', 'target',
             'focused', 'great', 'next', 'proven', 'improvement', 'consulting', 'around', 'state', 'performing',
             'provider', 'asset', 'u', 'practice', 'campaign', 'background', 'closely', 'innovative', 'centre',
             'transformation', '?', 'developing', 'set', 'related', 'interested', 'successful', 'candidate',
             'medium', '6', 'approach', 'area', 'right', 'range', 'able', 'managing', 'eg', 'criterion', 
             'equivalent', 'budget', 'duty', 'format', 'dynamic', 'resume', 'interest', 'person', 'day', 'desirable',
             'start', 'offer', 'preferred', 'life', 'location', 'culture', 'critical', 'help', 'order', 'functional',
             'different', '4', 'get', 'regarded', 'methodology', 'perform', 'agency', 'compliance', 'ideally', 
             'operating', 'status', 'benefit', 'idea', 'various', 'make', 'writing', 'detailed', 'rate', 'fast',
             'corporate', 'track', 'planning', 'developer', 'unit', 'value', 'decision', 'someone', 'growing',
             'mapping', 'banking', 'national', 'used', 'procedure', 'directly', 'previous', 'extensive', 'major',
             'passionate', 'time', 'commercial', 'email', 'exciting', 'technique', 'youll', 'production', 'bonus', 
             'full', 'sector', 'good', 'assessment', 'concept', 'link', 'implementing', 'high', 'level', 'positive',
             'team', 'member', 'learn', 'communicate', 'key', 'stakeholder', 'basis', 'purpose', 'salary', 'equal',
             'group', 'demonstrate', 'providing', 'verbal', 'ensuring', ',', '1', '``', 'youre', 'architecture',
             'objective', 'significant', 'fantastic', 'specification', '2019', 'activity', 'ongoing', 'place', 
             'solving', 'brand', 'making', 'daily', 'written', 'communication', 'task', 'due', 'degree', 'see',
             'find', 'leadership', 'clear', 'integrity', 'latest', 'supportive', 'learning', 'wide', 'whats', 
             'relation', 'offering', 'accounting', 'expert', 'solid', 'maintaining', 'local', 'manager',
             'step', 'engage', 'record', 'web', 'monitoring', 'initial', 'note', 'power', 'balance', 'important', 
             'api', 'investment', 'reference', 'flexible', 'applicant', 'small', 'world', 'involved', 'profile',
             'present', 'personal', 'discipline', 'vendor'])


In [20]:
# Fit the fixed-hyperparameter forest and print its test score, feature count
# and the top-ranked features.
RandomForest()


0.18331329880477054
Number of features: 150
                feature  importance
5                 agile    0.068299
74          integration    0.057619
13                   ba    0.049483
14                 bank    0.045184
148                word    0.042148
114          regulatory    0.035620
108  project management    0.034265
21                cloud    0.033526
120            security    0.032134
127                 sql    0.029107
91            modelling    0.028922
11           automation    0.022667
110              python    0.022289
57                excel    0.017494
77           javascript    0.016980
18             capacity    0.014839
42             database    0.012819
144       visualisation    0.011660
6             analytics    0.010520
16                   bi    0.010060
93              network    0.009768
117              retail    0.009767
105             prepare    0.008879
99              payment    0.008690
76                 java    0.007894
138              tim