In [1]:
library(NLP)
library(tm)
library(SnowballC)
library(qdap)
library("fastDummies")
library(caret)
library(stringr)

df <- read.csv("job_skill_short.csv")

drops <- c("X","Company")
df <- df[ , !(names(df) %in% drops)]

df["Location"] <- as.factor(df[["Location"]])
df["Category"] <- as.factor(df[["Category"]])

#initialize new columns 
df$min_bs <-0
df$min_ms <- 0 
df$min_mba <-0 
df$min_phd <-0 
df$min_jd <-0

df$pref_bs <-0
df$pref_ms <- 0 
df$pref_mba <-0 
df$pref_phd <-0 
df$pref_jd <-0
##minimum qualifications columns
df$min_bs <- ifelse(grepl("BS|BA|Bachelors|Bachelor's", df$Minimum.Qualifications), 1, 0)
df$min_ms <- ifelse(grepl("MS|MA|MST|Masters|Master's|MFA",df$Minimum.Qualifications),1,0)
df$min_mba <- ifelse(grepl("MBA",df$Minimum.Qualifications),1,0)
df$min_phd <-ifelse(grepl("PhD|Ph.D",df$Minimum.Qualifications),1,0)
df$min_jd <-ifelse(grepl("JD|J.D.",df$Minimum.Qualifications),1,0)

## preferred qualification columns
df$pref_bs <- ifelse(grepl("BS|BA|Bachelors|Bachelor's", df$Preferred.Qualifications), 1, 0)
df$pref_ms <- ifelse(grepl("MS|MA|MST|Masters|Master's|MFA",df$Preferred.Qualifications),1,0)
df$pref_mba <- ifelse(grepl("MBA",df$Preferred.Qualifications),1,0)
df$pref_phd <-ifelse(grepl("PhD|Ph.D",df$Preferred.Qualifications),1,0)
df$pref_jd <-ifelse(grepl("JD|J.D.",df$Preferred.Qualifications),1,0)

regexp <- "\\d+\\b(?=\\syears)"
df$min_years_exp<-as.numeric(str_extract(df$Minimum.Qualifications, regexp))
df$min_years_exp[is.na(df$min_years_exp)]<-0

regexp2 <- "\\d+\\b(?=\\syears)" 
df$pref_years_exp<-as.numeric(str_extract(df$Preferred.Qualifications, regexp2))
df$pref_years_exp[is.na(df$pref_years_exp)]<-0

tfFromFeature <- function(source_df, colname){
    corpus = VCorpus(VectorSource(source_df[[colname]]))
    corpus = tm_map(corpus, stripWhitespace)
    corpus = tm_map(corpus, removePunctuation)
    corpus = tm_map(corpus, content_transformer(tolower))
    corpus = tm_map(corpus, removeWords, stopwords("en"))

    corpus = tm_map(corpus, stemDocument)
    df = as.matrix(removeSparseTerms(DocumentTermMatrix(corpus), .99))
    colnames(df) <- lapply(colnames(df), function(word){return(paste(colname, "_", word, collapse =""))})
    return(df)
}

text_features <- c("Title", "Responsibilities", "Minimum.Qualifications", "Preferred.Qualifications")
df_bow <- df[ , !(names(df) %in% text_features)]
for (x in text_features){
    df_bow <- cbind(df_bow, tfFromFeature(df, x))
}
df_bow

Loading required package: qdapDictionaries
Loading required package: qdapRegex
Loading required package: qdapTools
Loading required package: RColorBrewer

Attaching package: ‘qdap’

The following objects are masked from ‘package:tm’:

    as.DocumentTermMatrix, as.TermDocumentMatrix

The following object is masked from ‘package:NLP’:

    ngrams

The following object is masked from ‘package:base’:

    Filter

Loading required package: lattice
Loading required package: ggplot2

Attaching package: ‘ggplot2’

The following object is masked from ‘package:qdapRegex’:

    %+%

The following object is masked from ‘package:NLP’:

    annotate


Attaching package: ‘stringr’

The following object is masked from ‘package:qdap’:

    %>%



Category,Location,min_bs,min_ms,min_mba,min_phd,min_jd,pref_bs,pref_ms,pref_mba,⋯,Preferred.Qualifications _ wide,Preferred.Qualifications _ willing,Preferred.Qualifications _ within,Preferred.Qualifications _ work,Preferred.Qualifications _ worker,Preferred.Qualifications _ write,Preferred.Qualifications _ written,Preferred.Qualifications _ writtenverb,Preferred.Qualifications _ xml,Preferred.Qualifications _ year
Program Management,Singapore,1,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,0,0,0,0
Manufacturing & Supply Chain,"Shanghai, China",1,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Technical Solutions,"New York, NY, United States",0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Developer Relations,"Mountain View, CA, United States",1,0,0,0,0,0,0,0,⋯,0,0,0,2,0,0,0,0,0,0
Program Management,"Sunnyvale, CA, United States",1,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
Technical Solutions,"Dublin, Ireland",1,0,0,0,0,0,0,0,⋯,0,0,1,4,0,0,0,0,0,0
Hardware Engineering,"Mountain View, CA, United States",1,0,0,0,0,0,1,0,⋯,0,0,0,0,0,0,0,0,0,0
Partnerships,"Sunnyvale, CA, United States",1,0,0,0,0,1,0,0,⋯,0,0,0,1,0,0,0,0,0,2
Manufacturing & Supply Chain,"Xinyi District, Taiwan",1,0,0,0,0,1,0,1,⋯,0,1,0,1,0,0,0,0,0,0
Technical Solutions,"New York, NY, United States",1,0,0,0,0,0,1,0,⋯,0,0,0,3,0,0,0,0,0,1


In [2]:
set.seed(123)
trainIndex <- createDataPartition(df_bow[["Category"]], p = .8, list = FALSE, times = 1)

train <- df_bow[trainIndex,]
test <- df_bow[-trainIndex,]

fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 1)

In [3]:
params = data.frame(n.trees = c(60,80,100,120,140), 
                    interaction.depth = c(3,3,3,3,3), 
                    shrinkage = c(0.1, 0.1, 0.1, 0.1, 0.1),
                    n.minobsinnode = c(10,10,10,10,10))
gbmFit1 <- train(Category~., data = train, method = "gbm", trControl = fitControl, tuneGrid = params)
gbmFit1

“variable 80: LocationThe Dalles, OR, United States has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        3.1355             nan     0.1000    1.1252
     2        2.2071             nan     0.1000    0.4296
     3        1.8893             nan     0.1000    0.2506
     4        1.6908             nan     0.1000    0.2020
     5        1.5314             nan     0.1000    0.1815
     6        1.3941             nan     0.1000    0.1376
     7        1.2749             nan     0.1000    0.1023
     8        1.1769             nan     0.1000    0.0874
     9        1.0921             nan     0.1000    0.0638
    10        1.0151             nan     0.1000    0.0640
    20        0.5612             nan     0.1000    0.0076
    40        0.2470             nan     0.1000   -0.0050
    60        0.1328             nan     0.1000   -0.0041
    80        0.0799             nan     0.1000   -0.0033
   100        0.0505             nan     0.1000   -0.0032
   120        0.0331             nan     0.1000   -0.0014
   140        

“variable 90: LocationZagreb, Croatia has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        3.1355             nan     0.1000    1.2424
     2        2.2086             nan     0.1000    0.4636
     3        1.8949             nan     0.1000    0.2850
     4        1.6741             nan     0.1000    0.2191
     5        1.5055             nan     0.1000    0.1538
     6        1.3739             nan     0.1000    0.1249
     7        1.2680             nan     0.1000    0.1141
     8        1.1667             nan     0.1000    0.0932
     9        1.0820             nan     0.1000    0.0901
    10        1.0024             nan     0.1000    0.0563
    20        0.5529             nan     0.1000    0.0117
    40        0.2293             nan     0.1000   -0.0051
    60        0.1200             nan     0.1000   -0.0051
    80        0.0731             nan     0.1000   -0.0031
   100        0.0470             nan     0.1000   -0.0025
   120        0.0316             nan     0.1000   -0.0016
   140        

“variable 84: LocationVilnius, Lithuania has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        3.1355             nan     0.1000    1.3431
     2        2.1389             nan     0.1000    0.3565
     3        1.8702             nan     0.1000    0.2901
     4        1.6565             nan     0.1000    0.2256
     5        1.4868             nan     0.1000    0.1786
     6        1.3506             nan     0.1000    0.1273
     7        1.2418             nan     0.1000    0.1043
     8        1.1390             nan     0.1000    0.0771
     9        1.0586             nan     0.1000    0.0669
    10        0.9861             nan     0.1000    0.0475
    20        0.5368             nan     0.1000    0.0097
    40        0.2157             nan     0.1000   -0.0010
    60        0.1131             nan     0.1000   -0.0055
    80        0.0652             nan     0.1000   -0.0032
   100        0.0405             nan     0.1000   -0.0019
   120        0.0262             nan     0.1000   -0.0014
   140        

“variable 80: LocationThe Dalles, OR, United States has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        3.1355             nan     0.1000    1.3309
     2        2.1709             nan     0.1000    0.3931
     3        1.8814             nan     0.1000    0.2918
     4        1.6801             nan     0.1000    0.1905
     5        1.5192             nan     0.1000    0.1854
     6        1.3800             nan     0.1000    0.1331
     7        1.2617             nan     0.1000    0.1119
     8        1.1555             nan     0.1000    0.1007
     9        1.0625             nan     0.1000    0.0718
    10        0.9902             nan     0.1000    0.0531
    20        0.5391             nan     0.1000    0.0022
    40        0.2263             nan     0.1000   -0.0098
    60        0.1222             nan     0.1000   -0.0028
    80        0.0714             nan     0.1000   -0.0034
   100        0.0453             nan     0.1000   -0.0021
   120        0.0293             nan     0.1000   -0.0012
   140        

“variable 83: LocationVienna, Austria has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        3.1355             nan     0.1000    1.2507
     2        2.2194             nan     0.1000    0.4023
     3        1.8985             nan     0.1000    0.2662
     4        1.6767             nan     0.1000    0.1976
     5        1.5070             nan     0.1000    0.1904
     6        1.3701             nan     0.1000    0.1297
     7        1.2578             nan     0.1000    0.1046
     8        1.1584             nan     0.1000    0.0677
     9        1.0773             nan     0.1000    0.0676
    10        1.0036             nan     0.1000    0.0557
    20        0.5638             nan     0.1000   -0.0005
    40        0.2442             nan     0.1000   -0.0066
    60        0.1327             nan     0.1000   -0.0056
    80        0.0808             nan     0.1000   -0.0035
   100        0.0534             nan     0.1000   -0.0020
   120        0.0370             nan     0.1000   -0.0016
   140        

“variable 80: LocationThe Dalles, OR, United States has no variation.”

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        3.1355             nan     0.1000    1.3944
     2        2.1103             nan     0.1000    0.3682
     3        1.8295             nan     0.1000    0.2792
     4        1.6332             nan     0.1000    0.2125
     5        1.4818             nan     0.1000    0.1529
     6        1.3598             nan     0.1000    0.1345
     7        1.2507             nan     0.1000    0.0898
     8        1.1639             nan     0.1000    0.0877
     9        1.0876             nan     0.1000    0.0779
    10        1.0121             nan     0.1000    0.0569
    20        0.5775             nan     0.1000    0.0080
    40        0.2557             nan     0.1000   -0.0062
    60        0.1462             nan     0.1000   -0.0026
    80        0.0928             nan     0.1000   -0.0035



Stochastic Gradient Boosting 

 990 samples
1656 predictors
  23 classes: 'Administrative', 'Business Strategy', 'Data Center & Network', 'Developer Relations', 'Finance', 'Hardware Engineering', 'IT & Data Management', 'Legal & Government Relations', 'Manufacturing & Supply Chain', 'Marketing & Communications', 'Network Engineering', 'Partnerships', 'People Operations', 'Product & Customer Support', 'Program Management', 'Real Estate & Workplace Services', 'Sales & Account Management', 'Sales Operations', 'Software Engineering', 'Technical Infrastructure', 'Technical Solutions', 'Technical Writing', 'User Experience & Design' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 1 times) 
Summary of sample sizes: 794, 792, 791, 792, 791 
Resampling results across tuning parameters:

  n.trees  Accuracy   Kappa    
   60      0.8131514  0.7969371
   80      0.8152127  0.7989886
  100      0.8061164  0.7892483
  120      0.8091161  0.7924280
  140      0.8081623  0.7913701



In [None]:
m_pls <- train(Category~., data=train, method="widekernelpls", trControl = fitControl,
               tuneGrid = data.frame(ncomp = c(50,55,60,65,70,75,80,85,90)), maxit = 1000)
m_pls

In [None]:
m_log <- train(Category~., data=train, method="regLogistic", tuneLength = 10, loss = L1, trControl = fitControl)
m_log

In [None]:
m_gauss <- train(Category~., data=train, method='gaussprLinear')
m_gauss

In [None]:
categories = levels(df_bow$Category)
models = vector(,length(categories))
for (i in 1:length(categories)){
    cat_name <- paste("Category_", categories[i])
    y <- ifelse(as.character(df_bow$Category) == categories[i], 1, 0)
    y <- as.factor(y)
    x <- df_bow[ , !(names(df_bow) == "Category")]
    ith_model <- train(x = x, y = y,
                       method='adaboost', trControl = fitControl, tuneLength = 1, verbose = TRUE)
    models[i] <- ith_model
    print(ith_model)
}

In [4]:
logistic_l1 <- train(Category~., data=train, method='regLogistic', 
                   trControl = fitControl, tuneGrid = data.frame(cost = c(2.0), epsilon = c(.001), loss = c("L1")))

$TypeDetail
[1] "L1-regularized logistic regression (L1R_LR)"

$Type
[1] 6

$W
                                 LocationAnn Arbor, MI, United States
Program Management                                                  0
Manufacturing & Supply Chain                                        0
Technical Solutions                                                 0
Developer Relations                                                 0
Hardware Engineering                                                0
Partnerships                                                        0
Product & Customer Support                                          0
Software Engineering                                                0
Data Center & Network                                               0
Business Strategy                                                   0
Technical Writing                                                   0
Technical Infrastructure                                            0
IT & Data M

In [10]:
gbmImp <- varImp(gbmFit1, scale = FALSE)
gbmImp

ERROR: Error in relative.influence(object, n.trees = numTrees): could not find function "relative.influence"


In [6]:
svmFit <- train(Category~., data = train, 
                 method = "svmRadial", 
                 trControl = fitControl,
                 preProc = c("center", "scale"),
                 tuneLength = 8,
                 metric = "ROC")

ERROR: Error: Class probabilities are needed to score models using the area under the ROC curve. Set `classProbs = TRUE` in the trainControl() function.


In [11]:
logistic_l1$finalModel

$TypeDetail
[1] "L1-regularized logistic regression (L1R_LR)"

$Type
[1] 6

$W
                                 LocationAnn Arbor, MI, United States
Program Management                                                  0
Manufacturing & Supply Chain                                        0
Technical Solutions                                                 0
Developer Relations                                                 0
Hardware Engineering                                                0
Partnerships                                                        0
Product & Customer Support                                          0
Software Engineering                                                0
Data Center & Network                                               0
Business Strategy                                                   0
Technical Writing                                                   0
Technical Infrastructure                                            0
IT & Data M

In [53]:
coeffs = logistic_l1$finalModel$W
categories = levels(df_bow$Category)
selected_vars = setNames(data.frame(matrix(ncol = length(categories), nrow = 1)), categories)
for (feature in coeffs){
#    print(feature)
}

features = colnames(coeffs)
#logistic_l1$finalModel$obsLevels