In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer 
import warnings
warnings.filterwarnings('ignore')

In [3]:
# The twelve tag names; their order defines the column order of the label matrix.
tags = [
    "part-time-job",
    "full-time-job",
    "hourly-wage",
    "salary",
    "associate-needed",
    "bs-degree-needed",
    "ms-or-phd-needed",
    "licence-needed",
    "1-year-experience-needed",
    "2-4-years-experience-needed",
    "5-plus-years-experience-needed",
    "supervising-job",
]

# Tab-separated training file; the notebook uses its `tags` and `description` columns.
data = pd.read_csv('train.tsv', sep='\t')
data.head()

Unnamed: 0,tags,description
0,licence-needed supervising-job 5-plus-years-ex...,THE COMPANY Employer is a midstream service...
1,2-4-years-experience-needed salary full-time-job,ICR Staffing is now accepting resumes for Indu...
2,part-time-job,This is a great position for the right person....
3,licence-needed,A large multi-specialty health center is expan...
4,5-plus-years-experience-needed full-time-job b...,JOB PURPOSE: The Account Director is respon...


#### Preprocessing data

In [4]:
sno = SnowballStemmer(language='english')


def _stem_text(text):
    """Lower-case ``text`` and replace every space-separated token by its stem.

    Splitting on a single space (not arbitrary whitespace) keeps runs of
    spaces intact after the tokens are joined back together.
    """
    stemmed = []
    for word in text.lower().split(' '):
        # The original code decoded every token, i.e. it expected byte strings
        # (Python 2). Decode only when needed so text strings also work.
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        stemmed.append(sno.stem(word))
    return ' '.join(stemmed)


# Rows yielded by DataFrame.iterrows() may be copies, so assigning to them is
# not guaranteed to change `data`. Write the result back as a whole column.
# NOTE: re-running this cell stems the already-stemmed text again.
data['description'] = data['description'].apply(_stem_text)

In [5]:
# Binary indicator matrix: y[row, k] == 1 iff posting `row` carries tags[k].
y = []

for tags_list in data.tags:
    # A missing tag cell shows up as a float (NaN): treat it as "no tags".
    # Otherwise split the whitespace-separated string so that tags are matched
    # as whole tokens rather than as substrings of the raw string.
    present = set() if isinstance(tags_list, float) else set(tags_list.split())
    y.append([1 if tag in present else 0 for tag in tags])

# reshape keeps the (n_rows, n_tags) layout even when `data` is empty.
y = np.array(y, dtype=int).reshape(-1, len(tags))

Unfortunately, the Indeed contest has already finished, so I have to use the training dataset for both training and evaluation. My own implementation of the evaluation function is below; it is the same metric that was used during the contest.

In [6]:
# Hold out 20% of the postings for evaluation. The seed is fixed so that the
# split, and therefore every score and sample shown below, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(data.description, y, test_size=0.2, random_state=42)

In [7]:
def score(predictions, true_values, tags):
    """Micro-averaged precision, recall and F1 over a set of binary tags.

    Parameters
    ----------
    predictions, true_values : 2-D array-like of 0/1, shape (n_rows, n_columns)
        Predicted and reference indicator matrices; shapes must match.
    tags : sized
        Only ``len(tags)`` is used: the first ``len(tags)`` columns are scored.

    Returns
    -------
    (precision, recall, f1) as floats. A ratio whose denominator is zero is
    reported as 0.0 instead of raising ZeroDivisionError. If the two shapes
    differ, an error message is printed and None is returned.
    """
    predictions = np.asarray(predictions)
    true_values = np.asarray(true_values)
    if predictions.shape != true_values.shape:
        print('Error: dimensions must be the same')
        return

    n_tags = len(tags)
    predicted = predictions[:, :n_tags]
    actual = true_values[:, :n_tags]

    # Counts are pooled over all rows and all scored columns (micro-average).
    STP = int(np.sum((actual == 1) & (predicted == 1)))
    SFP = int(np.sum((actual == 0) & (predicted == 1)))
    SFN = int(np.sum((actual == 1) & (predicted == 0)))

    P = float(STP) / (STP + SFP) if (STP + SFP) else 0.0
    R = float(STP) / (STP + SFN) if (STP + SFN) else 0.0
    F1 = 2 * P * R / (P + R) if (P + R) else 0.0
    return P, R, F1

### LinearSVC

In [8]:
def CV_score(clf_pipeline, num_cv=10):
    """Average cross-validated F1 of ``clf_pipeline`` across all tags.

    Each tag is treated as its own binary problem on the notebook-level
    ``X_train`` / ``y_train``. The ``num_cv``-fold F1 scores are averaged per
    tag, and the per-tag means are averaged again.
    """
    per_tag_f1 = [
        cross_val_score(clf_pipeline, X_train, y_train[:, tag_idx],
                        scoring='f1', cv=num_cv, n_jobs=-1).mean()
        for tag_idx in range(len(tags))
    ]
    return sum(per_tag_f1) / len(per_tag_f1)

In [9]:
# Candidate hyper-parameters for the manual grid search.
choose_C = list(range(5, 13))
choose_use_idf = [False]  # True doesn't help

# Keep the combination with the highest mean per-tag CV F1.
best_score = 0
best_params = {}
for c in choose_C:
    for use_idf in choose_use_idf:
        clf_pipeline = Pipeline(steps=[
            ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
            ("tfidf", TfidfTransformer(use_idf=use_idf)),
            ("classifier", LinearSVC(C=c)),
        ])
        current_score = CV_score(clf_pipeline, num_cv=5)

        if current_score > best_score:
            best_score = current_score
            best_params = dict(C=c, use_idf=use_idf)

In [10]:
# Report the winner of the grid search above.
print('best average score on CV is % s with params %s' % (best_score, best_params))

best average score on CV is 0.421795632206 with params {'use_idf': False, 'C': 10}


In [11]:
def predict_y_test(clf_pipeline):
    """Fit one binary model per tag and score the stacked test predictions.

    ``clf_pipeline`` is refitted on the notebook-level ``X_train`` /
    ``y_train`` for every tag in turn and used to predict ``X_test``.

    Returns ``(predictions, score(predictions, y_test, tags))`` where
    ``predictions`` has one column per tag.
    """
    per_tag = []
    for tag_idx in range(len(tags)):
        fitted = clf_pipeline.fit(X_train, y_train[:, tag_idx])
        per_tag.append(fitted.predict(X_test))
    # One prediction vector per tag -> transpose to (n_test_rows, n_tags).
    predictions = np.array(per_tag).T
    return predictions, score(predictions, y_test, tags)

In [14]:
# Rebuild the pipeline with the best grid-search parameters and evaluate it
# on the held-out split, one binary model per tag.
clf_pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("tfidf", TfidfTransformer(use_idf=best_params['use_idf'])),
    ("classifier", LinearSVC(C=best_params['C'])),
])

predictions, score_test = predict_y_test(clf_pipeline)

print('precision %s recall %s f1-score %s' % score_test)

precision 0.633579725449 recall 0.443131462334 f1-score 0.521512385919


### Multiclass classification

In [15]:
# Column indices (into `tags` / `y`) of the tag groups that are modelled as
# one multiclass problem each. With the `tags` list defined above these
# evaluate to [0, 1], [5, 6] and [8, 9, 10].
tags_job = [tags.index(name) for name in ("part-time-job", "full-time-job")]
tags_edu = [tags.index(name) for name in ("bs-degree-needed", "ms-or-phd-needed")]
tags_exp = [tags.index(name) for name in ("1-year-experience-needed",
                                          "2-4-years-experience-needed",
                                          "5-plus-years-experience-needed")]

In [16]:
def predict_multi(tags_subset, clf_pipeline):
    """Model a group of tags as a single multiclass problem.

    Parameters
    ----------
    tags_subset : list of int
        Column indices into ``y_train`` / ``y_test``. Class ``k + 1`` stands
        for ``tags_subset[k]``; class 0 means "none of these tags".
    clf_pipeline : estimator with ``fit`` / ``predict``
        Fitted here on the notebook-level ``X_train``.

    Returns
    -------
    (predictions_binary, local_score)
        ``predictions_binary`` has shape ``(len(y_test), len(tags_subset))``
        with at most one 1 per row. ``local_score`` is the result of
        ``score`` restricted to the columns listed in ``tags_subset``.
    """
    n_classes = len(tags_subset)

    # Encode the training labels. If a row carries several tags of the group,
    # the last one in `tags_subset` wins.
    y_curr_tags = np.zeros(y_train.shape[0])
    for class_id, tag_number in enumerate(tags_subset, 1):
        y_curr_tags[y_train[:, tag_number] == 1] = class_id

    clf_pipeline.fit(X_train, y_curr_tags)
    predictions = clf_pipeline.predict(X_test)

    # Decode class ids back to one indicator column per tag. This works for a
    # group of any size, not just two or three tags.
    predictions_binary = np.zeros((len(y_test), n_classes))
    for class_id in range(1, n_classes + 1):
        predictions_binary[predictions == class_id, class_id - 1] = 1

    # Select exactly the listed columns, so the indices need not be contiguous.
    local_score = score(predictions_binary, y_test[:, tags_subset], tags_subset)

    return predictions_binary, local_score

In [21]:
# Job-type group (tags_job) as one multiclass problem.
clf_pipeline_multi_job = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("tfidf", TfidfTransformer(use_idf=False)),
    ("classifier", LinearSVC(C=10)),
])
predictions_job, local_score = predict_multi(tags_job, clf_pipeline_multi_job)
# local_score is the (precision, recall, f1) tuple returned by score().
print('job local f1-score: %s' % (local_score,))

job local f1-score: (0.643312101910828, 0.42083333333333334, 0.5088161209068011)


In [22]:
# Education group (tags_edu) as one multiclass problem.
clf_pipeline_multi_edu = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("tfidf", TfidfTransformer(use_idf=False)),
    ("classifier", LinearSVC(C=10)),
])

predictions_edu, local_score = predict_multi(tags_edu, clf_pipeline_multi_edu)
# local_score is the (precision, recall, f1) tuple returned by score().
print('education local f1-score: %s' % (local_score,))

education local f1-score: (0.7487684729064039, 0.7638190954773869, 0.7562189054726368)


In [23]:
# Experience group (tags_exp) as one multiclass problem.
clf_pipeline_multi_exp = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("tfidf", TfidfTransformer(use_idf=False)),
    ("classifier", LinearSVC(C=10)),
])
predictions_exp, local_score = predict_multi(tags_exp, clf_pipeline_multi_exp)
# local_score is the (precision, recall, f1) tuple returned by score().
print('experience local f1-score %s' % (local_score,))

experience local f1-score (0.4835443037974684, 0.4775, 0.4805031446540881)


Treat every tag as a separate binary classification problem. Which of them score badly?
And let's find the most important feature for each tag.

In [25]:
# One binary model per tag: collect its test predictions, print its F1 and
# the n-gram with the largest positive coefficient.
other_predictions = []

clf_pipeline_rest = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("tfidf", TfidfTransformer(use_idf=False)),
    ("classifier", LinearSVC(C=10)),
])

for t in range(len(tags)):
    local_prediction = clf_pipeline_rest.fit(X_train, y_train[:, t]).predict(X_test)
    other_predictions.append(local_prediction)

    print('%s %s' % (tags[t], f1_score(y_test[:, t], local_prediction)))

    # argmax over the flattened coefficient array gives a vocabulary column index.
    most_imp = np.argmax(clf_pipeline_rest.named_steps['classifier'].coef_)
    vocabulary = clf_pipeline_rest.named_steps['vectorizer'].vocabulary_
    # vocabulary_ maps term -> column index; search it for the winning column.
    for term, column in vocabulary.items():
        if column == most_imp:
            print("'most important feature': %s" % term)

# One prediction vector per tag -> transpose to (n_test_rows, n_tags).
other_predictions = np.array(other_predictions).T

part-time-job 0.514285714286
'most important feature': part time
full-time-job 0.470149253731
'most important feature': full
hourly-wage 0.530303030303
'most important feature': hour
salary 0.411764705882
'most important feature': salari
associate-needed 0.1
'most important feature': associ degre
bs-degree-needed 0.774869109948
'most important feature': bachelor
ms-or-phd-needed 0.153846153846
'most important feature': master
licence-needed 0.472727272727
'most important feature': licens
1-year-experience-needed 0.0
'most important feature': one year
2-4-years-experience-needed 0.436363636364
'most important feature': year
5-plus-years-experience-needed 0.561403508772
'most important feature': year
supervising-job 0.558303886926
'most important feature': supervis


We see that the tags about years of experience (2-4 and 5-plus years) share the same most important feature. This explains the bad scores on these tags. Multiclass classification on the experience tags should help.

In [29]:
# Use all three multiclass models (job, edu, exp); binary models fill the rest.
matrix_predictions = np.concatenate(
    [
        predictions_job,              # tag columns 0-1
        other_predictions[:, 2:5],    # tag columns 2-4
        predictions_edu,              # tag columns 5-6
        other_predictions[:, 7:8],    # tag column 7
        predictions_exp,              # tag columns 8-10
        other_predictions[:, 11:12],  # tag column 11
    ],
    axis=1,
)
score(matrix_predictions, y_test, tags)

(0.603578154425612, 0.47271386430678464, 0.5301902398676592)

In [30]:
# Use the job and edu multiclass models; binary models for everything else.
matrix_predictions = np.concatenate(
    [
        predictions_job,            # tag columns 0-1
        other_predictions[:, 2:5],  # tag columns 2-4
        predictions_edu,            # tag columns 5-6
        other_predictions[:, 7:],   # tag columns 7 onwards
    ],
    axis=1,
)
score(matrix_predictions, y_test, tags)

(0.6242171189979123, 0.4410029498525074, 0.5168539325842697)

In [31]:
# Use the exp and edu multiclass models; binary models for everything else.
matrix_predictions = np.concatenate(
    [
        other_predictions[:, :5],     # tag columns 0-4
        predictions_edu,              # tag columns 5-6
        other_predictions[:, 7:8],    # tag column 7
        predictions_exp,              # tag columns 8-10
        other_predictions[:, 11:12],  # tag column 11
    ],
    axis=1,
)
score(matrix_predictions, y_test, tags)

(0.6069364161849711, 0.4646017699115044, 0.5263157894736842)

The following combinations were also tried but did not help:

In [32]:
# Use the job and exp multiclass models; binary models for everything else.
matrix_predictions = np.concatenate(
    [
        predictions_job,              # tag columns 0-1
        other_predictions[:, 2:8],    # tag columns 2-7
        predictions_exp,              # tag columns 8-10
        other_predictions[:, 11:12],  # tag column 11
    ],
    axis=1,
)
score(matrix_predictions, y_test, tags)

(0.604739336492891, 0.47050147492625366, 0.5292409788469514)

In [33]:
# Use only the job multiclass model; binary models for tag columns 2 onwards.
matrix_predictions = np.concatenate(
    [predictions_job, other_predictions[:, 2:]],
    axis=1,
)
score(matrix_predictions, y_test, tags)

(0.6256572029442692, 0.4387905604719764, 0.5158214130905937)

In [34]:
# Use only the edu multiclass model; binary models for everything else.
matrix_predictions = np.concatenate(
    [
        other_predictions[:, :5],  # tag columns 0-4
        predictions_edu,           # tag columns 5-6
        other_predictions[:, 7:],  # tag columns 7 onwards
    ],
    axis=1,
)
score(matrix_predictions, y_test, tags)

(0.6284796573875803, 0.43289085545722716, 0.5126637554585153)

In [35]:
# Use only the exp multiclass model; binary models for everything else.
matrix_predictions = np.concatenate(
    [
        other_predictions[:, :8],     # tag columns 0-7
        predictions_exp,              # tag columns 8-10
        other_predictions[:, 11:12],  # tag column 11
    ],
    axis=1,
)
score(matrix_predictions, y_test, tags)

(0.6081474296799224, 0.46238938053097345, 0.5253456221198156)

Let's look at the predictions for a few descriptions from X_test.

In [40]:
def get_tags(prediction):
    """Turn one row of 0/1 predictions into a space-separated tag string.

    ``prediction[k]`` corresponds to the notebook-level ``tags[k]``. The names
    of all positions equal to 1 are joined in ``tags`` order.
    """
    return ' '.join(
        name for position, name in enumerate(tags) if prediction[position] == 1
    )

In [36]:
# Rebuild the "all multiclass" combination so that the sample predictions
# inspected below come from it.
matrix_predictions = np.concatenate(
    [
        predictions_job,              # tag columns 0-1
        other_predictions[:, 2:5],    # tag columns 2-4
        predictions_edu,              # tag columns 5-6
        other_predictions[:, 7:8],    # tag column 7
        predictions_exp,              # tag columns 8-10
        other_predictions[:, 11:12],  # tag column 11
    ],
    axis=1,
)
score(matrix_predictions, y_test, tags)

(0.603578154425612, 0.47271386430678464, 0.5301902398676592)

In [56]:
# Predicted tags for the held-out posting at position 25.
get_tags(matrix_predictions[25, :])

'bs-degree-needed 2-4-years-experience-needed'

In [57]:
# Stemmed description of the held-out posting at position 25.
X_test.iloc[25]

u'junior web design will primarili assist in process a high volum of product imag follow a strict guid standard. this posit work with the sr. design on market collater such as advertisements, sale promotions, web support and email projects, as well as imag retouching. the posit initi design, primarili work under creativ direct from senior team members. must have creativ abil and be abl to follow instructions, work on multipl project simultaneously, and maintain attent to detail. the posit requir keen typograph skill as well as full knowledg of prepar file for export and web. the appropri candid will thrive in a fast-paced, deadline-ori environment.  *responsibilities:* * crop and basic photo retouch of product images. * manag multipl workflow in a deadlin driven environment. * strong communic skill * assist maintain e-commerc website, promot materi and email market campaigns.\u200b  * formul layout for design & visual communic that will detail style, fonts, artwork, graphics.\u200b * m

In [48]:
# Predicted tags for the held-out posting at position 4.
get_tags(matrix_predictions[4, :])

'salary supervising-job'

In [52]:
# Stemmed description of the held-out posting at position 4.
X_test.iloc[4]

u'passag malibu, a drug and alcohol treatment center is seek an experienc controller.   you will be primarili respons for financi reporting, budgeting, project models, busi plans, and manag a staff of 6.   the posit requir a hand on approach with staff both in and out of your department. leadership and the abil to problem solv is a must.   minimum of 5 year controller/senior account execut experi is required. activ cpa required; mba preferred. healthcar experi is a plus, but not mandatory.   ideally, you will have work in a compani that has over 100 employe and $50 mil in revenue.   this role offer a competit base salary, health benefits, paid vacat and holidays.  the posit provid an outstand work environ in malibu.  salari up to $150k base on experience.   qualifi candid are encourag to respond to this job posting. pleas enclos a current resum and cover letter detail salari history.   all inquir are held in strict confidence.'

In [53]:
# Predicted tags for the held-out posting at position 10.
get_tags(matrix_predictions[10, :])

'full-time-job hourly-wage'

In [54]:
# Stemmed description of the held-out posting at position 10.
X_test.iloc[10]

u'we are in need of b-lingual candid to work in a inbound call center to answer question from the general public regard govern programs. work shift between 7:00 a.m and 7:00 p.m this is a long, long term temporari assign with paid vacation, paid holidays, and referr fees. assign in a profession environ downtown baltimor access to public transportation.  . this is not a telemarket position, collect or sales. this is a genuin custom servic position. in downtown baltimor work 40 hour a week this is a sever year contract. pay rate is $12.91 per hour.  if you have excel custom servic skills, speak spanish fluentli and abl to read and speak english with keyboard skill .  you do not have to have call center experience. if you work in the food industry, hospit  field, and retail but do not have ani experi work in an offic this is your opportun to go in to offic work. pleas submit your resum as soon as possibl call center open on august 1st, paid train in july.'