In [1]:
#load python packages
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

In [2]:
#This is a hack to cut down on the pandas/sklearn warnings.
#My notebook runs fine so I wanted to make it more readable.
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [3]:
def get_file(file_name):
    path = os.getcwd()
    path += "/../data/processed/"+file_name
    #have to eval text columns (which are currently lists of strings) or pandas will read a list as one big string instead
    df = pd.read_csv(path, index_col='job_id',\
                     converters={'company_profile': eval,'description': eval, 'requirements':eval})
    return df

In [4]:
X_train = get_file('fake_job_postings_X_train.csv')
y_train = get_file('fake_job_postings_y_train.csv')
X_test = get_file('fake_job_postings_X_test.csv')
y_test = get_file('fake_job_postings_y_test.csv')

Model: Naive Bayes

In [5]:
def MultinomialNB_predict(df_col,y):
    df_col_together = df_col.str.join(sep=' ')
    cv = CountVectorizer()
    df_col_tf = cv.fit_transform(df_col_together)
    mnb = MultinomialNB()
    return cross_val_score(mnb,df_col_tf,y, n_jobs=-1, scoring='f1').mean()

In [6]:
word_features = ['company_profile', 'description', 'requirements'] 
for feature in word_features:
    print('feature: '+feature+'\nF1: ' +str(MultinomialNB_predict(X_train[feature],y_train))+'\n')

feature: company_profile
F1: 0.3661781462868491

feature: description
F1: 0.36058733496247414

feature: requirements
F1: 0.21513278384806314



In [7]:
combined_text_cols = X_train['company_profile']+ X_train['description']+ X_train['requirements']
print('feature: '+'all text combined'+'\nF1: ' +str(MultinomialNB_predict(combined_text_cols,y_train))+'\n')

feature: all text combined
F1: 0.45366096507074855



Model: Logistic Regression

In [8]:
X_train_no_words = X_train.drop(word_features, axis=1)
logreg_model = LogisticRegression()
cross_val_score(logreg_model,X_train_no_words,y_train, n_jobs=-1, scoring='f1').mean()

0.34650865035930445

Model: Random Forest

In [9]:
randforest_model = RandomForestClassifier()
cross_val_score(randforest_model,X_train_no_words,y_train, n_jobs=-1, scoring='f1').mean()

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.7130882941573236

Conclusion: Best model and performance on hold-out data

Our best model so far (measured by F1 score) has been the out of the box RandomForestClassifier from sklearn, using the non-word data. I'd like to end the modelling section by training this model on all of our training data and then scoring based on the test data.

In [10]:
final_model = RandomForestClassifier()
final_model.fit(X_train_no_words, y_train)
X_test_no_words = X_test.drop(word_features, axis=1)
y_pred = final_model.predict(X_test_no_words)
print(metrics.classification_report(y_test, y_pred))

  


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3777
           1       0.97      0.59      0.73       196

    accuracy                           0.98      3973
   macro avg       0.97      0.80      0.86      3973
weighted avg       0.98      0.98      0.98      3973



In [11]:
print(metrics.confusion_matrix(y_test,y_pred))

[[3773    4]
 [  80  116]]


The performance is about as good as I was hoping for!

In [27]:
importances = final_model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")

for f in range(X_train_no_words.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, list(X_train_no_words.columns)[(indices[f])], importances[indices[f]]))

Feature ranking:
1. feature description_length (0.129250)
2. feature company_profile_length (0.108250)
3. feature requirements_length (0.105511)
4. feature has_company_logo (0.046858)
5. feature title_data (0.031137)
6. feature has_benefits (0.028322)
7. feature has_department (0.025121)
8. feature function_Administrative (0.024366)
9. feature has_questions (0.024172)
10. feature has_required_experience (0.021834)
11. feature Country_US (0.018993)
12. feature has_salary_range (0.018954)
13. feature has_required_education (0.016935)
14. feature State_Other (0.016856)
15. feature function_Other (0.016587)
16. feature industry_Other (0.015864)
17. feature employment_type_Part-time (0.015566)
18. feature employment_type_Full-time (0.014963)
19. feature function_Engineering (0.014862)
20. feature City_  (0.013349)
21. feature State_ CA (0.012292)
22. feature City_Other (0.011942)
23. feature employment_type_Other (0.011622)
24. feature Country_Other (0.009854)
25. feature industry_Hospital 