# 0. Импорт данных

In [1]:
import zipfile
from itertools import chain

import kaggle
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, precision_score, recall_score, f1_score
)
from sklearn.model_selection import HalvingRandomSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

np.random.seed(0)
%matplotlib inline

In [7]:
kaggle.api.authenticate()
kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.')

with zipfile.ZipFile('real-or-fake-fake-jobposting-prediction.zip', 'r') as zip_ref:
    zip_ref.extractall('real-or-fake-job')

In [2]:
data = pd.read_csv(r'real-or-fake-job\fake_job_postings.csv', index_col='job_id')
data

Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0
17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


# 1. Анализ данных
Таблица состоит из 18 столбцов:

    * job_id - Unique Job ID
    * title - The title of the job ad entry.
    * location - Geographical location of the job ad.
    * department - Corporate department (e.g. sales).
    * salary_range - Indicative salary range (e.g. $50,000-$60,000)
    * company_profile - A brief company description.
    * description - The details description of the job ad.
    * requirements - Enlisted requirements for the job opening.
    * benefits - Enlisted offered benefits by the employer.
    * telecommuting - True for telecommuting positions.
    * has_company_logo - True if company logo is present.
    * has_questions - True if screening questions are present.
    * employment_type - Full-type, Part-time, Contract, etc.
    * required_experience - Executive, Entry level, Intern, etc.
    * required_education - Doctorate, Master’s Degree, Bachelor, etc.
    * industry - Automotive, IT, Health care, Real estate, etc.
    * function - Consulting, Engineering, Research, Sales etc.
    * fraudulent target - Classification attribute.

Признаки разделяются на следующие типы и в зависимости от типа признака будут по разному обработаны:

    * числовые (salary_range) - взятие среднего значение и нормализация.
    * текстовые (title, location, department, company_profile, description, requirements, benefits, industry, function) - векторизация.
    * категориальные (employment_type, required_experience, required_education) - one-hot encoding.
    * бинарные (telecommuting, has_company_logo, has_questions)

Посмотрим, сколько значенйи пропущено в каждом столбце:

In [3]:
for col in data:
    print(col, data[data[col].isna()].shape)

title (0, 17)
location (346, 17)
department (11547, 17)
salary_range (15012, 17)
company_profile (3308, 17)
description (1, 17)
requirements (2695, 17)
benefits (7210, 17)
telecommuting (0, 17)
has_company_logo (0, 17)
has_questions (0, 17)
employment_type (3471, 17)
required_experience (7050, 17)
required_education (8105, 17)
industry (4903, 17)
function (6455, 17)
fraudulent (0, 17)


Признак *salary_range* лучше удалить, т.к. более 80% значений пропущены. Но, в случае, если его использовать, то необходимо использовать слежующий алгоритм обработки:

    1) Разделить на два числа по тире
    2) Преобразовать месяцы в числа. Здесь есть какая-то ошибка автокодирования, из-за которого появляются значения типа 10-Oct, Oct-20, Dec-25, на самом деле это 10-10, 10-20, 12-25.
    3) Если число  меньше 1000, то это значение нужно умножить на 1000
    4) Взять среднее
    5) Так как разброс значений большой (от 0 до 1000000000), то все значения, которые больше, чем 95% заменить на значение этого квантиля, чтобы избавиться от больших выбросов. Так как нормализация чувствительна к сильным выбросам.
    6) Нормализация по минимальному и максимальному значениям.

In [4]:
textual_data = data[['title', 'location', 'department', 'company_profile',
                     'description', 'requirements', 'benefits', 'industry', 'function']]

tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit(chain.from_iterable(textual_data.fillna('-').values))
print(f'Length of vocab is {tfidf.vocabulary_.__len__()}')

Length of vocab is 120960


In [5]:
def transform_categorical(data):
    res = []
    for col in ['employment_type', 'required_experience', 'required_education']:
        res.append(pd.get_dummies(data[col]))
    return pd.concat(res, axis=1).astype('float64')

def transform_bool(data):
    return data[['telecommuting', 'has_company_logo', 'has_questions']].astype('float64')

def transform_textual(data):
    textual_data = data[['title', 'location', 'department', 'company_profile',
                         'description', 'requirements', 'benefits', 'industry', 'function']]
    textual_data = textual_data.fillna('-').agg(' '.join, axis=1)
    return tfidf.transform(textual_data).astype('float64')


categorical_transformer = FunctionTransformer(transform_categorical)
bool_transformer = FunctionTransformer(transform_bool)
textual_transformer = FunctionTransformer(transform_textual)

text_union = FeatureUnion([
    ('categorical_features', categorical_transformer),
    ('bool_features', bool_transformer),
    ('textual_features', textual_transformer)
])

cat_features = pd.concat([transform_categorical(data), transform_bool(data)], axis=1)
text_features = text_union.transform(data)
text_features

<17880x120988 sparse matrix of type '<class 'numpy.float64'>'
	with 4091551 stored elements in Compressed Sparse Row format>

## Корреляции

In [6]:
corr = cat_features.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Contract,Full-time,Other,Part-time,Temporary,Associate,Director,Entry level,Executive,Internship,Mid-Senior level,Not Applicable,Associate Degree,Bachelor's Degree,Certification,Doctorate,High School or equivalent,Master's Degree,Professional,Some College Coursework Completed,Some High School Coursework,Unspecified,Vocational,Vocational - Degree,Vocational - HS Diploma,telecommuting,has_company_logo,has_questions
Contract,1.0,-0.415882,-0.034614,-0.065933,-0.03568,-0.048956,-0.037284,0.011821,-0.00457,-0.014528,-0.018912,-0.012521,-0.023405,0.219652,0.017564,-0.011649,-0.082642,-0.01921,0.024,-0.001846,-0.011871,-0.018712,-0.008339,-0.005593,-0.00685,0.061905,0.027283,0.100808
Full-time,-0.415882,1.0,-0.154497,-0.294282,-0.159253,0.18789,0.098207,0.081002,0.049531,-0.08412,0.275069,0.023131,0.049567,0.177493,0.00546,0.028009,0.064447,0.072059,0.012617,-0.033142,-0.040906,0.090043,0.004834,0.000644,0.016471,-0.035566,0.027778,-0.063046
Other,-0.034614,-0.154497,1.0,-0.024493,-0.013255,-0.025622,0.00021,-0.007315,-0.004462,0.180442,-0.041919,0.069863,-0.006013,-0.031249,0.004333,-0.004327,0.016504,-0.004246,-0.00731,0.071005,-0.00441,0.033996,0.013167,-0.002078,-0.002545,0.010508,0.009244,0.034353
Part-time,-0.065933,-0.294282,-0.024493,1.0,-0.025248,-0.020563,-0.030354,0.142197,-0.019257,0.114508,-0.084581,0.046217,0.034829,-0.09778,0.009557,-0.008243,0.182819,-0.015359,0.007183,0.066403,0.005559,0.013863,0.019781,0.010839,-0.004847,0.01446,9.7e-05,0.001679
Temporary,-0.03568,-0.159253,-0.013255,-0.025248,1.0,0.02035,-0.017432,0.004944,-0.010421,0.184266,-0.045416,0.011945,-0.002737,-0.033585,0.028531,-0.004461,0.001459,-0.001953,-0.007535,0.049105,-0.004546,0.070788,0.021705,0.024339,-0.002623,-0.019959,0.023239,0.012122
Associate,-0.048956,0.18789,-0.025622,-0.020563,0.02035,1.0,-0.057256,-0.161814,-0.034229,-0.056651,-0.199755,-0.09906,0.133074,0.132927,0.022667,0.016053,-0.012622,-0.001599,0.019509,0.017524,-0.014931,0.068215,0.034227,-0.007034,-0.001164,0.002859,0.09163,0.079075
Director,-0.037284,0.098207,0.00021,-0.030354,-0.017432,-0.057256,1.0,-0.062853,-0.013296,-0.022005,-0.077591,-0.038478,0.006363,0.06526,-0.00671,0.014431,-0.051718,0.121941,0.014272,-0.006206,-0.0058,-0.000562,-0.007818,-0.002732,-0.003347,0.011944,0.018649,0.009754
Entry level,0.011821,0.081002,-0.007315,0.142197,0.004944,-0.161814,-0.062853,1.0,-0.037576,-0.06219,-0.219283,-0.108744,0.019935,-0.022805,-0.005866,-0.007881,0.428551,-0.043279,-0.022302,0.009575,0.076173,0.03801,0.010788,0.009342,0.004476,0.022602,0.068194,-0.048193
Executive,-0.00457,0.049531,-0.004462,-0.019257,-0.010421,-0.034229,-0.013296,-0.037576,1.0,-0.013155,-0.046386,-0.023003,-0.005975,0.03272,-0.008735,0.046378,-0.030376,0.078518,0.0238,-0.006753,-0.003467,-0.014175,-0.004674,-0.001633,-0.002001,0.034175,-0.009618,0.009697
Internship,-0.014528,-0.08412,0.180442,0.114508,0.184266,-0.056651,-0.022005,-0.06219,-0.013155,1.0,-0.076771,-0.038071,0.000509,-0.013374,0.001507,-0.005631,-0.006429,0.010625,0.008585,0.230511,-0.005738,0.017651,-0.000327,-0.002703,-0.003311,0.012722,0.037425,0.026845


Слабая корреляция (0.3 < |R| < 0.5) присутствует между признаками:

    * Contract и Full-time (-0.42) - взаимоисключающие опции одного признака
    * High School or equivalent и Entry level (0.43) - выпускников часто ищут именно на начальные позиции


In [7]:
target = data['fraudulent']
target.value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

Классы несбалансированы почти в **20** раз!

# 2. Модели

Попробуем обучить классификаторы только на текстовых признаках, только на категориальных, и на совмещенных признаках.

In [8]:
def train_clf(clf_class, X, y, **kwargs):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    clf = clf_class(**kwargs)
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)

def experiment(clf, **kwargs):
    print(f'{clf.__name__} on categorical features accuracy:'
          f' {train_clf(clf, cat_features, target, **kwargs):.3f}')
    print(f'{clf.__name__} on textual features accuracy:'
          f' {train_clf(clf, transform_textual(data), target, **kwargs):.3f}')
    print(f'{clf.__name__} on union features accuracy:'
          f' {train_clf(clf, text_features, target, **kwargs):.3f}')


experiment(LogisticRegression, random_state=0)
experiment(KNeighborsClassifier)
experiment(RandomForestClassifier, random_state=0)

LogisticRegression on categorical features accuracy: 0.958
LogisticRegression on textual features accuracy: 0.976
LogisticRegression on union features accuracy: 0.976
KNeighborsClassifier on categorical features accuracy: 0.957
KNeighborsClassifier on textual features accuracy: 0.982
KNeighborsClassifier on union features accuracy: 0.979
RandomForestClassifier on categorical features accuracy: 0.957
RandomForestClassifier on textual features accuracy: 0.984
RandomForestClassifier on union features accuracy: 0.985


Все-таки все признаки вносят вклад в модель, поэтому лучшее качество **0.985** на совмещенных признаках у леса деревьев.

## Важность признаков

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    text_features, target, test_size=0.2, random_state=0)

logreg = LogisticRegression(random_state=0)
logreg.fit(X_train, y_train)
logreg_features = logreg.coef_[0]
logreg_top10 = np.argpartition(logreg_features, -10)[-10:]

for k, v in tfidf.vocabulary_.items():
    if v in logreg_top10:
        print(f'Word {k} has weight {logreg_features[v]}')

Word AccountDependent has weight 2.7701397757986217
Word 000Meal has weight 2.7159860617747893
Word Adops has weight 3.5695161365494954
Word Olympic has weight 1.8607027883058316
Word Arbeitsumfeld has weight 2.3739897779532417
Word easement has weight 1.8697318746051776
Word youthExperience has weight 2.3652728394439255
Word engineeringSolid has weight 2.1003938412940135
Word industryCompetitive has weight 2.0890663928513975
Word oldPreferably has weight 1.8075386728494434


In [10]:
random_forest = RandomForestClassifier(random_state=0)
random_forest.fit(X_train, y_train)
rf_features = random_forest.feature_importances_
rf_top10 = np.argpartition(rf_features, -10)[-10:]

for k, v in tfidf.vocabulary_.items():
    if v in rf_top10:
        print(f'Word {k} has weight {rf_features[v]}')

Word walking has weight 0.003031792882986422
Word AWSCreating has weight 0.0029608718509696952
Word owner has weight 0.005076562889752473
Word Albert has weight 0.0028765710489904823
Word redeemable has weight 0.003839936915774571
Word enunciation has weight 0.003484200462736079
Word Nobilissimus has weight 0.003096621478906701
Word Adops has weight 0.005667907379732615
Word 000Incentivised has weight 0.006201302971007807
Word ofertą has weight 0.0028392508995288986


Что касается важности фичей, в данном случае отдельных слов, то большие значения получают какие-то рандомные токены, которые, вероятно, только 1 раз встретились в корпусе.

## Метрики

In [11]:
def get_metrics(y_true, y_pred):
    print(f'Accuracy {accuracy_score(y_true, y_pred):.3f}')
    print(f'Precision {precision_score(y_true, y_pred):.3f}')
    print(f'Recall {recall_score(y_true, y_pred):.3f}')
    print(f'F1-Score {f1_score(y_true, y_pred):.3f}')
    print(classification_report(y_true, y_pred))

get_metrics(y_test, logreg.predict(X_test))

Accuracy 0.976
Precision 0.924
Recall 0.477
F1-Score 0.629
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3423
           1       0.92      0.48      0.63       153

    accuracy                           0.98      3576
   macro avg       0.95      0.74      0.81      3576
weighted avg       0.97      0.98      0.97      3576



In [12]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
get_metrics(y_test, knn.predict(X_test))

Accuracy 0.979
Precision 0.802
Recall 0.686
F1-Score 0.739
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3423
           1       0.80      0.69      0.74       153

    accuracy                           0.98      3576
   macro avg       0.89      0.84      0.86      3576
weighted avg       0.98      0.98      0.98      3576



In [13]:
get_metrics(y_test, random_forest.predict(X_test))

Accuracy 0.985
Precision 1.000
Recall 0.654
F1-Score 0.791
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3423
           1       1.00      0.65      0.79       153

    accuracy                           0.99      3576
   macro avg       0.99      0.83      0.89      3576
weighted avg       0.99      0.99      0.98      3576



По полученным метрикам самой "сбалансированной" моделью является лес деревьев.

# 3. Балансировка классов

In [14]:
target.value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

In [15]:
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(text_features, target)
y_resampled.value_counts()

1    17012
0    12543
Name: fraudulent, dtype: int64

In [16]:
score = train_clf(RandomForestClassifier, X_resampled, y_resampled, random_state=0)
print(f'Accuracy after classes balancing {score:.3f}')

Accuracy after classes balancing 0.999


# 4. Подбор гиперпраметров

In [17]:
clf = RandomForestClassifier(random_state=0)
param_distributions = {
    'max_depth': randint(5, 100),
    'min_samples_split': randint(10, 100),
    'min_samples_leaf': randint(10, 100)
}

search = HalvingRandomSearchCV(
    clf, param_distributions, resource='n_estimators',
    max_resources=100, random_state=0, verbose=1
).fit(X_resampled, y_resampled)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 1
max_resources_: 100
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 100
n_resources: 1
Fitting 5 folds for each of 100 candidates, totalling 500 fits
----------
iter: 1
n_candidates: 34
n_resources: 3
Fitting 5 folds for each of 34 candidates, totalling 170 fits
----------
iter: 2
n_candidates: 12
n_resources: 9
Fitting 5 folds for each of 12 candidates, totalling 60 fits
----------
iter: 3
n_candidates: 4
n_resources: 27
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 4
n_candidates: 2
n_resources: 81
Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [18]:
search.best_score_, search.best_params_

(0.9928269328370835,
 {'max_depth': 87,
  'min_samples_leaf': 10,
  'min_samples_split': 24,
  'n_estimators': 81})

# 5. CatBoost

In [20]:
from catboost import CatBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=0)
model = CatBoostClassifier(
    iterations=50,
    custom_loss='Accuracy'
)
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=1,
    plot=True
)
print('Model is fitted: ' + str(model.is_fitted()))
print('Model params:')
print(model.get_params())

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.25478
0:	learn: 0.4883003	test: 0.4844507	best: 0.4844507 (0)	total: 8.88s	remaining: 7m 14s
1:	learn: 0.3584132	test: 0.3555083	best: 0.3555083 (1)	total: 13.2s	remaining: 5m 17s
2:	learn: 0.2972923	test: 0.2972605	best: 0.2972605 (2)	total: 17.8s	remaining: 4m 39s
3:	learn: 0.2481425	test: 0.2477771	best: 0.2477771 (3)	total: 21.1s	remaining: 4m 2s
4:	learn: 0.2154389	test: 0.2150227	best: 0.2150227 (4)	total: 24.7s	remaining: 3m 42s
5:	learn: 0.1906420	test: 0.1906798	best: 0.1906798 (5)	total: 29.1s	remaining: 3m 33s
6:	learn: 0.1751110	test: 0.1752472	best: 0.1752472 (6)	total: 32.7s	remaining: 3m 20s
7:	learn: 0.1632956	test: 0.1637465	best: 0.1637465 (7)	total: 35.9s	remaining: 3m 8s
8:	learn: 0.1509821	test: 0.1510663	best: 0.1510663 (8)	total: 39s	remaining: 2m 57s
9:	learn: 0.1414864	test: 0.1428801	best: 0.1428801 (9)	total: 42.5s	remaining: 2m 49s
10:	learn: 0.1328346	test: 0.1342772	best: 0.1342772 (10)	total: 46.6s	remaining: 2m 45s
11:	learn: 0.124