In [1]:
import warnings

warnings.simplefilter(action='ignore')

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score, classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [3]:
sns.set()
tqdm.pandas()

In [4]:
DATA_PATH = 'data/dataset8.pkl'
RANDOM_STATE = 42

# Загрузка данных

In [5]:
df = pd.read_pickle(DATA_PATH)
df.head()

Unnamed: 0,id,responsibilities_bigrams,class
0,9495846,"(работа, на, строительных, площадках)",2.0
1,9495846,"(на, строительных, площадках, очистных)",2.0
2,9495846,"(строительных, площадках, очистных, сооружениях)",2.0
3,9495850,"(гнутье, арматурной, стали, на)",2.0
4,9495850,"(арматурной, стали, на, механических)",2.0


In [6]:
vacancies = df['id'].unique()
df.index = df['id']

In [7]:
kf = KFold(n_splits=5)
results = pd.DataFrame()

## LogisticRegression(random_state=RANDOM_STATE) + CountVectorizer

In [8]:
kfold_results_acc = []
kfold_results_f1 = []
for i, (train_index, test_index) in enumerate(kf.split(vacancies)):
    print(f"Fold {i}:")
    
    train = df.loc[vacancies[train_index]]
    test = df.loc[vacancies[test_index]]
    print(f'Train shape: {train.shape}, test shape: {test.shape}')

    count_vect = CountVectorizer(preprocessor=lambda x:x,
                                 tokenizer=lambda x:x)
    X_train = count_vect.fit_transform(doc[:-1] for doc in train['responsibilities_bigrams'])

    lr = LogisticRegression(random_state=RANDOM_STATE)
    lr.fit(X_train, train['class'])

    X_test = count_vect.transform(test['responsibilities_bigrams'])
    pred = lr.predict(X_test)
    kfold_results_acc.append(balanced_accuracy_score(test['class'], pred))
    kfold_results_f1.append(f1_score(test['class'], pred, average='weighted'))
    print('==' * 20)
    print(classification_report(test['class'], pred))
    print('==' * 20)

mdl = f'LogisticRegression(random_state={RANDOM_STATE})'
results.loc['balanced_accuracy_score', mdl] = np.mean(kfold_results_acc)
results.loc['f1_score', mdl] = np.mean(kfold_results_f1)

Fold 0:
Train shape: (61297, 3), test shape: (19220, 3)
              precision    recall  f1-score   support

         0.0       0.77      0.68      0.72      2644
         1.0       0.74      0.89      0.81      7523
         2.0       0.85      0.73      0.78      9053

    accuracy                           0.79     19220
   macro avg       0.78      0.77      0.77     19220
weighted avg       0.79      0.79      0.79     19220

Fold 1:
Train shape: (62564, 3), test shape: (17953, 3)
              precision    recall  f1-score   support

         0.0       0.75      0.58      0.66      3135
         1.0       0.82      0.84      0.83      8223
         2.0       0.73      0.79      0.76      6595

    accuracy                           0.77     17953
   macro avg       0.77      0.74      0.75     17953
weighted avg       0.77      0.77      0.77     17953

Fold 2:
Train shape: (62016, 3), test shape: (18501, 3)
              precision    recall  f1-score   support

         0.0   

# LogisticRegression(random_state=RANDOM_STATE, max_iter=500, n_jobs=-1) + CountVectorizer

In [9]:
kfold_results_acc = []
kfold_results_f1 = []
for i, (train_index, test_index) in enumerate(kf.split(vacancies)):
    print(f"Fold {i}:")
    
    train = df.loc[vacancies[train_index]]
    test = df.loc[vacancies[test_index]]
    print(f'Train shape: {train.shape}, test shape: {test.shape}')

    count_vect = CountVectorizer(preprocessor=lambda x:x,
                                 tokenizer=lambda x:x)
    X_train = count_vect.fit_transform(doc[:-1] for doc in train['responsibilities_bigrams'])

    lr = LogisticRegression(random_state=RANDOM_STATE, max_iter=500, n_jobs=-1)
    lr.fit(X_train, train['class'])

    X_test = count_vect.transform(test['responsibilities_bigrams'])
    pred = lr.predict(X_test)
    kfold_results_acc.append(balanced_accuracy_score(test['class'], pred))
    kfold_results_f1.append(f1_score(test['class'], pred, average='weighted'))
    print('==' * 20)
    print(classification_report(test['class'], pred))
    print('==' * 20)

mdl = f'LogisticRegression(random_state={RANDOM_STATE}, max_iter=500)'
results.loc['balanced_accuracy_score', mdl] = np.mean(kfold_results_acc)
results.loc['f1_score', mdl] = np.mean(kfold_results_f1)

Fold 0:
Train shape: (61297, 3), test shape: (19220, 3)
              precision    recall  f1-score   support

         0.0       0.77      0.68      0.72      2644
         1.0       0.74      0.90      0.81      7523
         2.0       0.85      0.73      0.78      9053

    accuracy                           0.79     19220
   macro avg       0.79      0.77      0.77     19220
weighted avg       0.80      0.79      0.79     19220

Fold 1:
Train shape: (62564, 3), test shape: (17953, 3)
              precision    recall  f1-score   support

         0.0       0.75      0.58      0.65      3135
         1.0       0.81      0.83      0.82      8223
         2.0       0.73      0.79      0.76      6595

    accuracy                           0.77     17953
   macro avg       0.76      0.73      0.75     17953
weighted avg       0.77      0.77      0.77     17953

Fold 2:
Train shape: (62016, 3), test shape: (18501, 3)
              precision    recall  f1-score   support

         0.0   

# LinearSVC(random_state=RANDOM_STATE) + CountVectorizer

In [10]:
kfold_results_acc = []
kfold_results_f1 = []
for i, (train_index, test_index) in enumerate(kf.split(vacancies)):
    print(f"Fold {i}:")
    
    train = df.loc[vacancies[train_index]]
    test = df.loc[vacancies[test_index]]
    print(f'Train shape: {train.shape}, test shape: {test.shape}')

    count_vect = CountVectorizer(preprocessor=lambda x:x,
                                 tokenizer=lambda x:x)
    X_train = count_vect.fit_transform(doc[:-1] for doc in train['responsibilities_bigrams'])

    lr = LinearSVC(random_state=RANDOM_STATE)
    lr.fit(X_train, train['class'])

    X_test = count_vect.transform(test['responsibilities_bigrams'])
    pred = lr.predict(X_test)
    kfold_results_acc.append(balanced_accuracy_score(test['class'], pred))
    kfold_results_f1.append(f1_score(test['class'], pred, average='weighted'))
    print('==' * 20)
    print(classification_report(test['class'], pred))
    print('==' * 20)

mdl = f'LinearSVC(random_state={RANDOM_STATE})'
results.loc['balanced_accuracy_score', mdl] = np.mean(kfold_results_acc)
results.loc['f1_score', mdl] = np.mean(kfold_results_f1)

Fold 0:
Train shape: (61297, 3), test shape: (19220, 3)
              precision    recall  f1-score   support

         0.0       0.73      0.69      0.71      2644
         1.0       0.73      0.89      0.80      7523
         2.0       0.84      0.71      0.77      9053

    accuracy                           0.78     19220
   macro avg       0.77      0.76      0.76     19220
weighted avg       0.79      0.78      0.78     19220

Fold 1:
Train shape: (62564, 3), test shape: (17953, 3)
              precision    recall  f1-score   support

         0.0       0.71      0.59      0.65      3135
         1.0       0.81      0.83      0.82      8223
         2.0       0.74      0.77      0.76      6595

    accuracy                           0.77     17953
   macro avg       0.75      0.73      0.74     17953
weighted avg       0.77      0.77      0.77     17953

Fold 2:
Train shape: (62016, 3), test shape: (18501, 3)
              precision    recall  f1-score   support

         0.0   

# LogisticRegression(random_state=RANDOM_STATE, max_iter=500, n_jobs=-1) + TfidfVectorizer

In [11]:
kfold_results_acc = []
kfold_results_f1 = []
for i, (train_index, test_index) in enumerate(kf.split(vacancies)):
    print(f"Fold {i}:")
    
    train = df.loc[vacancies[train_index]]
    test = df.loc[vacancies[test_index]]
    print(f'Train shape: {train.shape}, test shape: {test.shape}')

    vect = TfidfVectorizer(preprocessor=lambda x:x,
                                 tokenizer=lambda x:x)
    X_train = vect.fit_transform(doc[:-1] for doc in train['responsibilities_bigrams'])

    lr = LogisticRegression(random_state=RANDOM_STATE, max_iter=500, n_jobs=-1)
    lr.fit(X_train, train['class'])

    X_test = vect.transform(test['responsibilities_bigrams'])
    pred = lr.predict(X_test)
    kfold_results_acc.append(balanced_accuracy_score(test['class'], pred))
    kfold_results_f1.append(f1_score(test['class'], pred, average='weighted'))
    print('==' * 20)
    print(classification_report(test['class'], pred))
    print('==' * 20)

mdl = f'LogisticRegression(random_state={RANDOM_STATE}, max_iter=500) + TFIDF'
results.loc['balanced_accuracy_score', mdl] = np.mean(kfold_results_acc)
results.loc['f1_score', mdl] = np.mean(kfold_results_f1)

Fold 0:
Train shape: (61297, 3), test shape: (19220, 3)
              precision    recall  f1-score   support

         0.0       0.81      0.65      0.72      2644
         1.0       0.75      0.89      0.82      7523
         2.0       0.84      0.75      0.79      9053

    accuracy                           0.79     19220
   macro avg       0.80      0.77      0.78     19220
weighted avg       0.80      0.79      0.79     19220

Fold 1:
Train shape: (62564, 3), test shape: (17953, 3)
              precision    recall  f1-score   support

         0.0       0.79      0.55      0.65      3135
         1.0       0.82      0.83      0.83      8223
         2.0       0.72      0.81      0.76      6595

    accuracy                           0.77     17953
   macro avg       0.78      0.73      0.75     17953
weighted avg       0.78      0.77      0.77     17953

Fold 2:
Train shape: (62016, 3), test shape: (18501, 3)
              precision    recall  f1-score   support

         0.0   

# LinearSVC(random_state=RANDOM_STATE, C=0.2) + CountVectorizer

In [12]:
kfold_results_acc = []
kfold_results_f1 = []
for i, (train_index, test_index) in enumerate(kf.split(vacancies)):
    print(f"Fold {i}:")
    
    train = df.loc[vacancies[train_index]]
    test = df.loc[vacancies[test_index]]
    print(f'Train shape: {train.shape}, test shape: {test.shape}')

    count_vect = CountVectorizer(preprocessor=lambda x:x,
                                 tokenizer=lambda x:x)
    X_train = count_vect.fit_transform(doc[:-1] for doc in train['responsibilities_bigrams'])

    lr = LinearSVC(random_state=RANDOM_STATE, C=0.2)
    lr.fit(X_train, train['class'])

    X_test = count_vect.transform(test['responsibilities_bigrams'])
    pred = lr.predict(X_test)
    kfold_results_acc.append(balanced_accuracy_score(test['class'], pred))
    kfold_results_f1.append(f1_score(test['class'], pred, average='weighted'))
    print('==' * 20)
    print(classification_report(test['class'], pred))
    print('==' * 20)

mdl = f'LinearSVC(random_state={RANDOM_STATE}, C=0.2)'
results.loc['balanced_accuracy_score', mdl] = np.mean(kfold_results_acc)
results.loc['f1_score', mdl] = np.mean(kfold_results_f1)

Fold 0:
Train shape: (61297, 3), test shape: (19220, 3)
              precision    recall  f1-score   support

         0.0       0.76      0.68      0.72      2644
         1.0       0.74      0.90      0.81      7523
         2.0       0.85      0.73      0.78      9053

    accuracy                           0.79     19220
   macro avg       0.78      0.77      0.77     19220
weighted avg       0.79      0.79      0.78     19220

Fold 1:
Train shape: (62564, 3), test shape: (17953, 3)
              precision    recall  f1-score   support

         0.0       0.75      0.59      0.66      3135
         1.0       0.81      0.83      0.82      8223
         2.0       0.73      0.79      0.76      6595

    accuracy                           0.77     17953
   macro avg       0.77      0.74      0.75     17953
weighted avg       0.77      0.77      0.77     17953

Fold 2:
Train shape: (62016, 3), test shape: (18501, 3)
              precision    recall  f1-score   support

         0.0   

# LogisticRegression(random_state=RANDOM_STATE, max_iter=500, n_jobs=-1, class_weight={0: 1.2, 1: 1.1, 2: 0.7}) + TfidfVectorizer

In [13]:
kfold_results_acc = []
kfold_results_f1 = []
for i, (train_index, test_index) in enumerate(kf.split(vacancies)):
    print(f"Fold {i}:")
    
    train = df.loc[vacancies[train_index]]
    test = df.loc[vacancies[test_index]]
    print(f'Train shape: {train.shape}, test shape: {test.shape}')

    vect = TfidfVectorizer(preprocessor=lambda x:x,
                                 tokenizer=lambda x:x)
    X_train = vect.fit_transform(doc[:-1] for doc in train['responsibilities_bigrams'])

    lr = LogisticRegression(random_state=RANDOM_STATE, max_iter=500, n_jobs=-1, class_weight={0: 1.2, 1: 1.1, 2: 0.7})
    lr.fit(X_train, train['class'])

    X_test = vect.transform(test['responsibilities_bigrams'])
    pred = lr.predict(X_test)
    kfold_results_acc.append(balanced_accuracy_score(test['class'], pred))
    kfold_results_f1.append(f1_score(test['class'], pred, average='weighted'))
    print('==' * 20)
    print(classification_report(test['class'], pred))
    print('==' * 20)

mdl = f'LogisticRegression(random_state={RANDOM_STATE}, max_iter=500) + TFIDF + weights'
results.loc['balanced_accuracy_score', mdl] = np.mean(kfold_results_acc)
results.loc['f1_score', mdl] = np.mean(kfold_results_f1)

Fold 0:
Train shape: (61297, 3), test shape: (19220, 3)
              precision    recall  f1-score   support

         0.0       0.76      0.69      0.72      2644
         1.0       0.72      0.92      0.81      7523
         2.0       0.86      0.69      0.77      9053

    accuracy                           0.78     19220
   macro avg       0.78      0.77      0.77     19220
weighted avg       0.79      0.78      0.78     19220

Fold 1:
Train shape: (62564, 3), test shape: (17953, 3)
              precision    recall  f1-score   support

         0.0       0.73      0.60      0.66      3135
         1.0       0.80      0.86      0.83      8223
         2.0       0.76      0.76      0.76      6595

    accuracy                           0.78     17953
   macro avg       0.76      0.74      0.75     17953
weighted avg       0.77      0.78      0.77     17953

Fold 2:
Train shape: (62016, 3), test shape: (18501, 3)
              precision    recall  f1-score   support

         0.0   

# Сравнение результатов

In [14]:
results

Unnamed: 0,LogisticRegression(random_state=42),"LogisticRegression(random_state=42, max_iter=500)",LinearSVC(random_state=42),"LogisticRegression(random_state=42, max_iter=500) + TFIDF","LinearSVC(random_state=42, C=0.2)","LogisticRegression(random_state=42, max_iter=500) + TFIDF + weights"
balanced_accuracy_score,0.741544,0.742183,0.737799,0.73889,0.742187,0.743609
f1_score,0.768846,0.769099,0.761824,0.771397,0.769039,0.765898


В результате выбрана лучшая модель - LogisticRegression(random_state=RANDOM_STATE, max_iter=500, n_jobs=-1) + TfidfVectorizer