# Pseudolabelling

## Load data

In [1]:
import pandas as pd

In [2]:
# Read the train and test CSV files from the current working directory
# (train.csv first, then test.csv).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Names of the columns of `train` whose name begins with 'cat', in column order.
cat_cols = list(filter(lambda column_name: column_name.startswith('cat'), train.columns))
cat_cols

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18']

## Pseudolabelling single loop

In [4]:
from categorical_transform import IntegerCategoricalTransform
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

In [5]:
# CatBoost classifier: the columns listed in `cat_cols` are declared as
# categorical features, training uses 6 threads and logging is silenced.
cbc = CatBoostClassifier(
    cat_features=cat_cols,
    thread_count=6,
    verbose=False,
)

# Two-step pipeline: the categorical transform runs first, then the classifier.
# NOTE(review): IntegerCategoricalTransform is a project-local class; presumably
# it integer-encodes the `cat_cols` columns -- confirm against its module.
pipeline_steps = [
    ("trans", IntegerCategoricalTransform(cat_cols)),
    ("catboost", cbc),
]
catboost_pipe = Pipeline(pipeline_steps)

In [None]:
import numpy as np

# Cross-validation / pseudo-labelling settings, named so they can be tuned in one place.
N_SPLITS = 5
SEED = 42                 # fixes the fold assignment so the comparison can be re-run
UPPER_CONFIDENCE = 0.95   # predict_proba column 1 above this -> row kept as pseudo-label
LOWER_CONFIDENCE = 0.05   # predict_proba column 1 below this -> row kept as pseudo-label

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Per-fold ROC AUC on the held-out fold: after the pseudo-label refit / before it.
roc_test_augmentation = []
roc_test_original = []

for train_index, test_index in kf.split(train):
    # KFold.split yields *positional* indices, so rows are selected with .iloc;
    # .loc would only be equivalent while `train` keeps its default RangeIndex.
    train_folds = train.iloc[train_index]
    test_fold = train.iloc[test_index]

    # --- Baseline: fit on the labelled training folds only ---------------------
    x_train = train_folds.drop(columns=['id', 'target'])
    y_train = train_folds['target']
    catboost_pipe.fit(x_train, y_train)

    x_test = test_fold.drop(columns=['id', 'target'])
    y_test = test_fold['target']
    proba_orig = catboost_pipe.predict_proba(x_test)[:, 1]
    roc_test_original.append(roc_auc_score(y_test, proba_orig))

    # --- Pseudo-labelling: label the unlabelled pool with the baseline model ---
    # The pool is the held-out fold's features (true labels withheld) plus the
    # real test set.
    # NOTE(review): the held-out fold is therefore both pseudo-labelled and later
    # scored on; this looks intended to mimic training on the rows one must
    # predict, but confirm that is the desired evaluation protocol.
    unlabelled_train = pd.concat([x_test, test.drop(columns=['id'])], ignore_index=True)
    y_augmented = catboost_pipe.predict(unlabelled_train)
    y_augmented_proba = catboost_pipe.predict_proba(unlabelled_train)[:, 1]

    # Keep only the rows the baseline model is confident about (either class).
    idx = (y_augmented_proba > UPPER_CONFIDENCE) | (y_augmented_proba < LOWER_CONFIDENCE)
    print(f"augment size: {np.sum(idx)}, {100*np.sum(idx)/len(idx)}%")

    # Confident pseudo-labelled rows first, then the genuinely labelled rows;
    # `x` and `y` are built in the same order so they stay row-aligned.
    y = pd.concat([pd.Series(y_augmented[idx]), y_train], ignore_index=True)
    x = pd.concat([unlabelled_train.loc[idx], x_train], ignore_index=True)
    catboost_pipe.fit(x, y)

    # --- Score the refitted model on the same held-out fold --------------------
    proba = catboost_pipe.predict_proba(x_test)[:, 1]
    roc_test_augmentation.append(roc_auc_score(y_test, proba))

In [None]:
# Average held-out ROC AUC across folds for the model refitted with pseudo-labels.
np.asarray(roc_test_augmentation).mean()

In [None]:
# Average held-out ROC AUC across folds for the baseline model (no pseudo-labels).
np.asarray(roc_test_original).mean()

In [None]:
# One row per fold: baseline AUC, pseudo-label AUC, and whether the
# pseudo-label refit scored strictly higher than the baseline on that fold.
results = pd.DataFrame(
    {'orig': roc_test_original, 'augmented': roc_test_augmentation}
).assign(improvement=lambda fold_scores: fold_scores['augmented'] > fold_scores['orig'])
results