# Pseudolabelling

## Load data

In [1]:
import pandas as pd

In [2]:
# Read the labelled and unlabelled tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Keep every training column whose name begins with 'cat'.
cat_cols = list(filter(lambda column_name: column_name.startswith('cat'), train.columns))
cat_cols

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18']

## Pseudolabelling single loop

In [14]:
from categorical_transform import IntegerCategoricalTransform, CategoricalTransform
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

In [15]:
# CatBoost variant of the model: the 'cat*' columns go through
# IntegerCategoricalTransform first and are also declared to CatBoost as
# categorical features.
cbc = CatBoostClassifier(
    cat_features=cat_cols,
    thread_count=6,
    verbose=False,
)
catboost_pipe = Pipeline(
    steps=[
        ("trans", IntegerCategoricalTransform(cat_cols)),
        ("catboost", cbc),
    ]
)

In [19]:
# LightGBM variant of the model: the 'cat*' columns go through
# CategoricalTransform before reaching the classifier.
lightgbm_pipe = Pipeline(
    steps=[
        ("trans", CategoricalTransform(cat_cols)),
        ("lgbm", LGBMClassifier(n_jobs = -2)),
    ]
)
# `pipe` is the pipeline the experiment cells below actually fit.
pipe = lightgbm_pipe

In [26]:
import numpy as np

# Seed for the fold shuffle so that the split, and hence every score computed
# below, is the same on each Restart & Run All.
SEED = 42
# A prediction becomes a pseudo-label only when the second predict_proba
# column (presumably the positive class -- confirm against the target
# encoding) is above PSEUDOLABEL_HIGH or below PSEUDOLABEL_LOW.
PSEUDOLABEL_HIGH = 0.75
PSEUDOLABEL_LOW = 0.25

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
roc_test_augmentation = []
roc_test_original = []
for train_index, test_index in kf.split(train):
    # KFold.split yields positional row numbers, so select rows by position
    # (.iloc); .loc would interpret them as index labels instead.
    train_folds = train.iloc[train_index]
    test_fold = train.iloc[test_index]
    x_train = train_folds.drop(columns=['id','target'])
    y_train = train_folds['target']
    x_test = test_fold.drop(columns=['id','target'])
    y_test = test_fold['target']

    # Baseline: fit on the labelled training folds only, score the held-out fold.
    pipe.fit(x_train, y_train)
    proba_orig = pipe.predict_proba(x_test)[:,1]
    roc_test_original.append(roc_auc_score(y_test, proba_orig))

    # Unlabelled pool = held-out fold features (true labels withheld) + test set.
    # NOTE(review): the held-out rows used for scoring are therefore also
    # candidates for pseudo-labelling -- confirm this is the intended setup.
    unlabelled_train = pd.concat([x_test, test.drop(columns=['id'])], ignore_index=True)
    y_augmented = pipe.predict(unlabelled_train)
    y_augmented_proba = pipe.predict_proba(unlabelled_train)[:,1]
    idx = (y_augmented_proba > PSEUDOLABEL_HIGH) | (y_augmented_proba < PSEUDOLABEL_LOW)
    print(f"augment size: {np.sum(idx)}, {100*np.sum(idx)/len(idx)}%")

    # Retrain the same pipeline on the confident pseudo-labelled rows stacked
    # on top of the genuinely labelled training rows.
    y = pd.concat([pd.Series(y_augmented[idx]), y_train], ignore_index=True)
    x = pd.concat([unlabelled_train.loc[idx], x_train], ignore_index=True)
    pipe.fit(x, y)

    # Score the retrained model on the same held-out fold for a paired comparison.
    proba = pipe.predict_proba(x_test)[:,1]
    roc_test_augmentation.append(roc_auc_score(y_test, proba))

augment size: 200879, 77.26115384615385%
augment size: 201520, 77.50769230769231%
augment size: 201052, 77.3276923076923%
augment size: 201251, 77.40423076923076%
augment size: 201319, 77.43038461538461%


In [27]:
# Mean ROC AUC over the folds for the model retrained with pseudo-labels.
np.asarray(roc_test_augmentation).mean()

0.8907374152993913

In [28]:
# Mean ROC AUC over the folds for the baseline model (no pseudo-labels).
np.asarray(roc_test_original).mean()

0.8914026098742303

In [30]:
# Per-fold comparison: one row per fold, baseline score next to the
# pseudo-label score, plus a flag that is True when pseudo-labelling scored
# strictly higher on that fold.
results = pd.DataFrame(
    {
        'orig': roc_test_original,
        'augmented': roc_test_augmentation,
    }
)
results['improvement'] = results['orig'] < results['augmented']
results

Unnamed: 0,orig,augmented,improvement
0,0.890264,0.890025,False
1,0.892024,0.890966,False
2,0.891674,0.891054,False
3,0.892453,0.891804,False
4,0.890598,0.889838,False
