In [20]:
import numpy as np
import pandas as pd
import sklearn
import xgboost



In [21]:
# Report the installed version of each library the notebook relies on, one per line
print(*(lib.__version__ for lib in (np, pd, sklearn, xgboost)), sep="\n")

1.12.1
0.20.1
0.18.1
0.6


## Data preprocessing

In [3]:
# Load the train and test sets straight from the zipped CSV files under dataset/
train = pd.read_csv('dataset/train.csv.zip')
test = pd.read_csv('dataset/test.csv.zip')

In [4]:
# Show (rows, columns) for both frames
for label, frame in (("train data:", train), ("test data:", test)):
    print(label, frame.shape)

train data: (4209, 378)
test data: (4209, 377)


In [5]:
# Preview the first rows of the training data as loaded
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Preview the first rows of the test data as loaded
test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Everything after the first two columns of `train` is treated as a feature
features = train.columns[2:]

for column_name in features:
    # Pull the raw values of this column out of both frames
    train_values = list(train[column_name].values)
    test_values = list(test[column_name].values)

    # Fit on train + test together so the encoder knows every value that
    # appears in either frame
    encoder = LabelEncoder().fit(train_values + test_values)

    # Overwrite the column in both frames with its integer codes
    train[column_name] = encoder.transform(train_values)
    test[column_name] = encoder.transform(test_values)

In [12]:
# Preview the training data again, now that the feature columns are label-encoded
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,28,23,20,0,3,27,9,14,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,28,21,22,4,3,31,11,14,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,15,24,38,2,3,30,9,23,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,15,21,38,5,3,30,11,4,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,15,23,38,5,3,14,3,13,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Preview the test data again, now that the feature columns are label-encoded
test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,68,23,38,5,3,26,0,22,0,...,0,0,0,1,0,0,0,0,0,0
1,2,89,3,9,0,3,9,6,24,0,...,0,0,1,0,0,0,0,0,0,0
2,3,68,23,19,5,3,0,9,9,0,...,0,0,0,1,0,0,0,0,0,0
3,4,68,13,38,5,3,32,11,13,0,...,0,0,0,1,0,0,0,0,0,0
4,5,92,20,19,2,3,31,8,12,0,...,1,0,0,0,0,0,0,0,0,0


## Pseudo-labeling with scikit-learn

In [16]:
def create_augmented_train(X, y, model, test, features, target, sample_rate,
                           random_state=None):
    """Build a training set augmented with pseudo-labelled test rows.

    ``model`` is fitted on ``(X, y)`` and used to predict ``target`` for the
    rows of ``test``. A random subset of those pseudo-labelled rows is then
    stacked on top of the labelled data and the result is shuffled.

    Parameters
    ----------
    X : pandas.DataFrame
        Labelled feature rows.
    y : pandas.Series
        Labels for ``X``. It is concatenated column-wise with ``X``, so it
        should share ``X``'s index and be named ``target``.
    model : object
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place as a side effect.
    test : pandas.DataFrame
        Unlabelled rows; must contain the ``features`` columns. Not modified.
    features : list-like
        Column labels fed to ``model.predict``.
    target : str
        Name of the column that receives the pseudo-labels.
    sample_rate : float
        Fraction of ``len(test)`` to add as pseudo-labelled rows.
    random_state : int or numpy.random.RandomState, optional
        Seed for both the row sampling and the final shuffle. ``None`` (the
        default) keeps the non-deterministic behaviour.

    Returns
    -------
    pandas.DataFrame
        Shuffled concatenation of the sampled pseudo-labelled rows and the
        labelled rows. Columns present only in ``test`` are NaN for the
        labelled rows.
    """
    num_of_samples = int(len(test) * sample_rate)

    # Train the model and create the pseudo-labels
    model.fit(X, y)
    pseudo_labels = model.predict(test[features])

    # Add the pseudo-labels to a copy of the test set (the caller's frame is untouched)
    augmented_test = test.copy(deep=True)
    augmented_test[target] = pseudo_labels

    # Take a subset of the test set with pseudo-labels and append it to the training set
    sampled_test = augmented_test.sample(n=num_of_samples, random_state=random_state)
    temp_train = pd.concat([X, y], axis=1)
    augmented_train = pd.concat([sampled_test, temp_train])

    # Shuffle the augmented dataset. The original returned the misspelled,
    # undefined name `augemented_train`, which raised NameError on every call.
    # Shuffling with pandas also avoids depending on an import from a later cell.
    return augmented_train.sample(frac=1, random_state=random_state)

In [23]:
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, RegressorMixin
from xgboost import XGBRegressor

class PseudoLabel(BaseEstimator, RegressorMixin):
    """Scikit-learn style regressor that trains a base model with pseudo-labels.

    On ``fit`` the wrapped model is first trained on the labelled data and
    used to predict the target for the unlabelled ``test`` frame. A random
    fraction of those pseudo-labelled rows is added to the labelled data and
    the wrapped model is re-trained on the combined set.

    Parameters
    ----------
    model : estimator
        Base regressor exposing ``fit`` and ``predict``.
    test : pandas.DataFrame
        Unlabelled rows; must contain the ``features`` columns.
    features : list-like
        Column labels used as model inputs.
    target : str
        Name of the label column.
    sample_rate : float, default 0.2
        Fraction of ``len(test)`` added as pseudo-labelled rows. A value of
        ``0.0`` or less disables pseudo-labelling.
    seed : int, default 42
        Set on the wrapped model as ``model.seed`` and used as the random
        state for sampling and shuffling the pseudo-labelled rows.
    """

    def __init__(self, model, test, features, target, sample_rate=0.2, seed=42):
        self.sample_rate = sample_rate
        self.seed = seed
        self.model = model
        # Propagate the seed to the wrapped model as a plain attribute
        self.model.seed = seed

        self.test = test
        self.features = features
        self.target = target

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is ignored)."""
        return {
            "sample_rate": self.sample_rate,
            "seed": self.seed,
            "model": self.model,
            "test": self.test,
            "features": self.features,
            "target": self.target
        }

    def set_params(self, **parameters):
        """Set each given parameter as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        """Fit the wrapped model, with pseudo-labelled rows if ``sample_rate > 0``.

        Returns ``self``.
        """
        if self.sample_rate > 0.0:
            augmented_train = self.__create_augmented_train(X, y)
            self.model.fit(
                augmented_train[self.features],
                augmented_train[self.target]
            )
        else:
            self.model.fit(X, y)

        return self

    def __create_augmented_train(self, X, y):
        """Return the labelled data plus a sample of pseudo-labelled test rows, shuffled."""
        # Use the frame given to the constructor, not whatever global `test`
        # happens to exist in the notebook kernel.
        num_of_samples = int(len(self.test) * self.sample_rate)

        # Train the model and create the pseudo-labels
        self.model.fit(X, y)
        pseudo_labels = self.model.predict(self.test[self.features])

        # Add the pseudo-labels to a copy of the test set
        augmented_test = self.test.copy(deep=True)
        augmented_test[self.target] = pseudo_labels

        # Take a subset of the test set with pseudo-labels and append it onto
        # the training set; seeded so repeated fits pick the same rows
        sampled_test = augmented_test.sample(n=num_of_samples, random_state=self.seed)
        temp_train = pd.concat([X, y], axis=1)
        augmented_train = pd.concat([sampled_test, temp_train])

        return shuffle(augmented_train, random_state=self.seed)

    def predict(self, X):
        """Predict with the wrapped model."""
        return self.model.predict(X)

    def get_model_name(self):
        """Return the class name of the wrapped model."""
        return self.model.__class__.__name__
    

In [24]:
target = 'y'

# Split the encoded frames into model inputs and the training labels
X_train = train[features]
X_test = test[features]
y_train = train[target]

# Wrap an XGBRegressor in the pseudo-labelling estimator
model = PseudoLabel(
    model=XGBRegressor(nthread=1),
    test=test,
    features=features,
    target=target,
)

# fit() returns the estimator itself, so the predictions can be chained off it
model.fit(X_train, y_train).predict(X_train)

array([ 117.26475525,   91.7115097 ,   76.58120728, ...,  110.61658478,
         91.74913025,   94.02607727], dtype=float32)

## Conclusion

Pseudo-labeling allows us to utilize unlabeled data while training machine learning models. This sounds like a powerful technique, and yes, more often than not it improves the performance of our models. However, it can be difficult to tune and to get working properly, and even when it works, it gives only a slight performance boost. In competitions such as Kaggle, I believe this technique can be useful because even a slight increase in score can usually move you up the leaderboard. Still, I would think twice before using it in a production environment, as it seems to introduce additional complexity without a big increase in performance, and that might not be a trade-off you want.