In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

In [2]:
# Read the three competition CSVs in one pass; the first column of each file
# becomes the row index of the resulting frame.
train, test, submission = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/train.csv",
        "../input/test.csv",
        "../input/sample_submission.csv",
    )
)

# Column whose values the modelling cell uses to split rows into per-value subsets.
category = "wheezy-copper-turtle-magic"


In [12]:
################
# QDA
################
# For every value of `category` present in train, fit Quadratic Discriminant
# Analysis models on that subset only, using stratified k-fold cross-validation.
# Produces:
#   probabilities    - out-of-fold P(target == 1) for every train row
#   test_predictions - P(target == 1) for every test row, averaged over the folds

# Config
features = [column for column in train.columns if column not in ["id", "target", category]]
VARIANCE_THRESHOLD = 1.5  # per-subset features whose variance is not above this are dropped
N_SPLITS = 10
CV_SEED = 26

# Re-created on every run of this cell so that the `+=` accumulation below
# always starts from zero.
probabilities = pd.Series(np.zeros(len(train)), index=train.index)
test_predictions = pd.Series(np.zeros(len(test)), index=test.index)

# With an integer random_state every split() call is seeded identically, so a
# single splitter can be shared by all subsets.
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=CV_SEED, shuffle=True)

# Loop through the values of wheezy-copper-turtle-magic that occur in train
for i in tqdm(np.sort(train[category].unique())):
    # Subset train and test where wheezy == i (all columns kept for now)
    train_ = train.loc[train[category] == i, :]
    test_ = test.loc[test[category] == i, :]

    # The fold indices returned by split() are positions, not index labels, so
    # keep the target as a plain array that is indexed positionally.
    target_ = train_["target"].to_numpy()

    # VarianceThreshold, fitted on the train subset only
    feature_selector = VarianceThreshold(threshold=VARIANCE_THRESHOLD).fit(train_.loc[:, features])
    train_2 = feature_selector.transform(train_.loc[:, features])
    # transform() rejects empty input, so skip test scoring when this subset
    # has no test rows.
    test_2 = feature_selector.transform(test_.loc[:, features]) if len(test_) else None

    # At this moment train_ and test_ contain all columns and only samples from wheezy == i
    # and train_2 and test_2 contain only selected columns and samples from wheezy == i

    for split_train_index, split_test_index in skf.split(train_2, target_):
        # QDA
        qda = QuadraticDiscriminantAnalysis()
        qda.fit(train_2[split_train_index, :], target_[split_train_index])

        # Probabilities for the held-out part of the split; each train row is
        # held out exactly once, so `+=` on the zero-initialised Series stores it.
        split_probabilities = qda.predict_proba(train_2[split_test_index, :])[:, 1]
        probabilities.loc[train_.index[split_test_index]] += split_probabilities

        # Every fold model scores the whole test subset; dividing by the number
        # of folds turns the running sum into a mean.
        if test_2 is not None:
            test_predictions.loc[test_.index] += qda.predict_proba(test_2)[:, 1] / skf.n_splits

print(roc_auc_score(train["target"], probabilities))

100%|████████████████████████████████████████████████████████████████████████████████| 512/512 [01:20<00:00,  6.16it/s]


0.9647742563613684


In [77]:
###################
# Data augmentation
###################
# Pseudo-labeling: keep the test rows whose predicted probability falls in the
# bottom or top quarter of all test predictions and label them by thresholding
# that probability at 0.5.
lower_cut = test_predictions.quantile(0.25)
upper_cut = test_predictions.quantile(0.75)
is_selected = (test_predictions <= lower_cut) | (test_predictions >= upper_cut)
mask = test_predictions[is_selected].index  # index labels of the selected rows, not a boolean mask

pseudo_targets = [int(probability > 0.5) for probability in test_predictions[mask]]
train_only_psuedo = test.loc[mask, :].assign(target=pseudo_targets)

# Check that the pseudo-labeled rows line up with the predictions they came from
# and that the frame has the same columns, in the same order, as train
assert (test_predictions[mask].index == train_only_psuedo.index).all()
assert (train.columns == train_only_psuedo.columns).all()

# Append the pseudo-labeled test rows to the real training data
train_with_psuedo = pd.concat([train, train_only_psuedo])

In [None]:
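# The idea here is borrowed from other competitors and is motivated by how the
# dataset appears to have been generated: sklearn's make_classification has a
# `flip_y` parameter that randomly reassigns the class of a fraction of samples.
# Let's try to reverse those flips.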

In [76]:
# Sanity check: show the column labels of the combined (train + pseudo-labeled) frame.
combined_columns = train_with_psuedo.columns
print(combined_columns)

Index(['muggy-smalt-axolotl-pembus', 'dorky-peach-sheepdog-ordinal',
       'slimy-seashell-cassowary-goose',
       'snazzy-harlequin-chicken-distraction', 'frumpy-smalt-mau-ordinal',
       'stealthy-beige-pinscher-golden', 'chummy-cream-tarantula-entropy',
       'hazy-emerald-cuttlefish-unsorted', 'nerdy-indigo-wolfhound-sorted',
       'leaky-amaranth-lizard-sorted',
       ...
       'wheezy-myrtle-mandrill-entropy', 'wiggy-lilac-lemming-sorted',
       'gloppy-cerise-snail-contributor', 'woozy-silver-havanese-gaussian',
       'jumpy-thistle-discus-sorted', 'muggy-turquoise-donkey-important',
       'blurry-buff-hyena-entropy', 'bluesy-chocolate-kudu-fepid',
       'gamy-white-monster-expert', 'target'],
      dtype='object', length=257)
