In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

In [2]:
# Name of the column that later cells use to subset train and test.
category = "wheezy-copper-turtle-magic"

# Read the three input CSVs; the first column of each file becomes the index.
train, test, submission = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/train.csv",
        "../input/test.csv",
        "../input/sample_submission.csv",
    )
)


In [10]:
################
# QDA
################
# Fits one QuadraticDiscriminantAnalysis model per (category value, CV fold).
#
# Results left in the namespace:
#   probabilities    - out-of-fold P(target == 1) for every row of `train`
#   test_predictions - P(target == 1) for every row of `test`, averaged over
#                      the fold models fitted on the matching category subset

# Config
N_CATEGORY_VALUES = 512   # the loop visits category values 0 .. N_CATEGORY_VALUES - 1
VARIANCE_THRESHOLD = 1.5  # features with lower variance inside a subset are dropped
N_SPLITS = 10             # folds of the stratified cross-validation
SEED = 26                 # shuffle seed for the fold assignment

features = [column for column in train.columns if column not in ["id", "target", category]]
probabilities = pd.Series(np.zeros(len(train)), index=train.index)
# Must be indexed by the test rows: pairing len(test) values with train.index
# raises ValueError as soon as the two frames differ in length.
test_predictions = pd.Series(np.zeros(len(test)), index=test.index)

# Stratified k-fold. With an integer random_state every split() call reseeds,
# so one shared splitter yields the same folds as building a new one per subset.
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)

# Loop through wheezy-copper-turtle-magic
for i in tqdm(range(N_CATEGORY_VALUES)):
    # Subset train and test
    # where wheezy == i (all columns are kept at this point)
    train_ = train.loc[train[category] == i, :]
    test_ = test.loc[test[category] == i, :]

    # Labels as a plain array so that the fold indices, which are positions,
    # are never interpreted as index labels of the Series.
    target_ = train_["target"].to_numpy()

    # VarianceThreshold, fitted on the training rows of this subset only
    feature_selector = VarianceThreshold(threshold=VARIANCE_THRESHOLD).fit(train_.loc[:, features])
    train_2 = feature_selector.transform(train_.loc[:, features])
    test_2 = feature_selector.transform(test_.loc[:, features])

    # At this moment train_ and test_ contain all columns and only samples from wheezy == i
    # and train_2 and test_2 contain only selected columns and samples from wheezy == i

    for split_train_index, split_test_index in skf.split(train_2, target_):
        # QDA
        qda = QuadraticDiscriminantAnalysis()
        qda.fit(train_2[split_train_index, :], target_[split_train_index])

        # Getting probabilities of the test part of the split
        split_probabilities = qda.predict_proba(train_2[split_test_index, :])[:, 1]
        # Saving predictions (each train row is held out in exactly one fold)
        probabilities.loc[train_.index[split_test_index]] += split_probabilities

        # Every fold model also scores the test rows of this subset;
        # dividing by the number of folds turns the running sum into the mean.
        test_predictions.loc[test_.index] += qda.predict_proba(test_2)[:, 1] / N_SPLITS

print(roc_auc_score(train["target"], probabilities))