In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import roc_auc_score
from sklearn.covariance import GraphicalLasso
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC

In [2]:
# Directory that holds the input CSV files. Change this single constant to
# point the notebook at another location instead of editing every path.
DATA_DIR = "G:/DS/instant-gratification"

train = pd.read_csv(DATA_DIR + "/train.csv")
test = pd.read_csv(DATA_DIR + "/test.csv")

In [3]:
def get_mean_cov(x,y):
    """Fit GraphicalLasso separately on the rows of each class.

    The rows of ``x`` are split with the label array ``y``: one fit uses the
    rows where ``y == 1`` and a second fit uses the rows where ``y == 0``.

    Returns
    -------
    ms : the two fitted ``location_`` vectors stacked along a new first
        axis, class 1 first and class 0 second.
    ps : the two fitted ``precision_`` matrices stacked in the same order.
    """
    model = GraphicalLasso()
    means, precisions = [], []
    # The order is part of the contract: label 1 is fitted and stacked first.
    for label in (1, 0):
        model.fit(x[y == label])
        means.append(model.location_)
        precisions.append(model.precision_)
    return np.stack(means), np.stack(precisions)

In [4]:
# Stage 1: fit a separate QDA model for each value 0..511 of the
# "wheezy-copper-turtle-magic" column and collect out-of-fold / test scores.
excluded = {"id", "target", "wheezy-copper-turtle-magic"}
cols = [name for name in train.columns if name not in excluded]

# Out-of-fold scores for the training rows and fold-averaged test scores.
trains, preds = np.zeros(len(train)), np.zeros(len(test))
# Extra buffers; nothing in this cell writes to them.
trains1, preds1 = np.zeros(len(train)), np.zeros(len(test))
trains2, preds2 = np.zeros(len(train)), np.zeros(len(test))

for i in range(512):
    magic_train = train[train["wheezy-copper-turtle-magic"] == i]
    magic_test = test[test["wheezy-copper-turtle-magic"] == i]
    # Keep the original row labels so scores can be written back to the
    # full-length arrays after the subset is re-indexed from zero.
    train1_index, test1_index = magic_train.index, magic_test.index
    magic_train = magic_train.reset_index(drop=True)
    labels = magic_train["target"]

    # Keep only the columns whose variance inside this subset exceeds 2.5.
    features = VarianceThreshold(threshold=2.5)
    x_train = features.fit_transform(magic_train[cols])
    x_test = features.transform(magic_test[cols])

    # 15-fold stratified CV on this subset.
    val = StratifiedKFold(n_splits=15, random_state=42, shuffle=True)
    for fit_rows, held_rows in val.split(x_train, labels):
        qda = QuadraticDiscriminantAnalysis(reg_param=0.45)
        qda.fit(x_train[fit_rows, :], labels.loc[fit_rows])
        # Column 1 of predict_proba is the score for the second class.
        trains[train1_index[held_rows]] = qda.predict_proba(x_train[held_rows, :])[:, 1]
        # Every fold model scores the whole test subset; average over folds.
        preds[test1_index] += qda.predict_proba(x_test)[:, 1] / val.n_splits

auc = roc_auc_score(train["target"], trains)
print('QDA scores CV =',round(auc,5))

QDA scores CV = 0.96569


In [5]:
# Stage 2: pseudo-labelling. The stage-1 test scores are stored on `test`,
# the most confident test rows are given hard 0/1 labels and appended to the
# training rows, and QDA is refitted per "wheezy-copper-turtle-magic" value.
test["target"] = preds
trains = np.zeros(len(train))
preds = np.zeros(len(test))
for i in range(512):
    train1 = train[train["wheezy-copper-turtle-magic"] == i]
    train1_index = train1.index
    test1 = test[test["wheezy-copper-turtle-magic"] == i]

    # Test rows scored <= 0.01 or >= 0.99 are treated as confidently labelled.
    confident = (test1["target"] <= 0.01) | (test1["target"] >= 0.99)
    test1_1 = test1[confident].copy()
    # Round the kept scores to hard labels (1.0 when >= 0.5, else 0.0).
    test1_1["target"] = (test1_1["target"] >= 0.5).astype(float)
    # Real training rows come first, pseudo-labelled test rows after them.
    train2 = pd.concat([train1, test1_1]).reset_index(drop=True)

    # The variance filter is fitted on the augmented frame.
    features = VarianceThreshold(threshold=2.5)
    train3 = features.fit_transform(train2[cols])
    train3_1 = features.transform(train1[cols])
    test3 = features.transform(test1[cols])

    n_real_rows = len(train3_1)
    val = StratifiedKFold(n_splits=14, random_state=41, shuffle=True)
    for train_index, test_index in val.split(train3, train2["target"]):
        # QDA
        # Held-out positions >= n_real_rows are pseudo-labelled test rows;
        # only the real training rows receive out-of-fold scores.
        test_index1 = test_index[test_index < n_real_rows]
        qda = QuadraticDiscriminantAnalysis(reg_param=0.1)
        qda.fit(train3[train_index, :], train2.loc[train_index]["target"])
        trains[train1_index[test_index1]] = qda.predict_proba(train3_1[test_index1, :])[:, 1]
        preds[test1.index] += qda.predict_proba(test3)[:, 1] / val.n_splits

auc = roc_auc_score(train["target"], trains)
print('QDA scores CV =',round(auc,5))


QDA scores CV = 0.9704


In [6]:
# GMM stage: for every "wheezy-copper-turtle-magic" value, fit a
# two-component Gaussian mixture on the training fold plus the test rows of
# that subset, initialised from per-class GraphicalLasso estimates.
oof = np.zeros(len(train))
preds2 = np.zeros(len(test))
for i in range(512):
    train2 = train[train['wheezy-copper-turtle-magic']==i]
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    # Original row labels, used to write scores back to the full-length arrays.
    idx1 = train2.index; idx2 = test2.index
    train2 = train2.reset_index(drop=True)

    # Keep only the columns whose variance inside this subset exceeds 1.5.
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])

    skf = StratifiedKFold(n_splits= 14, random_state=41, shuffle=True)
    # Iterate the splitter built in this cell. The previous code called
    # `val.split`, which only worked when `val` was left over from another cell.
    for train_index, test_index in skf.split(train3,train2["target"]):
        # GMM
        # Per-class means/precisions from the labelled fold; class 1 is
        # stacked first, so mixture component 0 starts at the class-1 estimate.
        ms, ps = get_mean_cov(train3[train_index,:],train2.loc[train_index]['target'].values)

        # random_state pins the random part of the initialisation so that a
        # re-run reproduces the same scores.
        gm = GaussianMixture(n_components=2, init_params='random', covariance_type='full', tol=0.001,reg_covar=0.001, max_iter=100, n_init=1,means_init=ms, precisions_init=ps, random_state=41)
        # The mixture is fitted without labels on the training fold and the test rows together.
        gm.fit(np.concatenate([train3[train_index,:],test3],axis = 0))
        # Column 0 is the component initialised from class 1, so it is used as the class-1 score.
        oof[idx1[test_index]] = gm.predict_proba(train3[test_index,:])[:,0]
        preds2[idx2] += gm.predict_proba(test3)[:,0] / skf.n_splits

auc2 = roc_auc_score(train['target'],oof)
print('GMM scores CV =',round(auc2,5))



GMM scores CV = 0.96891


In [10]:
# Equal-weight blend of two out-of-fold score vectors, scored against the labels.
# NOTE(review): `oof1` is not assigned in any cell visible in this notebook
# (only `oof` is) — confirm where it comes from before a Restart & Run All.
final = 0.5 * trains + 0.5 * oof1
auc4 = roc_auc_score(train['target'], final)
print('Ensenmble scores CV =',round(auc4,5))

Ensenmble scores CV = 0.96862
