### Part 1

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

data = pd.read_csv('investigation_train_large_checked.csv', header=0 )

target = data['checked']
features = data.drop( columns=[ 'Ja', 'Nee', 'checked' ])

print( target.value_counts(True) )
print( target.shape )

checked
False    0.849969
True     0.150031
Name: proportion, dtype: float64
(130000,)


In [2]:
def get_problematic_columns( data ):
    psychological_features = []
    medical_features = [ 'belemmering_hist_verslavingsproblematiek' ]
    racial_features = ['ontheffing_reden_hist_sociale_gronden']
    subjective_features = [ 'competentie_ethisch_en_integer_handelen', 'competentie_gedrevenheid_en_ambitie_tonen', 'competentie_met_druk_en_tegenslag_omgaan', 'competentie_omgaan_met_verandering_en_aanpassen',
                            'persoonlijke_eigenschappen_uitstroom_verw_vlgs_km', 'persoonlijke_eigenschappen_uitstroom_verw_vlgs_klant', 'afspraak_aantal_woorden', 'afspraak_laatstejaar_aantal_woorden',
                            'competentie_other', 'competentie_overtuigen_en_be√Ønvloeden'
                          ]
    age_features = ['persoon_leeftijd_bij_onderzoek']
    gender_features = ['persoon_geslacht_vrouw']
    relationship_features = []
    irrelevant_features = [ 'persoonlijke_eigenschappen_hobbies_sport' ]
    
    for col in data.columns:
        if 'relatie' in col:
            relationship_features.append( col )
        elif 'persoonlijke' in col:
            if '_nl_' in col or 'taal' in col:
                racial_features.append(col)
            elif '_opm' in col:
                subjective_features.append(col)
        elif 'adres_recenst' in col or 'sociaal' in col or 'taal' in col:
            racial_features.append(col)
        elif 'medische' in col or 'lichamelijke' in col:
            medical_features.append(col)
        elif 'psychische' in col:
            psychological_features.append(col)

    return {
            'psychological': psychological_features,
            'medical': medical_features,
            'racial': racial_features,
            'subjective': subjective_features,
            'gender': gender_features,
            'relationship': relationship_features,
            'age': age_features,
            'irrelevant': irrelevant_features
           }

### Part 2

In [3]:
def group_subset( data, column_set ):
    pca = PCA( n_components=1 )
    subset = pca.fit_transform( data[column_set] )
    return subset

def n_wise_partition( feature, n_partitions=2, thresholds=None ):
    feature = feature.copy()
    partitions = []
    if thresholds is None:
        mn = feature.min()
        mx = feature.max()
        step = (mx-mn)/n_partitions
        thresholds = [ i for i in np.arange( mn, mx, step ) ]
        thresholds = thresholds[1:]
    else:
        assert n_partitions == len(thresholds)+1

    for i in range(n_partitions-1):
        partitions.append( feature <= thresholds[i] )
    partitions.append( feature > thresholds[-1] )

    return partitions

def shuffle_columns( data, column_set ):
    data = data.copy()
    shuffled = data[column_set].sample(frac=1).reset_index(drop=True)
    data[column_set] = shuffled
    return data

In [4]:
problem_cols = get_problematic_columns( features )
partition_sizes = {
    'psychological': 2, # well, unwell
    'medical': 2, # well, unwell
    'racial': 4, # Germanic language native, Romance native, PIE native, Non-PIE native
    'subjective': 3, # Low, Mid, High opinion
    'gender': 2, # Male, Female
    'relationship': 3, # Small average, large social circle/family
    'age': 3, # Young Adult, Adult, Senior
    'irrelevant': 2 # Only for sports hobbyists, yes/no.
}

n_problem_features = 0
partitions = {}
for problem_type in problem_cols:
    grouped_subset = group_subset( features, problem_cols[problem_type] )
    group_partitions = n_wise_partition( grouped_subset, partition_sizes[problem_type] )    
    partitions[problem_type] = group_partitions
    n_problem_features += len( problem_cols[problem_type] )


### Part 3

In [5]:
import torch
import torch.nn as nn
from collections import OrderedDict
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
class Model:
    def __init__( self, arch, n_in, n_out, loss ):
        self.classifier = arch
        self.loss = loss
        self.optimizer = torch.optim.Adam( arch.parameters(), lr=1e-3 )

    def forward( self, X ):
        return self.classifier( X )

    def backward( self, y_pred ):
        loss = self.loss( y_pred, y )
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
    def train( self, X, y, epochs=1000 ):
        X_adv = X[adversarial_cols]
        for _ in range(epochs):
            y_pred = self.forward( X )
            self.backward( y_pred )
        
        return y_pred


class AdversarialModel:
    def __init__( self, clf, adv, loss, optimizer, l=0.3, good=True ):
        self.classifier = clf
        self.adversary = adv
        self.loss = loss
        self.optimizer = torch.optim.Adam( adv.parameters(), lr=1e-3 )
        self.l = l
        self.good = good
        
    def train( self, X, X_prob, y, epochs=1000 ):
        for _ in range(epochs):
            y_pred = self.classifier.forward( X )
            X_prob_pred = self.adversary.forward( y_pred )

            if self.good:
                loss = self.classifier.loss( y_pred, y ) - self.l * self.adversary.loss( X_prob_pred, X_prob )
            else:
                loss = self.classifier.loss( y_pred, y ) + self.l * self.adversary.loss( X_prob_pred, X_prob )

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            

In [7]:
n_samples, n_features = features.shape

cross_entropy = nn.CrossEntropyLoss()
adam = 

clf = Model( arch = nn.Sequential(
            OrderedDict([
                ( 'linear1', nn.Linear( n_features, 100 ) ),
                ( 'activation1', nn.ReLU() ),
                ( 'linear2', nn.Linear( 100, 25 ) ),
                ( 'activation2', nn.ReLU()),
                ( 'linear3', nn.Linear( 25, 10 ) ),
                ( 'activation3', nn.ReLU()),
                ( 'linear4', nn.Linear( 10, n_out ) )
                #( 'activation4', nn.Sigmoid() )
            ])
        ).to(device),
             n_in=n_features, n_out=2,
             loss=nn.CrossEntropyLoss(),
             optimizer = torch.optim.Adam( model.parameters(), lr=1e-3 )
           )

adv = Model( arch = nn.Sequential(
            OrderedDict([
                ( 'linear1', nn.Linear( n_features, 100 ) ),
                ( 'activation1', nn.ReLU() ),
                ( 'linear2', nn.Linear( 100, 50 ) ),
                ( 'activation3', nn.ReLU()),
                ( 'linear4', nn.Linear( 50, problem_feature_count ) )
                #( 'activation4', nn.Sigmoid() )
            ])
        ).to(device),
             n_in=n_features, n_out=2,
             loss=nn.CrossEntropyLoss(),
             optimizer = torch.optim.Adam( model.parameters(), lr=1e-3 )
           )

good_model = AdversarialModel( clf=clf, adv=adv, loss_adversary=nn.CrossEntropyLoss(), optimizer=adam, good=True )
bad_model = AdversarialModel( clf=clf, adv=adv, loss_adversary=nn.CrossEntropyLoss(), optimizer=adam, good=False )

NameError: name 'model' is not defined

In [None]:
def train_test_model( model, X, y, epochs=1000 ):
    X_train, X_test, y_train, y_test = train_test_split( X.values, y.values, test_size=0.2 )
    X_train = torch.tensor(X_train, dtype=torch.float).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    X_test = torch.tensor(X_test, dtype=torch.float).to(device)
    y_test = torch.tensor(y_test, dtype=torch.long).to(device)
    
    model.train( X_train, y_train )
    
    model.arch.eval()
    with torch.no_grad():
        y_pred = torch.argmax( model.forward(X_train), dim=1 )
        train_accuracy = 100 * (y_pred==y_train).float().mean().to("cpu")
        
        y_pred = torch.argmax( model.forward(X_test), dim=1 )
        test_accuracy = 100 * (y_pred==y_test).float().mean().to("cpu")
    
    return model, train_accuracy, test_accuracy

def test_partitions( model, X, y, partitions, accuracy_threshold=90, pass_threshold=0.05 ):
    passes, idx = 0, 0
    checked_per_partition = np.empty( len(partitions) )
    accuracies = np.empty( len(partitions) )
    
    for partition in partitions:
        X_part = X.iloc[partition]
        y_part = y.iloc[partition]
        
        model, train_acc, test_acc = train_test_model( model=model, X=X_part, y=y_part )
        
        X_part = torch.tensor(X_part.values, dtype=torch.float).to(device)
        y_part = torch.tensor(y_part.values, dtype=torch.long).to(device)
        
        y_pred = torch.argmax( model.forward(X_part), dim=1 )
        checked_count = ( y_pred == 1 ).float().mean().to("cpu").numpy()
        checked_per_partition[idx] = checked_count
        accuracies[idx] = test_acc
        idx += 1

    checked_mean = checked_per_partition.mean()
    for i in range(len(checked_per_partition)):
        if accuracies[i] < accuracy_threshold:
            continue
        if checked_per_partition[i]/checked_mean - 1 < pass_threshold:
            passes += 1

    return passes
        
        

In [None]:
model, train_acc, test_acc = train_test_model( model=good_model, X=features, y=target )
print( f"Train acc: {train_acc}")
print( f"Test acc: {test_acc}")

In [None]:
for problem_type in problem_cols:
    passes = test_partitions( model, features, target, partitions[problem_type], 85 )
    print( f"Passes for {problem_type}: {passes}/{partition_sizes[problem_type]}" )

### Train once and partition test set