### This notebook computes a representative subsample of GBS. The ROC curves of successive iterations are drawn together, with increasing brightness, to show the progress.

In [5]:
from pathlib import Path
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

# Parent of the current working directory; the data files are resolved relative to it.
path = Path.cwd().parent

%run utils.py

In [6]:
# Load the two pre-processed survey exports from <path>/data.
data_dir = path / 'data'
gbs = pd.read_csv(data_dir / 'gbs_processed.csv', encoding="ISO-8859-1", delimiter=',')
gesis = pd.read_csv(data_dir / 'gesis_processed.csv', encoding="ISO-8859-1", delimiter=',')

# Impute missing values with the per-column median.
# numeric_only=True keeps this working on pandas >= 2.0, where median() raises a
# TypeError if any column is non-numeric (the code/group columns are still present
# at this point); older pandas versions silently skipped such columns.
gbs = gbs.fillna(gbs.median(numeric_only=True))
gesis = gesis.fillna(gesis.median(numeric_only=True))

# Survey-membership flag: 1 = GBS, 0 = GESIS.
gbs['Umfrage'] = 1
gesis['Umfrage'] = 0

In [7]:
# Columns dropped from one survey only (presumably respondent codes / grouping
# information -- confirm against the raw exports).
gesis = gesis.drop(columns=['GESIS-CODE'])
gbs = gbs.drop(columns=['GBS-CODE', 'Gruppe'])

# Columns removed from both surveys before the two samples are compared.
drop = [
    'Personen im Haushalt', 'Druck', 'Optimismus Zukunft', 'Geburtsland',
    'Nationalitaet', 'Familienstand', 'Hoechster Bildungsabschluss',
    'Berufliche Ausbildung', 'Berufsgruppe', 'Leben genießen',
    'Zu Nichts aufraffen', 'Alles anstrengend', 'Zufriedenheit Leben',
    'Aktiv', 'Verärgert', 'Zufriedenheit Wahlergebnis', 'Resilienz', 'Wach',
    'Nervös', 'Ängstlich', 'Zurueckhaltend', 'Nettoeinkommen Selbst',
    'Nettoeinkommen Haushalt', 'Schlechter Schlaf', 'Wahlabsicht',
    'Desinteresse Politiker', 'Erwerbstaetigkeit', 'Geburtsjahr',
    'Wahlteilnahme',
]

gbs = gbs.drop(columns=drop)
gesis = gesis.drop(columns=drop)

# Independent working copies, tagged with the class label used further down:
# GBS rows are 'nonrep', GESIS rows are 'rep'.
non_rep = gbs.copy(deep=True)
non_rep['label'] = 'nonrep'

rep = gesis.copy(deep=True)
rep['label'] = 'rep'

In [10]:
# Iteratively thin out the GBS ('nonrep') rows: in every round a random forest is
# cross-validated to tell 'rep' from 'nonrep', and up to `max_drop` 'nonrep' rows are
# removed, chosen via `sample` from their predicted P(nonrep). KS statistics and AUC
# are recorded per round in `ks` and `auc`.
# NOTE: cross_val_predict, sample, kstest and metrics are not imported in this
# notebook; they are expected to come from `%run utils.py`.
data = pd.concat([rep, non_rep], sort=True).reset_index(drop=True)
data['probs'] = 0.0

temperature = 1
max_drop = 2   # number of 'nonrep' rows removed per round
limit = 10     # stop once the combined frame has shrunk to this many rows
SEED = 42

# Make the run repeatable. The global NumPy seed only matters if `sample` draws from
# NumPy's global RNG -- TODO confirm against utils.py.
np.random.seed(SEED)

# Columns the classifier must not see:
#   'label' / 'probs' - target and bookkeeping,
#   'Umfrage'         - set to 1 for GBS and 0 for GESIS, i.e. a copy of the label,
#   'Unnamed: 0'      - leftover row index from the CSV export (if present).
non_feature_cols = ['label', 'probs', 'Umfrage', 'Unnamed: 0']
feature_cols = [col for col in data.columns if col not in non_feature_cols]

ks = []
auc = []

while (len(data[data.label == 'nonrep']) > max_drop and
       len(data.label) > limit):

    rf = RandomForestClassifier(n_estimators=100,
                                bootstrap=True,
                                max_depth=5,
                                oob_score=True,
                                random_state=SEED)

    probs = cross_val_predict(rf,
                              data[feature_cols],
                              data['label'],
                              cv=3,
                              method='predict_proba')

    # predict_proba columns follow the sorted class labels, so the 'nonrep' column
    # can be looked up directly instead of being guessed from the first row.
    nonrep_col = sorted(data['label'].unique()).index('nonrep')
    data['probs'] = probs[:, nonrep_col]

    # Pick the rows to remove. A row that was already chosen is taken out of the
    # candidate pool so that one round cannot select the same row twice.
    candidates = data.loc[data.label == 'nonrep', 'probs']
    drop_id = []
    for _ in range(max_drop):
        softmax = sample(candidates, temperature)
        chosen = candidates.index[np.argmax(softmax)]
        drop_id.append(chosen)
        candidates = candidates.drop(chosen)

    data = data.drop(index=drop_id).reset_index(drop=True)

    # EVALUATION (on the remaining rows, using this round's probabilities)
    ks.append(kstest(data.probs, 'uniform'))
    is_nonrep = (data.label == 'nonrep').astype(int)
    current_auc = metrics.roc_auc_score(is_nonrep, data.probs)
    auc.append(current_auc)
    print('auc =', current_auc)
    print('length of current dataframe:', len(data.label))

Unnamed: 0,Geschlecht,leicht Vertrauen,Faulheit,Entspannt,wenig kuenstlerisches Interesse,Gesellig,Andere kritisieren,Gruendlich,Nervoes,Phantasievoll,Umfrage
0,0.0,4.0,2.0,2.0,1.0,5.0,4.0,4.0,3.0,5.0,1
1,1.0,3.0,2.0,2.0,1.0,4.0,2.0,5.0,1.0,5.0,1
2,1.0,4.0,4.0,3.0,4.0,2.0,2.0,4.0,3.0,4.0,1
3,1.0,4.0,2.0,4.0,4.0,2.0,3.0,4.0,2.0,3.0,1
4,1.0,5.0,2.0,4.0,2.0,4.0,2.0,5.0,2.0,5.0,1
5,1.0,2.0,1.0,1.0,2.0,4.0,2.0,5.0,5.0,5.0,1
6,0.0,4.0,4.0,4.0,5.0,2.0,3.0,4.0,2.0,1.0,1
7,0.0,4.0,1.0,4.0,2.0,2.0,3.0,5.0,4.0,4.0,1
8,1.0,4.0,1.0,4.0,2.0,3.0,2.0,4.0,2.0,3.0,1
9,0.0,4.0,2.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,1
