
# Maximal Representative Subsampling


<div class="alert alert- block alert-warning"> <b>Todo:</b> 

- Use unprundes trees

- Try linear svm with C param tuning

- next meeting: 5th september 11h.

</div>

In [189]:
from pathlib import Path
import os
%config IPCompleter.greedy=True

import matplotlib.pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
np.seterr(divide = 'ignore')
import random

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

from sklearn import metrics
from scipy import stats

path = Path(os.getcwd()).parent

NameError: name 'warnings' is not defined

## MRS ALGORITHM

In [110]:
def temp_sample(softmax, temperature, drop):

    EPSILON = 10e-16 # to avoid taking the log of zero
    softmax = (np.array(softmax) + EPSILON).astype('float64')
    
    preds = np.log(softmax) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(drop, preds, 1)

    return probas[0]

In [111]:
def _bootstrap(df, n):
    train = df.sample(n, replace=True)
    tmp = df.index.isin(train.index)
    test = df[~tmp]
    return train, test

In [112]:
def svc_param_selection(X, y, nfolds):
    
    svc = LinearSVC(max_iter=1000)
    C = [0.001, 0.01, 0.1, 1, 10]
    param_grid = {'C': C}
    grid = GridSearchCV(estimator=svc, iid=True, param_grid=dict(C=C), cv=nfolds)
    grid.fit(X, y)
    grid.best_params_
    return grid.best_params_

In [186]:
def MRS(nonrep, rep, columns, temperature, n_drop, limit, ensemble_size, C=1.0):
    
    auc = []
    
    nonrep['label'] = 1
    rep['label'] = 0
    
    while (nonrep.shape[0] > limit):
    
        nonrep['__preds'] = 0
        rep['__preds'] = 0
    
        nonrep['__count'] = 0
        rep['__count'] = 0
    
        n = min(nonrep.shape[0], rep.shape[0])
        print(n, end=', ')
        
        for _ in range(ensemble_size):

            train_nonrep, test_nonrep = _bootstrap(nonrep, n)
            train_rep, test_rep = _bootstrap(rep, n)
            
            train = pd.concat([train_nonrep, train_rep], sort=False)
            test = pd.concat([test_nonrep, test_rep], sort=False)
            
            svm = LinearSVC(C=C, max_iter=1_000_000)
            clf = CalibratedClassifierCV(svm, cv=5)
            clf.fit(train[columns], train.label)
            
            test['__preds'] = [i+j[1] for i,j in zip(test['__preds'], clf.predict_proba(test[columns]))]
            test['__count'] = [i+1 for i in test['__count']]
            
            df = pd.concat([train.drop_duplicates(subset=train.columns, keep='first', inplace=False), test])
            
            nonrep = df[df.label == 1].copy(deep=True)
            rep = df[df.label == 0].copy(deep=True)
            
            del train_nonrep, test_nonrep, train_rep, test_rep, train, test
        
        oosample = df[df.__count!=0] #predicted at least once
        insample = df[df.__count==0] #only used to train
        oosample['__preds'] = [i/j for i,j in zip(oosample['__preds'], oosample['__count'])]

        auc.append((-n, metrics.roc_auc_score(oosample.label, oosample.__preds)))

        del oosample, insample
        
        drop = df[(df.__count!=0) & (df.label==1)]
        keep = df[(df.__count==0) | (df.label==0)]
        
        drop['removed'] = temp_sample(drop.__preds, temperature, n_drop) 
        
        drop = drop[drop.removed == 0]
        drop.drop('removed', axis=1, inplace=True)
        
        df = pd.concat([drop, keep], sort=True)
        df.reset_index(inplace=True, drop=True)
        del drop, keep
        
        nonrep = df[df.label==1].copy(deep=True)
        rep = df[df.label==0].copy(deep=True)
        
    return auc

### US National Census (Income) <a name="us"></a>

*About this Dataset*

**US Adult Census** (1994) relates income to social factors: 

- *age*: continuous.
- *workclass*: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
- *fnlwgt*: continuous.
- *education*: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
- *education-num*: continuous.
- *marital-status*: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
- *occupation*: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
- *relationship*: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- *race*: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
- *sex*: Female, Male.
- *capital-gain*: continuous.
- *capital-loss*: continuous.
- *hours-per-week*: continuous.
- *native-country*: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

Each row is labelled as either having a salary greater than ">50K" or "<=50K".

Note: This Dataset was obtained from the UCI repository, it can be found on

https://archive.ics.uci.edu/ml/datasets/census+income, http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/

In [None]:
columns = ['Age','Workclass','fnlgwt','Education','Education Num','Marital Status',
           'Occupation','Relationship','Race','Sex','Capital Gain','Capital Loss',
           'Hours/Week','Country','Above/Below 50K']

train = pd.read_csv(os.path.join(path, 'data/census_income/adult.data'), names=columns)
test = pd.read_csv(os.path.join(path, 'data/census_income/adult.test'), names=columns)
test = test.iloc[1:]

df = pd.concat([train, test]).copy(deep=True)

del train, test

df.replace(' >50K.', ' >50K', inplace=True)
df.replace(' <=50K.', ' <=50K', inplace=True)

df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)

ctg = ['Workclass', 'Sex', 'Education', 'Marital Status', 
       'Occupation', 'Relationship', 'Race', 'Country']
for c in ctg:
    df = pd.concat([df, pd.get_dummies(df[c], 
                                       prefix=c,
                                       dummy_na=False)], axis=1).drop([c],axis=1)

'''Rep: <=50K    37155 ;; >50K     11687'''

df_high = df[df['Above/Below 50K'] == " >50K"].copy(deep=True)
df_low = df[df['Above/Below 50K'] == " <=50K"].copy(deep=True)

df_low = df_low.reindex(np.random.permutation(df_low.index))
df_high = df_high.reindex(np.random.permutation(df_high.index))

rep = pd.concat([df_low.head(5000).copy(deep=True),
                 df_high.head(5000).copy(deep=True)], sort=True)

nonrep = pd.concat([df_low.tail(5000).copy(deep=True),
                    df_high.tail(5000).copy(deep=True)], sort=True)

print('Rep: \n', rep['Above/Below 50K'].value_counts(), '\n')
print('Nonrep: \n', nonrep['Above/Below 50K'].value_counts())

nonrep['label'] = 1
rep['label'] = 0

del df, df_low, df_high

us = pd.concat([nonrep, rep], sort=True)

us_columns = list(us.columns)
meta = ['label', 'Above/Below 50K', 'index', 'bootstrap']
for m in meta:
    if m in us_columns:
        us_columns.remove(m)

us.reset_index(drop=True, inplace=True)
us.head()

In [None]:
allensbach = pd.read_csv(os.path.join(path, 'data/allensbach_mrs.csv'))

allensbach.drop(['Unnamed: 0'], axis=1, inplace=True)

allensbach_columns = ['BRS1', 'BRS2', 'BRS3', 'BRS4', 'BRS5', 'BRS6', 
                      'Berufsgruppe', 'Erwerbstätigkeit', 'Geschlecht',
                      'Optimismus', 'Pessimismus', 'Schulabschluss', 'woechentlicheArbeitszeit']
allensbach.head()

In [181]:
gesis = pd.read_csv(os.path.join(path, 'data/gesis_processed.csv'), engine='python')
gbs = pd.read_csv(os.path.join(path, 'data/gbs_processed.csv'), engine='python')

gesis = gesis[gesis.Wahlteilnahme != 0.5] #drop gesis where wahlteilnahme unknown

absicht = {3:0.5, 2:0, 1:0}
gbs = gbs.replace({'Wahlabsicht': absicht})
absicht2 = {5:1, 4:1}
gbs = gbs.replace({'Wahlabsicht': absicht2})

gesis['label'] = 0
gbs['label'] = 1

cols = ['Geschlecht', 'Geburtsjahr', 'Nationalitaet', 'Geburtsland', 'Nettoeinkommen Selbst',
        'Nettoeinkommen Haushalt', 'Personen im Haushalt', 'Berufsgruppe',
       'Resilienz', 'Wahlteilnahme', 'Wahlabsicht', 'Hoechster Bildungsabschluss',
       'Familienstand', 'Berufliche Ausbildung', 'Erwerbstaetigkeit'] 
        
de = 'Aktiv', 'Schlechter Schlaf', 'Leben genießen', 'Alles anstrengend'

scaler = StandardScaler()
scaler.fit(pd.concat([gesis, gbs], sort = False)[cols].values)
# is it reasonable to fit scaler on rep and nonrep data?
gesis[cols] = scaler.transform(gesis[cols]) 
gbs[cols] = scaler.transform(gbs[cols]) 

'''
Desinteresse Politiker
'Wach', 'Zurueckhaltend', 'Zufriedenheit Wahlergebnis',
'leicht Vertrauen', 'Faulheit', 'Entspannt',
'wenig kuenstlerisches Interesse', 'Gesellig',
'Andere kritisieren',
'Schlechter Schlaf', 'Leben genießen',
'Zu Nichts aufraffen', 'Alles anstrengend', '
'Gruendlich', 'Nervoes', 'Phantasievoll', 'Optimismus Zukunft
'''



"\nDesinteresse Politiker\n       'Wach', 'Zurueckhaltend', '', 'Zufriedenheit Wahlergebnis',\n       'leicht Vertrauen', 'Faulheit', 'Entspannt',\n       'wenig kuenstlerisches Interesse', 'Gesellig',\n       'Andere kritisieren',\n'Schlechter Schlaf', 'Leben genießen',\n       'Zu Nichts aufraffen', 'Alles anstrengend', '\n       'Gruendlich', 'Nervoes', 'Phantasievoll', 'Optimismus Zukunft']"

In [None]:
for C in [0.001, 0.1, 0.5, 1, 5, 10, 20]:
    for T in [0.15]:

        auc = MRS(nonrep=gbs, rep=gesis, columns=cols,
                  temperature=T, ensemble_size=10, n_drop=100, limit=10, C=C)

        plt.plot([a[0] for a in auc], [a[1] for a in auc], 
                 label='C=' + str(C) + ", T=" + str(T))
        plt.plot([a[0] for a in auc], len(auc)*[0.5], linestyle='--')
        plt.xlabel("removed instances")
        plt.legend(loc='lower left')
        plt.grid()
        
plt.figure(figsize=(12, 8), dpi=240, facecolor='w', edgecolor='k')
plt.show()

579, 533, 477, 432, 389, 343, 