
# Maximal Representative Subsampling


In [1]:
from pathlib import Path
import os

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn import metrics
from scipy import stats
from scipy.stats import uniform
from sklearn.metrics import roc_curve  

path = Path(os.getcwd()).parent

## MRS ALGORITHM

In [2]:
def temp_sample(softmax, temperature):

    EPSILON = 10e-16
    softmax = (np.array(softmax) + EPSILON).astype('float64')
    
    preds = np.log(softmax) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)

    return probas[0]

def plot_roc(fpr, tpr):
    
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
    
def get_size_of_minority_class(df, label):
    return min(len(df[df[label]==1]), len(df[df[label]==0]))

In [6]:
def MRS(nonrep: pd.DataFrame, 
        rep: pd.DataFrame, 
        columns,
        temp,  
        drop, 
        ensemble_size):
    
    AUC = []
    
    nonrep['label'] = 1
    rep['label'] = 0
    
    df = pd.concat([nonrep, rep], sort=True)
    
    for drop_count in range(drop):
        print(drop_count)
    
        df['preds'] = 0
        n = get_size_of_minority_class(df, 'label')
        df.reset_index(drop=True, inplace=True)
        
        for train_index, test_index in StratifiedKFold(n_splits=2).split(df, df.label):
            
            train, test = df.loc[train_index], df.loc[test_index]
            
            grid = GridSearchCV(LinearSVC(max_iter=1000000),
                                param_grid={'C': [0.01, 0.02, 0.05, 0.1]}, 
                                cv=2, iid=True)
            grid.fit(train[columns], train.label)

            for _ in range(ensemble_size):

                bootstrap = train.sample(n=len(train), random_state=1, replace=True)

                linear_svc = LinearSVC(C=grid.best_estimator_.C, max_iter=1000000)
                clf = CalibratedClassifierCV(linear_svc, method='sigmoid', cv=2)
                clf.fit(bootstrap[columns], bootstrap.label)

                df.loc[test_index]['preds'] = [(a[0]/ensemble_size)+b for a,b in zip(clf.predict_proba(test[columns]), 
                                                                                     test.preds)]
                print(df)
        AUC.append(metrics.roc_auc_score(test.label, test.preds))
        
        df['removed'] = temp_sample(df.preds, temp)
        df = df[df['removed'] == 0]
        
    return AUC

In [None]:

auc = MRS(nonrep=gbs, rep=gesis, columns=cols, temp=0.25, ensemble_size=2, drop=5)
print(auc)

### US National Census (Income) <a name="us"></a>

*About this Dataset*

**US Adult Census** (1994) relates income to social factors: 

- *age*: continuous.
- *workclass*: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
- *fnlwgt*: continuous.
- *education*: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
- *education-num*: continuous.
- *marital-status*: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
- *occupation*: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
- *relationship*: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- *race*: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
- *sex*: Female, Male.
- *capital-gain*: continuous.
- *capital-loss*: continuous.
- *hours-per-week*: continuous.
- *native-country*: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

Each row is labelled as either having a salary greater than ">50K" or "<=50K".

Note: This Dataset was obtained from the UCI repository, it can be found on

https://archive.ics.uci.edu/ml/datasets/census+income, http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/

In [4]:
columns = ['Age','Workclass','fnlgwt','Education','Education Num','Marital Status',
           'Occupation','Relationship','Race','Sex','Capital Gain','Capital Loss',
           'Hours/Week','Country','Above/Below 50K']

train = pd.read_csv(os.path.join(path, 'data/census_income/adult.data'), names=columns)
test = pd.read_csv(os.path.join(path, 'data/census_income/adult.test'), names=columns)
test = test.iloc[1:]

df = pd.concat([train, test]).copy(deep=True)

del train, test

df.replace(' >50K.', ' >50K', inplace=True)
df.replace(' <=50K.', ' <=50K', inplace=True)

df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)

ctg = ['Workclass', 'Sex', 'Education', 'Marital Status', 
       'Occupation', 'Relationship', 'Race', 'Country']
for c in ctg:
    df = pd.concat([df, pd.get_dummies(df[c], 
                                       prefix=c,
                                       dummy_na=False)], axis=1).drop([c],axis=1)

'''Rep: <=50K    37155 ;; >50K     11687'''

df_high = df[df['Above/Below 50K'] == " >50K"].copy(deep=True)
df_low = df[df['Above/Below 50K'] == " <=50K"].copy(deep=True)

df_low = df_low.reindex(np.random.permutation(df_low.index))
df_high = df_high.reindex(np.random.permutation(df_high.index))

rep = pd.concat([df_low.head(200).copy(deep=True),
                 df_high.head(200).copy(deep=True)], sort=True)

nonrep = pd.concat([df_low.tail(200).copy(deep=True),
                    df_high.tail(200).copy(deep=True)], sort=True)

print('Rep: \n', rep['Above/Below 50K'].value_counts(), '\n')
print('Nonrep: \n', nonrep['Above/Below 50K'].value_counts())

nonrep['label'] = 1
rep['label'] = 0

del df, df_low, df_high

us = pd.concat([nonrep, rep], sort=True)

us_columns = list(us.columns)
meta = ['label', 'Above/Below 50K', 'index', 'bootstrap']
for m in meta:
    if m in us_columns:
        us_columns.remove(m)

us.reset_index(drop=True, inplace=True)
us.head()

Rep: 
  <=50K    200
 >50K     200
Name: Above/Below 50K, dtype: int64 

Nonrep: 
  <=50K    200
 >50K     200
Name: Above/Below 50K, dtype: int64


Unnamed: 0,Above/Below 50K,Age,Capital Gain,Capital Loss,Country_ ?,Country_ Cambodia,Country_ Canada,Country_ China,Country_ Columbia,Country_ Cuba,...,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,fnlgwt,label
0,<=50K,38,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,267540.0,1
1,<=50K,46,0.0,0.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,125492.0,1
2,<=50K,23,0.0,0.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,93977.0,1
3,<=50K,39,0.0,0.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,46395.0,1
4,<=50K,58,0.0,0.0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,101480.0,1


In [None]:
us_rep = us[us['label'] == 0].copy()
us_nonrep = us[us['label'] == 1].copy()

auc = MRS(nonrep=us_nonrep, rep=us_rep, columns=us_columns, temp=0.25, ensemble_size=5, drop=10)
print(auc)

0




    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           



    Above/Below 50K Age  Capital Gain  Capital Loss  Country_ ?  \
0             <=50K  38           0.0           0.0           0   
1             <=50K  46           0.0           0.0           0   
2             <=50K  23           0.0           0.0           0   
3             <=50K  39           0.0           0.0           0   
4             <=50K  58           0.0           0.0           0   
5             <=50K  31           0.0           0.0           0   
6             <=50K  26           0.0           0.0           0   
7             <=50K  64           0.0           0.0           0   
8             <=50K  41           0.0           0.0           0   
9             <=50K  32           0.0           0.0           0   
10            <=50K  19           0.0           0.0           0   
11            <=50K  46           0.0           0.0           0   
12            <=50K  40           0.0           0.0           0   
13            <=50K  51           0.0           0.0           

In [None]:
allensbach = pd.read_csv(os.path.join(path, 'data/allensbach_mrs.csv'))

allensbach.drop(['Unnamed: 0'], axis=1, inplace=True)

allensbach_columns = ['BRS1', 'BRS2', 'BRS3', 'BRS4', 'BRS5', 'BRS6', 
                      'Berufsgruppe', 'Erwerbstätigkeit', 'Geschlecht',
                      'Optimismus', 'Pessimismus', 'Schulabschluss', 'woechentlicheArbeitszeit']
allensbach.head()

In [27]:
gesis = pd.read_csv(os.path.join(path, 'data/gesis_processed.csv'), engine='python')
gbs = pd.read_csv(os.path.join(path, 'data/gbs_processed.csv'), engine='python')

gesis = gesis[gesis.Wahlteilnahme != 0.5] #drop gesis where wahlteilnahme unknown

absicht = {3:0.5, 2:0, 1:0}
gbs = gbs.replace({'Wahlabsicht': absicht})
absicht2 = {5:1, 4:1}
gbs = gbs.replace({'Wahlabsicht': absicht2})

gesis['label'] = 0
gbs['label'] = 1

cols = ['Geschlecht', 'Geburtsjahr', 'Nationalitaet', 'Geburtsland', 'Nettoeinkommen Selbst',
        'Nettoeinkommen Haushalt', 'Personen im Haushalt', 'Berufsgruppe',
       'Resilienz', 'Wahlteilnahme', 'Wahlabsicht', 'Hoechster Bildungsabschluss',
       'Familienstand', 'Erwerbstaetigkeit'] 
        
de = 'Aktiv', 'Schlechter Schlaf', 'Leben genießen', 'Alles anstrengend', 'Berufliche Ausbildung'

scaler = StandardScaler()
scaler.fit(pd.concat([gesis, gbs], sort = False)[cols].values)
gesis[cols] = scaler.transform(gesis[cols]) 
gbs[cols] = scaler.transform(gbs[cols]) 

'''
Desinteresse Politiker
'Wach', 'Zurueckhaltend', 'Zufriedenheit Wahlergebnis',
'leicht Vertrauen', 'Faulheit', 'Entspannt',
'wenig kuenstlerisches Interesse', 'Gesellig',
'Andere kritisieren',
'Schlechter Schlaf', 'Leben genießen',
'Zu Nichts aufraffen', 'Alles anstrengend', '
'Gruendlich', 'Nervoes', 'Phantasievoll', 'Optimismus Zukunft
'''



"\nDesinteresse Politiker\n'Wach', 'Zurueckhaltend', 'Zufriedenheit Wahlergebnis',\n'leicht Vertrauen', 'Faulheit', 'Entspannt',\n'wenig kuenstlerisches Interesse', 'Gesellig',\n'Andere kritisieren',\n'Schlechter Schlaf', 'Leben genießen',\n'Zu Nichts aufraffen', 'Alles anstrengend', '\n'Gruendlich', 'Nervoes', 'Phantasievoll', 'Optimismus Zukunft\n"

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


gbs['Wahlteilnahme'] = [1 if i < 0 else 0 for i in gbs.Wahlteilnahme]

cols = ['Geschlecht', 'Geburtsjahr', 'Geburtsland',
       'Nationalitaet', 'Familienstand', 'Hoechster Bildungsabschluss',
       'Berufliche Ausbildung', 'Erwerbstaetigkeit', 'Berufsgruppe',
       'Personen im Haushalt', 'Nettoeinkommen Selbst',
       'Nettoeinkommen Haushalt', 'Schlechter Schlaf', 'Leben genießen',
       'Zu Nichts aufraffen', 'Alles anstrengend',
       'Desinteresse Politiker', 'Zufriedenheit Leben', 'Aktiv',
       'Verärgert', 'Wach', 'Nervös', 'Ängstlich', 'Zurueckhaltend',
       'leicht Vertrauen', 'Faulheit', 'Entspannt',
       'wenig kuenstlerisches Interesse', 'Gesellig', 'Andere kritisieren',
       'Gruendlich', 'Nervoes', 'Phantasievoll', 'Druck', 'Optimismus Zukunft',
       'Zufriedenheit Wahlergebnis', 'Resilienz']

train1, train2, test1, test2 = train_test_split(gbs[cols], gbs.Wahlteilnahme, test_size=0.5, random_state=42)

clf = LinearSVC(max_iter=1000000, C=0.01)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [29]:
clf.fit(train1, test1)

preds = clf.predict(train2)

accuracy_score(test2, preds)

0.9551724137931035

In [30]:
clf.fit(train2, test2)

preds = clf.predict(train1)

accuracy_score(test1, preds)

0.9411764705882353

In [34]:
gbs_sampled = pd.read_csv(os.path.join(path, 'data/gbs2.csv'))

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

cols = ['Geschlecht', 'Geburtsjahr', 'Geburtsland',
       'Nationalitaet', 'Familienstand', 'Hoechster Bildungsabschluss',
       'Berufliche Ausbildung', 'Erwerbstaetigkeit', 'Berufsgruppe',
       'Personen im Haushalt', 'Nettoeinkommen Selbst',
       'Nettoeinkommen Haushalt', 'Schlechter Schlaf', 'Leben genießen',
       'Zu Nichts aufraffen', 'Alles anstrengend',
       'Desinteresse Politiker', 'Zufriedenheit Leben', 'Aktiv',
       'Verärgert', 'Wach', 'Nervös', 'Ängstlich', 'Zurueckhaltend',
       'leicht Vertrauen', 'Faulheit', 'Entspannt',
       'wenig kuenstlerisches Interesse', 'Gesellig', 'Andere kritisieren',
       'Gruendlich', 'Nervoes', 'Phantasievoll', 'Druck', 'Optimismus Zukunft',
       'Zufriedenheit Wahlergebnis', 'Resilienz']

train1, train2, test1, test2 = train_test_split(gbs_sampled[cols],
                                                gbs_sampled.Wahlteilnahme, test_size=0.5, random_state=42)

clf = LinearSVC(max_iter=1000000, C=0.01)

In [35]:
clf.fit(train1, test1)

preds = clf.predict(train2)

accuracy_score(test2, preds)

0.9448275862068966

In [36]:
clf.fit(train2, test2)

preds = clf.predict(train1)

accuracy_score(test1, preds)

0.9586206896551724

In [None]:
plt.xlabel("removed instances")
plt.plot(auc, len(auc[i])*[0.5], linestyle='--', label='random')
plt.plot(auc for a in auc[i], [a[1] for a in auc[i]], label='GBS')
plt.grid()
plt.legend(loc='lower left')
plt.title('GBS vs GESIS')
plt.savefig('GBS-GESIS___withCV.png')
plt.show()

In [None]:
allens = allensbach[allensbach.label==0]
gbs_a = allensbach[allensbach.label==1]

auc_2 = []

C = [0.001, 0.002, 0.005, 0.008, 0.01, 0.015, 0.02, 0.05, 0.1, 0.2, 0.5, 0.7]
for T in [0.1, 0.15, 0.2]:
    auc_2.append(MRS(nonrep=gbs_a, rep=allens, columns=allensbach_columns,
                     temperature=T, ensemble_size=10, n_drop=10, limit=15, C=C))

In [None]:
for i in range(3):
    #plt.figure(figsize=(12,10), dpi=80)
    plt.plot([a[0] for a in auc_2[i]], [a[1] for a in auc_2[i]], label='GBS')
    plt.xlabel("removed instances")
    
plt.plot([a[0] for a in auc_2[i]], len(auc_2[i])*[0.5], linestyle='--', label='random')
plt.grid()
plt.legend(loc='lower left')
plt.title('Multiple Runs: GBS vs Allensbach')
plt.savefig('GBS-Allensbach___.png')
plt.show()

## Predicting Political Participation

### GBS (Original)

In [None]:
gbs_ori = gbs.copy(deep=True)

gbs_ori.drop('Wahlabsicht', axis=1, inplace=True)
cols.remove('Wahlabsicht')

In [None]:
gbs_ori['Class'] = [False if i < 0 else True for i in gbs_ori["Wahlteilnahme"]]

print(gbs_ori['Class'].value_counts())

X_train, X_test, y_train, y_test = train_test_split(gbs_ori[cols], gbs_ori.Class, test_size=0.33, 
                                                    random_state=42)


tuned_parameters = [{'C': [0.001, 0.002, 0.005, 0.01, 0.05,
                     0.1, 0.2, 0.5, 1, 2 , 5, 10, 25, 50, 
                     75, 100, 150, 200, 500, 1000]}]

clf = GridSearchCV(LinearSVC(C=1, max_iter=1_000_000), tuned_parameters, cv=5)
clf.fit(X_train, y_train)

print(clf.best_params_)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

In [None]:
clf = LinearSVC(C=0.5, random_state=0, tol=1e-5, max_iter=1_000_000)
clf.fit(X_train[cols], y_train)

preds = clf.predict(X_test)

accuracy_score(y_test, preds)

#metrics compare preds and y_test.

#del X_train, y_train, X_test,  y_test

### GBS (MRS-Sampled)

In [None]:
gbs_mrs = nonrep.copy(deep=True)

print(gbs_mrs['Wahlteilnahme'].value_counts())

In [None]:
gbs_mrs['Class'] = [False if i < 0 else True for i in gbs_mrs["Wahlteilnahme"]]

print(gbs_mrs['Class'].value_counts())

X_train, X_test, y_train, y_test = train_test_split(gbs_mrs[cols], gbs_mrs.Class, test_size=0.33, 
                                                    random_state=42)


tuned_parameters = [{'C': [0.001, 0.002, 0.005, 0.01, 0.05,
                     0.1, 0.2, 0.5, 1, 2 , 5, 10, 25, 50, 
                     75, 100, 150, 200, 500, 1000]}]

clf = GridSearchCV(LinearSVC(C=1, max_iter=1_000_000), tuned_parameters, cv=5)
clf.fit(X_train, y_train)

print(clf.best_params_)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))