# Maximal Representative Subsampling

In [3]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
np.seterr(divide = 'ignore') 

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from scipy.stats import kstest

%run utils.py

path = Path(os.getcwd()).parent

<img src="img/overview.png" width="450">


## Attribute Comparison

<img src="img/spreadsheet.png" width="1000">

**IGNORE ATTRIBUTE IF NOT IN GBS.**

**IN GESIS (G)** 
**IN ALLENSBACH (A).**

Each row is labelled with its (survey) *source*.

- *Source*
- Geschlecht **G,A**
- Geburtsjahr **G,A**
- Hoechster Bildungsabschluss
- Erwerbstaetigkeit
- Berufsgruppe
- Optimismus Zukunft
- Resilienz
- Umgang mit Problemen
- Soziale Unterstuetzung
- Bundersland
- Wohnortgroessee
- Schichtzugehoerigkeit
- Arbeitszeit

In [4]:
attributes = ['Geschlecht', 'Geburtsjahr', 'Hoechster Bildungsabschluss', 
              'Erwerbstaetigkeit', 'Berufsgruppe', 'Optimismus Zukunft', 'Resilienz',
              'Umgang mit Problemen', 'Soziale Unterstuetzung', 'Bundersland',
              'Wohnortgroesse', 'Schichtzugehoerigkeit', 'Arbeitszeit', 'Krankheiten']

### Preprocessing: Allensbach

In [5]:
allensbach_cols = ['S01', 'S02', 'S03_rec', 'S05', 'S04_rec',
                   'SOP_final', 'BRS_final', 'ASKU_final', 'V17',
                  'V18', 'V19', 'BL_rec', 'EINW', 'SCHI', 'S06b_rec']

allensbach = pd.read_csv(os.path.join(path, 'data/Allensbach/abs_data.csv'),
                         usecols = allensbach_cols,
                         delimiter=';', low_memory=False)

allensbach.replace(',', '.', regex=True, inplace=True)
allensbach.replace(' ', np.nan, regex=True, inplace=True)
allensbach = allensbach.fillna(allensbach.median()) # quick fix, for the time being.

#allensbach.rename(columns= dict(zip(allensbach_cols, attributes)), inplace=True)

allensbach.head(5)

Unnamed: 0,S01,S02,S05,EINW,SCHI,V17,V18,V19,BRS_final,ASKU_final,SOP_final,S03_rec,S04_rec,BL_rec,S06b_rec
0,2,73,2,8,3,2,1,3,2.5,5.0,7,2,8,5,3
1,2,24,1,5,3,2,2,2,1.833333333,4.0,5,3,3,8,3
2,1,51,1,2,3,1,2,4,1.4,3.666666667,2,3,3,8,4
3,1,57,1,5,3,2,2,2,4.0,4.666666667,6,5,2,9,3
4,1,64,2,4,4,1,3,4,3.0,3.666666667,4,2,1,9,3


- mehrere Messungen zu ein und demselben Attribut. Welchen Wert nehmen?? Beispiel: soziale Unterstuetzung.

### Preprocessing: GBS

In [6]:
gbs1 = pd.read_csv(os.path.join(path, 'data/GBS/gbs1.csv'))
gbs2 = pd.read_csv(os.path.join(path, 'data/GBS/gbs2.csv'))

mapping1 = pd.read_csv(os.path.join(path, 'data/GBS/gbs_map1.csv'),
                                    encoding = "ISO-8859-1", delimiter = ';')
mapping2 = pd.read_csv(os.path.join(path, 'data/GBS/gbs_map2.csv'),
                                    encoding = "ISO-8859-1", delimiter = ';')

# Rename 'GBS-CODE' column
key = 'GBS-CODE'
mapping1 = mapping1.rename(columns={'GBS Code': key})
mapping2 = mapping2.rename(columns={'GBS Code': key})
gbs1 = gbs1.rename(columns={'gbs_code': key})
gbs2 = gbs2.rename(columns={'GBS-Code': key})

# Merge GBS "Umfrage Wellen" and create dataframe
gbs = pd.merge(gbs1, gbs2, how='inner', on=key)
gbs = pd.merge(gbs, mapping1, how='left', on=key)
gbs = pd.merge(gbs, mapping2, how='left', on=key)

gbs = gbs[['am01', 'am02_01', 'am14', 'am17', 'am22',
         'lo01_04', 'br01_06',
         'sw02_01', 'sw02_02', 'sw02_03', 'sw02_04', 'sw02_05', 
         'sw02_06', 'sw02_07', 'sw02_08', 'sw02_09', 'sw02_10']]

del mapping1, mapping2, gbs1, gbs2

# TODO: Combine matching rows instead of selecting every 2nd.
gbs = gbs.iloc[::2, :]
gbs.head(5)

Unnamed: 0,am01,am02_01,am14,am17,am22,lo01_04,br01_06,sw02_01,sw02_02,sw02_03,sw02_04,sw02_05,sw02_06,sw02_07,sw02_08,sw02_09,sw02_10
0,männlich,1979-04-02,"Abitur, allgemeine oder fachgebundene Hochschu...",1.0,"Selbstständige (im Handel, im Gastgewerbe, im ...",2.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,2.0,1.0,3.0,3.0
2,weiblich,1945-10-29,Hochschulabschluss,1.0,"Beamter/Beamtin, Richter/-in, Berufssoldat/-in",4.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0
4,männlich,1949-09-06,Hochschulabschluss,1.0,keine passende Angabe,4.0,4.0,3.0,3.0,3.0,-1.0,-1.0,3.0,3.0,3.0,3.0,3.0
6,weiblich,1992-07-31,Hochschulabschluss,1.0,Angestellte(r),2.0,2.0,2.0,3.0,1.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0
8,weiblich,1984-03-01,Hauptschulabschluss/Volksschulabschluss,1.0,Angestellte(r),3.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


### Preprocessing: GESIS

We might need this later.

In [7]:
gesis1 = pd.read_csv(os.path.join(path, 'data/GESIS/ZA5665_a1_a11-a12_v22-0-0.dta.csv'), 
                     engine='python', encoding = "ISO-8859-1")
gesis2 = pd.read_csv(os.path.join(path, 'data/GESIS/ZA5665_a1_ca-cf_v22-0-0.dta.csv'), 
                     engine='python', encoding = "ISO-8859-1")
gesis3 = pd.read_csv(os.path.join(path, 'data/GESIS/ZA5665_a1_ba-bf_v22-0-0.dta.csv'), 
                     engine='python', encoding = "ISO-8859-1")
gesis4 = pd.read_csv(os.path.join(path, 'data/GESIS/ZA5665_a1_aa-ac_v22-0-0 (1).dta.csv'), 
                     engine='python', encoding = "ISO-8859-1")

gesis5 = pd.merge(gesis1, gesis2, how='inner', on='z000001a')
gesis6 = pd.merge(gesis3, gesis4, how='inner', on='z000001a')
gesis  = pd.merge(gesis5, gesis6, how='inner', on='z000001a')

del gesis1,gesis2,gesis3,gesis4,gesis5,gesis6

gesis = gesis[['a11d054a', 'a11d056b', 'a11d082b', 'a11d089c', 
              'a11d092a', 'acae051a', 'acae058a', 'acae062a', 
              'acae079a', 'acae089a', 'acae099a', 'acae085a', 
              'acae095a', 'acae105a', 'bdao099a']]
gesis.head(5)

Unnamed: 0,a11d054a,a11d056b,a11d082b,a11d089c,a11d092a,acae051a,acae058a,acae062a,acae079a,acae089a,acae099a,acae085a,acae095a,acae105a,bdao099a
0,Männlich,1946,"Fachhochschulreife, Fachoberschule",Nicht erwerbstätig,Missing by filter,Stimme zu,Weder noch,Missing by design,8,Missing by design,Missing by design,6,Missing by design,Missing by design,Mittelschicht
1,Weiblich,1974,Hauptschulabschluss,Vollzeiterwerbstätig,Angestellte(r),Weder noch,Stimme zu,Missing by design,Missing by design,5,Missing by design,Missing by design,3,Missing by design,Weiß ich nicht
2,Weiblich,1994,Schüler/-in,Nicht erwerbstätig,Missing by filter,Stimme zu,Lehne stark ab,Missing by design,9,Missing by design,Missing by design,6,Missing by design,Missing by design,Unit nonresponse
3,Männlich,1950,"Abitur, allgemeine oder fachgebundene Hochschu...",Vollzeiterwerbstätig,Angestellte(r),Stimme zu,Stimme stark zu,Missing by design,Missing by design,Missing by design,Item nonresponse,Missing by design,Missing by design,6,Keiner dieser Schichten
4,Männlich,1990,Realschulabschluss,Vollzeiterwerbstätig,Angestellte(r),Stimme zu,Missing by design,Lehne ab,8,Missing by design,Missing by design,5,Missing by design,Missing by design,Not reached


- "['a12d021a' 'a12d024a' 'dezg083a'] not in index"

### Preprocessing Allensbach 