In [12]:
import numpy as np
import pandas as pd

In [65]:
### Import data
df = pd.read_csv("data/stem.csv")

### Select which columns to keep
columns_to_keep = ['TEACHSTEM','WORKTYPE_FINAL', 'Xparent', 'ppagect4', 'EDUC4CAT', 'RACE_col', 
                   'PPGENDER', 'HH_INCOME_col', 'PPREG4', 'IDEO', 'HARASS1', 'HARASS2', 'HARASS3', 
                   'TECH3', 'TECH6', 'ETHN1']

### Filter data
# Keep only the rows with EMPLOYED == 1, restricted to the selected columns.
# EMPLOYED is not in columns_to_keep, so it is dropped by the same selection.
# .copy() yields an independent frame, so the later steps never operate on a
# slice of the raw data.
df = df.loc[df['EMPLOYED'] == 1, columns_to_keep].copy()

### Cast everything as integers (empty --> NaN)
# DataFrame.replace returns a new frame, so its result has to be assigned.
# Cells that are empty or whitespace-only become NaN before the numeric parse.
df = df.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric)
# A plain int column cannot hold NaN: fall back to the nullable Int64 dtype
# only when missing values are actually present, so complete data keeps the
# ordinary integer dtype.
df = df.astype('Int64' if df.isna().any().any() else int)

### Remapping values for columns with valuable adjacency
adj_cols = ['ETHN1', 'HARASS1', 'HARASS2']
adj_maps = {
    'ETHN1': {1: 1, 2: -1, 3: 0},
    'HARASS1': {1: 2, 2: 1, 3: 0},
    'HARASS2': {1: 2, 2: 1, 3: 0},
}
# NOTE(review): codes that are not listed in a mapping pass through unchanged
# (the rendered output shows a 9 in HARASS2) -- confirm this is intended for
# columns that are otherwise treated as ordered.
df = df.replace(adj_maps)

### Creating 1hot encoding for the rest
onehot_cols = [elem for elem in columns_to_keep if elem not in adj_cols]

# One 0/1 indicator column per (column, observed value) pair, named
# '<column>-<value>'. Missing values get no indicator of their own and are 0
# in every indicator of that column. Values are visited in set() iteration
# order, as before, so the layout of the exported columns does not change.
indicators = {}
for col in onehot_cols:
    for value in set(df[col].dropna()):
        indicators[f'{col}-{value}'] = (df[col] == value).fillna(False).astype(int)

# Attach all indicators in one concat (instead of inserting them one at a
# time) and drop the source columns they replace.
df = pd.concat(
    [df.drop(columns=onehot_cols), pd.DataFrame(indicators, index=df.index)],
    axis=1,
)

### Export data to CSV
df.to_csv('data/stem_processed.csv')

df

Unnamed: 0,HARASS1,HARASS2,ETHN1,TEACHSTEM-8,TEACHSTEM-1,TEACHSTEM-2,WORKTYPE_FINAL-1,WORKTYPE_FINAL-2,Xparent-1,Xparent-2,...,HARASS3-2,HARASS3-9,TECH3-1,TECH3-2,TECH3-3,TECH3-9,TECH6-1,TECH6-2,TECH6-3,TECH6-9
0,0,0,0,1,0,0,1,0,0,1,...,1,0,1,0,0,0,1,0,0,0
1,0,1,0,1,0,0,1,0,1,0,...,1,0,1,0,0,0,0,1,0,0
2,0,0,-1,1,0,0,0,1,0,1,...,1,0,1,0,0,0,1,0,0,0
3,1,9,0,0,1,0,1,0,0,1,...,0,0,1,0,0,0,0,1,0,0
4,1,0,1,1,0,0,0,1,0,1,...,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4909,1,1,0,1,0,0,1,0,1,0,...,0,1,0,1,0,0,0,1,0,0
4910,1,0,0,1,0,0,1,0,0,1,...,1,0,0,1,0,0,0,1,0,0
4911,0,0,0,1,0,0,1,0,1,0,...,1,0,0,1,0,0,0,1,0,0
4912,0,0,-1,1,0,0,1,0,1,0,...,1,0,1,0,0,0,0,1,0,0
