In [11]:
import numpy as np
import pandas as pd

In [23]:
### Import data
df = pd.read_csv("data/stem.csv")

### Select which columns to keep
columns_to_keep = ['TEACHSTEM', 'WORKTYPE_FINAL', 'Xparent', 'ppagect4', 'EDUC4CAT', 'RACE_col',
                   'PPGENDER', 'HH_INCOME_col', 'PPREG4', 'ETHN1', 'IDEO', 'HARASS1', 'HARASS2',
                   'HARASS3', 'TECH3', 'TECH6']

### Filter data: keep only rows with EMPLOYED == 1, restricted to the selected columns
# .loc with an explicit column list raises KeyError when a listed column is
# absent from the CSV. Building the frame with pd.DataFrame(df, columns=...)
# would instead create an all-NaN column for a missing name, and a later
# dropna() would then silently discard every row.
# EMPLOYED is not part of columns_to_keep, so it is dropped by the selection.
df = df.loc[df['EMPLOYED'] == 1, columns_to_keep]

### Purge empty values and 9s
# Blank strings must be turned into NaN *before* the float cast: astype(float)
# raises ValueError on ' ' or '', so replacing them after the cast can never
# take effect.
df = df.replace([' ', ''], np.nan).astype(float)

# 9 is removed from every kept column, exactly as before.
# NOTE(review): presumably 9 is a "refused / no answer" code in all of these
# columns -- confirm against the survey codebook.
df = df.replace(9, np.nan)

# Drop every row that has at least one missing answer.
df = df.dropna()

### Cast everything back to integers (safe now that no NaN remains)
df = df.astype(int)

### Remapping values for columns with valuable adjacency
# Each entry maps a column name to its {old code: new code} table.
# Note: an earlier ETHN1 table of {1: 2, 2: 0, 3: 1} was tried as a fix for
# the UFUNC error and did not work.
adj_recodes = {
    'ETHN1': {1: 1, 2: -1, 3: 0},
    'HARASS1': {1: 2, 2: 1, 3: 0},
    'HARASS2': {1: 2, 2: 1, 3: 0},
}
adj_cols = list(adj_recodes)

### Remapping values for label columns
# Binary alternative (not used): {1: 1, 2: 1, 3: 0} for both TECH3 and TECH6.
label_recodes = {
    'TECH3': {1: 2, 2: 1, 3: 0},
    'TECH6': {1: 2, 2: 1, 3: 0},
}
label_cols = list(label_recodes)

# Apply every recode table to its own column, modifying df in place.
for column, recode in {**adj_recodes, **label_recodes}.items():
    df.replace({column: recode}, inplace=True)

### Creating 1hot encoding for the rest
# Every kept column that is neither an adjacency column nor a label column
# is expanded into one 0/1 indicator column per distinct value.
onehot_cols = [elem for elem in columns_to_keep if elem not in adj_cols+label_cols]
print(onehot_cols)

# Build all indicator columns first (vectorized comparison instead of a
# per-row Python loop), then attach them in a single concat so the frame is
# not grown one column at a time.
indicator_cols = {}
for col in onehot_cols:
    for value in set(df[col]):  # distinct codes present in this column
        indicator_cols[f'{col}-{value}'] = (df[col] == value).astype('int64')

# Indicator columns are appended after the remaining columns; the source
# categorical columns are removed.
df = pd.concat(
    [df.drop(columns=onehot_cols), pd.DataFrame(indicator_cols, index=df.index)],
    axis=1,
)

# Note: converting everything to float (df.astype(float)) was tried as a fix
# for the UFUNC error and did not work.

### Export data to CSV
df.to_csv('data/stem_processed.csv', index=False)

df

['TEACHSTEM', 'WORKTYPE_FINAL', 'Xparent', 'ppagect4', 'EDUC4CAT', 'RACE_col', 'PPGENDER', 'HH_INCOME_col', 'PPREG4', 'IDEO', 'HARASS3']


Unnamed: 0,ETHN1,HARASS1,HARASS2,TECH3,TECH6,TEACHSTEM-8,TEACHSTEM-1,TEACHSTEM-2,WORKTYPE_FINAL-1,WORKTYPE_FINAL-2,...,PPREG4-2,PPREG4-3,PPREG4-4,IDEO-1,IDEO-2,IDEO-3,IDEO-4,IDEO-5,HARASS3-1,HARASS3-2
0,0,0,0,2,2,1,0,0,1,0,...,0,0,1,0,0,0,0,1,0,1
1,0,0,1,2,1,1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
2,-1,0,0,2,2,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,1
4,1,1,0,1,1,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,1
6,0,0,0,0,1,1,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4908,1,0,1,2,1,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,1
4910,0,1,0,1,1,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
4911,0,0,0,1,1,1,0,0,1,0,...,0,1,0,0,0,0,1,0,0,1
4912,-1,0,0,2,1,1,0,0,1,0,...,0,0,1,0,0,0,0,1,0,1


In [29]:
# Class counts per label column, plus the share (in percent, one decimal) of
# the most frequent class -- i.e. the accuracy of always predicting that class.
unique, counts = np.unique(df['TECH3'], return_counts=True)
majority_pct = np.round(1000 * counts.max() / counts.sum()) / 10
print(f'3: {counts}, min accuracy={majority_pct})')

unique, counts = np.unique(df['TECH6'], return_counts=True)
majority_pct = np.round(1000 * counts.max() / counts.sum()) / 10
print(f'6: {counts}, min accuracy={majority_pct})')


3: [1109 1674 1660], min accuracy=37.7)
6: [1378 1739 1326], min accuracy=39.1)
