In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Load the preprocessed dataset and show how many rows it contains
data_path = 'dataset/init_preprocessed_data_without_index.csv'
df = pd.read_csv(data_path)
df.shape[0]

In [None]:
# Tally how many columns there are of each dtype
column_dtypes = df.dtypes
column_dtypes.value_counts()

## Drop rows where PWSTATE2 is 0

In [None]:
# Count the rows whose PWSTATE2 is 0 and report their share of the dataset.
# Both names are reused by the filtering cell that follows.
original_len = len(df)
num_nostate = df["PWSTATE2"].eq(0).values.sum()
print(f"{num_nostate} ({num_nostate/original_len:.2%}) of the rows have no state")

In [None]:
# Keep only the rows whose PWSTATE2 is not 0
has_state = df["PWSTATE2"].ne(0)
df = df.loc[has_state].copy()

# Sanity check: exactly the rows counted earlier were removed
assert len(df) == original_len - num_nostate
print(f"Removed {num_nostate} rows with no state. {len(df)} rows remain.")

In [None]:
# Remove columns that carry a single value now that the no-state rows are gone

origNumCols = len(df.columns)
droppedCols = []

for col in df.columns:
    # unique() keeps NaN as its own entry, so a fully constant column has length 1
    distinctValues = df[col].unique()
    numDistinct = len(distinctValues)

    if numDistinct == 1:
        print(f"Dropping column {col} since it has only one value: {distinctValues[0]}")
        droppedCols.append(col)
        continue

    # One real value plus missing values: not dropped automatically, only reported
    if numDistinct == 2 and df[col].isna().values.any():
        print(f"Warning: Column {col} has two values but you may still want to drop it: {distinctValues[0]} and {distinctValues[1]}")

df.drop(columns=droppedCols, inplace=True)

# Sanity check: the column count shrank by exactly the number of dropped columns
assert len(df.columns) == origNumCols - len(droppedCols)
print(f"Dropped {len(droppedCols)} columns. {len(df.columns)} columns remain.")

## Drop additional unused columns

In [None]:
# Additional columns to remove. Only the ones still present are dropped, so the
# cell can be re-run without a KeyError and does not fail if an earlier cell
# already removed one of them (e.g. because it held a single value).
unusedCols = ['YRMARR', 'YRNATUR']
presentUnusedCols = [name for name in unusedCols if name in df.columns]
df.drop(columns=presentUnusedCols, inplace=True)
print(f"Dropped {len(presentUnusedCols)} unused columns: {presentUnusedCols}")

## Test-train split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows as the test set; the fixed random_state makes the
# split identical on every run
train, test = train_test_split(df, random_state=0, test_size=0.15)

In [None]:
# Number of rows in the training split
train.shape[0]

In [None]:
# Number of rows in the test split
test.shape[0]

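Next, we check that each column in `train` and `test` has the same number of unique values as the original column in `df`. For columns with exactly two unique values, we also check that the share of each value in `train` and `test` differs by no more than 0.01.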

In [None]:
# Columns that are excluded from the consistency check
uncheckedCols = ['SERIAL', 'PERNUM', 'HHWT', 'CLUSTER', 'STRATA', 'PERWT', 'UHRSWORK', 'TRANTIME', 'INCWAGE_CPIU_2010']

# Largest allowed absolute difference between the train and test share of a value
ratioTolerance = .01


def _share_of(series, value):
    """Return the fraction of rows in `series` equal to `value` (NaN matches NaN)."""
    matches = series.isna() if pd.isna(value) else series.eq(value)
    return matches.mean()


for col in df.columns:
    if col in uncheckedCols:
        continue

    originalUnique = df[col].unique()
    trainUnique = train[col].unique()
    testUnique = test[col].unique()

    # train and test are both drawn from df, so an equal number of unique values
    # means no value of the column is missing from either split
    if(len(originalUnique) != len(trainUnique) or len(originalUnique) != len(testUnique)):
        print(f"Warning: Values of {col} in train ({trainUnique}) and test ({testUnique}) are not the same as in the original ({originalUnique})")
        raise Exception("Values of column in train and test are not the same as in the original")

    if(len(originalUnique) == 2):
        # Compare the share of each value by value. Pairing the two value_counts()
        # results positionally is wrong: they are ordered by frequency, so when the
        # majority value differs between the splits the shares of *different*
        # values get compared and a real imbalance goes unnoticed.
        for value in originalUnique:
            trainRatio = _share_of(train[col], value)
            testRatio = _share_of(test[col], value)
            if(testRatio > trainRatio + ratioTolerance or testRatio < trainRatio - ratioTolerance):
                print(f"Warning: Ratio of {col} values in train ({trainRatio}) and test ({testRatio}) are not the same")
                raise Exception("Ratio of values of column in train and test are not the same as in the original")

## Save the splits

In [None]:
# Display the row labels carried by the training split
train.index

In [None]:
# Write the training split to disk without the index column
train_path = 'dataset/train_split_partially_preprocessed.csv'
train.to_csv(train_path, index=False)

In [None]:
# Write the test split to disk without the index column
test_path = 'dataset/test_split_partially_preprocessed.csv'
test.to_csv(test_path, index=False)