# USCensus Datasets Creation

This notebook creates the various cleaned train and test datasets from the USCensus dataset.

In [None]:
import pandas as pd
import itertools
import numpy as np
from cleaner import ErrorCleaner
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Only missing-value repair is varied: per the dataset notes the missing values are all
# categorical, so there is no need to compare different numerical corrections.
mv_repair_methods = ['delete', 'mean-mode', 'mean-dummy']

# One list of options per cleaning dimension; the Cartesian product of these lists
# enumerates every cleaning configuration (here just one dimension, so 1-tuples).
training_list = [mv_repair_methods]
training_combinations = list(itertools.product(*training_list))

# One row per distinct configuration.
cleaning_setups_df = pd.DataFrame(
    training_combinations,
    columns=['mv_repair'],
).drop_duplicates()

In [None]:
# Renumber the rows 0..n-1 and discard the previous index rather than keeping it as a column.
cleaning_setups_df = cleaning_setups_df.reset_index(drop=True)

In [None]:
# Bare expression as the cell's last statement: the notebook renders the table of cleaning setups.
cleaning_setups_df

In [None]:
# Load the raw dataset; the path is relative to the notebook's current working directory.
uscensus_data = pd.read_csv('uscensus_raw.csv')

In [None]:
# Seed NumPy's global RNG. The splits below are already pinned by `random_state = i`,
# so this only matters for code that draws from the global RNG
# (presumably inside ErrorCleaner -- TODO confirm).
np.random.seed(0)


def _clean_to_record(dataset, cleaning_setup):
    """Clean *dataset* with a single setup and return the result as a row dict.

    Args:
        dataset: DataFrame handed to ``ErrorCleaner``.
        cleaning_setup: One row of ``cleaning_setups_df`` (a Series).

    Returns:
        dict containing the setup's own entries plus a ``'dataset'`` key that
        holds whatever ``ErrorCleaner.clean_all()`` returned.
    """
    cleaned_dataset = ErrorCleaner(dataset, cleaning_setup).clean_all()
    record = dict(cleaning_setup)
    record['dataset'] = cleaned_dataset
    return record


# Separating target and features does not depend on the split, so do it once up front.
target = uscensus_data["Age"].copy()
features = uscensus_data.drop("Age", axis = 1).copy()

# Layout of the result tables: the setup columns followed by the cleaned dataset itself.
result_columns = list(cleaning_setups_df.columns) + ['dataset']

# 20 independent 70/30 splits; `i` is both the split's random_state and the
# suffix of every file written for that split.
for i in range(20):
    X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size = 0.3, random_state = i)

    # Re-attach the target so each split is stored as a single table.
    XY_train = X_train.copy()
    XY_train['Age'] = Y_train

    XY_test = X_test.copy()
    XY_test['Age'] = Y_test

    # Persist the uncleaned ("dirty") splits before any cleaning runs.
    XY_train.to_csv(f'uscensus_dirty_train_set_{i}.csv', index = False)
    XY_test.to_csv(f'uscensus_dirty_test_set_{i}.csv', index = False)

    # Collect one row per setup and build each frame once at the end.
    # DataFrame.append was removed in pandas 2.0, so rows can no longer be
    # appended to a DataFrame one at a time.
    train_records = []
    test_records = []
    for _, cleaning_setup in cleaning_setups_df.iterrows():
        # Keep the train-then-test order per setup so that any draws from the
        # seeded global RNG happen in the same sequence as before.
        # The test split is passed to its own ErrorCleaner, separately from the train split.
        train_records.append(_clean_to_record(XY_train, cleaning_setup))
        test_records.append(_clean_to_record(XY_test, cleaning_setup))

    cleaned_train_df = pd.DataFrame(train_records, columns = result_columns)
    cleaned_test_df = pd.DataFrame(test_records, columns = result_columns)

    cleaned_train_df.to_pickle(f'uscensus_cleaned_train_df_{i}.pkl')
    cleaned_test_df.to_pickle(f'uscensus_cleaned_test_df_{i}.pkl')