# Libraries and setup variables

In [1]:
# `display` and `HTML` are imported from the public `IPython.display` module:
# importing `display` from `IPython.core.display` is deprecated since
# IPython 7.14 and is no longer available in newer IPython releases.
from IPython.display import display, HTML

# Inject a CSS rule so the notebook container uses the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils import *

# Render matplotlib figures inline, directly below the cell that produces them
%matplotlib inline
# Apply seaborn's default theme to all subsequent plots
sns.set()

### Loading the preprocessed dataset

Here we'll load the train and test data into a single dataframe and run a series of initial exploratory analyses.

In [47]:
# Read both interim splits, then stack them into one frame with a fresh
# 0..n-1 index (ignore_index=True)
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/interim/preprocessed_train.csv',
                     '../data/interim/preprocessed_test.csv')
)
df = pd.concat([df_train, df_test], ignore_index=True)

In [48]:
# Collapse several categorical columns into two-valued groups, drop unused
# columns and encode the train/test flag as 1/0.
#
# NOTE: this cell overwrites `df` in place, so it is not idempotent. On a second
# run the already-grouped labels no longer match the raw category names (e.g.
# every row would become 'no-college') and the column drop raises a KeyError.
# Re-run the loading cell before re-running this one.
#
# The groupings use vectorised comparisons (isin / eq / where) instead of a
# per-row `.apply(lambda ...)`; the resulting values are the same, including
# for missing values, which always fall into the fallback group.

# Education level, we'll group undergrad and postgrad
df['education'] = (
    df['education']
    .isin(['Undergraduate', 'Postgraduate'])
    .map({True: 'college', False: 'no-college'})
)

# Just married or not married categories, simpler
df['marital stat'] = (
    df['marital stat']
    .eq('Married')
    .map({True: 'married', False: 'not married'})
)

# Grouping occupations
well_paid_occupations = ['Professional specialty', 'Executive admin and managerial', 'Sales']
df['major occupation code'] = (
    df['major occupation code']
    .isin(well_paid_occupations)
    .map({True: 'well-paid occ', False: 'not well-paid occ'})
)

# Grouping industries
well_paid_industries = ['Other professional services', 'Manufacturing-durable goods', 'Finance insurance and real estate']
df['major industry code'] = (
    df['major industry code']
    .isin(well_paid_industries)
    .map({True: 'well-paid ind', False: 'not well-paid ind'})
)

# Separating householders from other types: keep 'Householder', relabel the rest.
df['detailed household summary in household'] = (
    df['detailed household summary in household']
    .where(df['detailed household summary in household'].eq('Householder'), 'Not householder')
)

# Private workers vs. everyone else
# (original author's note: assuming 'Private' might be something like self employed — unverified)
df['class of worker'] = df['class of worker'].where(df['class of worker'].eq('Private'), 'Other')

# Grouping joint tax filers
df['tax filer stat'] = df['tax filer stat'].where(df['tax filer stat'].eq('Joint both under 65'), 'Other')

# Dropping the detailed industry and occupation recodes (these are numbers but not
# really numeric), together with the year and the two veterans-related columns
drop_columns = ['detailed industry recode', 'detailed occupation recode', 'year', 'veterans benefits',
                "fill inc questionnaire for veteran's admin"]
df = df.drop(columns=drop_columns)

# Train and test flag: 1 for rows whose 'set' is 'train', 0 for everything else
df['set'] = df['set'].eq('train').astype('int64')



In [49]:
# One-hot encode the dataframe with the project helper `one_hot_df`
# (not defined in this notebook; presumably provided by `from utils import *`).
# NOTE(review): the list presumably names the columns to leave un-encoded
# (the target and the train/test flag) — confirm against utils.one_hot_df.
df = one_hot_df(df, ['salary', 'set'])

In [50]:
# Recover the two partitions from the 1/0 `set` flag and write each one to the
# processed folder, leaving the dataframe index out of the CSV files
df_train = df.loc[df['set'].eq(1)]
df_train.to_csv('../data/processed/train.csv', index=False)

df_test = df.loc[df['set'].eq(0)]
df_test.to_csv('../data/processed/test.csv', index=False)