In [None]:
# Move the working directory up one level; later cells read and write files
# under 'data/' relative to it. Run this cell once per kernel session: the
# path is relative, so every re-run climbs one more level.
# The explicit % prefix keeps the magic working when automagic is disabled
# and when the cell holds more than one line.
%cd ..

In [None]:
# Execute __init__.py inside the notebook namespace. Later cells use pd, np,
# plt, sns and stats without importing them, so presumably this script
# provides those names -- TODO confirm.
# The explicit % prefix keeps the magic working when automagic is disabled
# and when the cell holds more than one line.
%run __init__.py

In [None]:
# Column names applied to the headerless CSV, in file order.
# The last entry is the prediction target.
adult_feature_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'target_class',
]

In [None]:
# Read the headerless CSV and attach the column names.
adult_df = pd.read_csv('data/adult.data.csv', header=None)
adult_df.columns = adult_feature_names

# Separate the target column from the feature columns.
adult_labels_srs = adult_df['target_class']
adult_data_df = adult_df.drop(labels=['target_class'], axis=1)


In [None]:
# Distinct raw values of the target column, in order of first appearance.
pd.unique(adult_labels_srs)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integers.
# The fitted encoder is kept so the integer -> original label mapping stays
# available through adult_label_encoder.classes_.
adult_label_encoder = LabelEncoder()
adult_labels_srs = pd.Series(
    adult_label_encoder.fit_transform(adult_labels_srs),
    # Keep the original row index so the labels stay aligned with
    # adult_data_df even if the frame's index is not a default RangeIndex.
    index=adult_labels_srs.index,
)
adult_labels_srs.unique()

In [None]:
# Histogram of the encoded target over the full data set.
label_hist_fig, label_hist_ax = plt.subplots()
label_hist_ax.hist(adult_labels_srs);

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 77/23 split with a fixed seed. Each piece is copied so that later
# column assignments act on independent objects rather than slices of the
# original frame.
(adult_train_data,
 adult_test_data,
 adult_train_labels,
 adult_test_labels) = [
    split_part.copy()
    for split_part in train_test_split(
        adult_data_df,
        adult_labels_srs,
        test_size=0.23,
        random_state=42,
        stratify=adult_labels_srs,
    )
]


In [None]:
# Compare the label distribution of the train and test splits:
# left panel = raw histograms, right panel = seaborn distplots.
fig, (hist_ax, dist_ax) = plt.subplots(1, 2, figsize=(12, 4))
for split_name, split_labels in (('train', adult_train_labels),
                                 ('test', adult_test_labels)):
    hist_ax.hist(split_labels, label=split_name)
    sns.distplot(split_labels, label=split_name, ax=dist_ax)
hist_ax.legend()
dist_ax.legend()

In [None]:
# Shapes of the four split pieces: train data, test data, train labels, test labels.
tuple(
    split_part.shape
    for split_part in (adult_train_data, adult_test_data,
                       adult_train_labels, adult_test_labels)
)

In [None]:
# Peek at a few rows of each split. The fixed random_state makes the
# displayed rows identical on every run of the notebook.
display(adult_train_data.sample(4, random_state=42))
display(adult_test_data.sample(4, random_state=42))

In [None]:
# Count the ' ?' placeholder in every object-dtype column of the training split.
for col in adult_train_data.columns:
    if adult_train_data[col].dtype == object:
        # Raw string: '\?' is a regex escape, not a valid Python string escape.
        # na=False makes rows that are NaN count as non-matches.
        print(col, adult_train_data[col].str.contains(r' \?', na=False).sum())


In [None]:
# Count the ' ?' placeholder in every object-dtype column of the test split.
for col in adult_test_data.columns:
    if adult_test_data[col].dtype == object:
        # Raw string: '\?' is a regex escape, not a valid Python string escape.
        # na=False makes rows that are NaN count as non-matches.
        print(col, adult_test_data[col].str.contains(r' \?', na=False).sum())


In [None]:
# Columns in which the ' ?' placeholder is replaced with NaN further down.
columns_with_na_vals = [
    'workclass',
    'occupation',
    'native-country',
]

In [None]:
# Turn the ' ?' placeholder into a real missing value (NaN) in both splits.
for split_frame in (adult_train_data, adult_test_data):
    for col in columns_with_na_vals:
        split_frame[col] = split_frame[col].replace(' ?', np.nan)

In [None]:
# Re-count the ' ?' placeholder in the training split after it was replaced
# with NaN in the columns listed in columns_with_na_vals.
for col in adult_train_data.columns:
    if adult_train_data[col].dtype == object:
        # Raw string: '\?' is a regex escape, not a valid Python string escape.
        # na=False makes the NaN rows count as non-matches.
        print(col, adult_train_data[col].str.contains(r' \?', na=False).sum())


In [None]:
# Re-count the ' ?' placeholder in the test split after it was replaced
# with NaN in the columns listed in columns_with_na_vals.
for col in adult_test_data.columns:
    if adult_test_data[col].dtype == object:
        # Raw string: '\?' is a regex escape, not a valid Python string escape.
        # na=False makes the NaN rows count as non-matches.
        print(col, adult_test_data[col].str.contains(r' \?', na=False).sum())


In [None]:
# Number of missing values per object-dtype column in the training split.
for col in adult_train_data.columns:
    if adult_train_data[col].dtype == object:
        # Vectorised count; the builtin sum() would iterate the Series in Python.
        print(col, adult_train_data[col].isnull().sum())


In [None]:
# Number of missing values per object-dtype column in the test split.
for col in adult_test_data.columns:
    if adult_test_data[col].dtype == object:
        # Vectorised count; the builtin sum() would iterate the Series in Python.
        print(col, adult_test_data[col].isnull().sum())


In [None]:
# For each column with missing values, collect the index labels of the
# training rows where that column is null.
(workclass_null_indices,
 occupation_null_indices,
 native_country_null_indices) = (
    set(adult_train_data.loc[adult_train_data[null_col].isnull()].index)
    for null_col in ('workclass', 'occupation', 'native-country')
)

In [None]:
# Fraction of training rows that have a null in at least one of the three columns.
rows_with_any_null = workclass_null_indices.union(
    occupation_null_indices,
    native_country_null_indices,
)
len(rows_with_any_null) / len(adult_train_data)

In [None]:
# Hand-assigned measurement type of each feature column, in column order.
actual_types = [
    'continuous',   # age
    'categorical',  # workclass
    'continuous',   # fnlwgt
    'categorical',  # education
    'continuous',   # education-num
    'categorical',  # marital-status
    'categorical',  # occupation
    'categorical',  # relationship
    'categorical',  # race
    'categorical',  # sex
    'continuous',   # capital-gain
    'continuous',   # capital-loss
    'continuous',   # hours-per-week
    'categorical',  # native-country
]

In [None]:
# Label each hand-assigned type with the training column it belongs to.
actual_types = pd.Series(data=actual_types, index=adult_train_data.columns)


In [None]:
# Side-by-side table: one row per feature, showing the dtype pandas inferred
# in each split next to the hand-assigned measurement type.
pd.concat(
    [adult_train_data.dtypes, adult_test_data.dtypes, actual_types],
    axis=1,
    keys=['train_set', 'test_set', 'actual_types'],
)


In [None]:
# Split the feature columns into numeric and categorical (object-dtype) lists.
adult_dtypes = adult_train_data.dtypes
# select_dtypes('number') matches every numeric dtype. Comparing the dtypes
# against the builtin `int` only matches the platform's default integer width,
# which yields an empty list where that default is 32-bit while pandas parsed
# the columns as int64, and it would also skip any float column.
adult_numeric_features = list(adult_train_data.select_dtypes(include='number').columns)
adult_categorical_features = list(adult_dtypes[adult_dtypes == 'object'].index)
adult_numeric_features, adult_categorical_features

In [None]:
# Summary statistics of the columns reported by describe(), one row per
# column, extended with the sample skewness from scipy.stats.skew.
adult_train_stats = adult_train_data.describe().T
skew_values = [
    stats.skew(adult_train_data[stat_col])
    for stat_col in adult_train_stats.index
]
adult_train_stats['skew'] = skew_values
# Show the table without the count and quartile columns.
adult_train_stats.drop(labels=['count', '25%', '50%', '75%'], axis=1)

In [None]:
def distplot_by_label(feature, target):
    """Overlay one seaborn distplot of `feature` per distinct value of `target`.

    Parameters
    ----------
    feature : values to plot; indexed with a boolean mask built from `target`.
    target : labels used to group `feature`; must provide `.unique()`.

    All plots are drawn through `sns.distplot` without an explicit `ax`
    argument. Returns None.
    """
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11.
    for class_value in target.unique():
        class_mask = target == class_value
        sns.distplot(feature[class_mask])
        
        
def ANOVA_by_feature(feature, target):
    """Run a one-way ANOVA of `feature` grouped by the distinct values of `target`.

    Parameters
    ----------
    feature : values to test; indexed with a boolean mask built from `target`.
    target : labels that define the groups; must provide `.unique()`.

    Returns
    -------
    The result of `scipy.stats.f_oneway` called with one sample per distinct
    label (F statistic and p-value).
    """
    samples_per_label = []
    for class_value in target.unique():
        samples_per_label.append(feature[target == class_value])
    return stats.f_oneway(*samples_per_label)

In [None]:
# One subplot per numeric feature: distributions split by label, plus the
# one-way ANOVA p-value of that feature against the label printed as text.
fig = plt.figure(figsize=(20, 6))
n = len(adult_numeric_features)
for position, feature in enumerate(adult_numeric_features, start=1):
    fig.add_subplot(1, n, position)
    feature_values = adult_train_data[feature]
    distplot_by_label(feature_values, adult_train_labels)
    anova_result = ANOVA_by_feature(feature_values, adult_train_labels)
    print("{:20} ANOVA p-value: {}".format(feature, anova_result.pvalue))

In [None]:
# Training features with the encoded label attached as an extra 'label' column
# (assign returns a new frame, so adult_train_data itself is left untouched).
adult_train_with_label = adult_train_data.assign(label=adult_train_labels)

# Contingency table of workclass against the label, with readable column names.
ct = pd.crosstab(adult_train_with_label['workclass'],
                 adult_train_with_label['label'])
ct.columns = ['<=$50k','>$50K']
ct

In [None]:
# Row counts per workclass, split by label.
fig = plt.figure(figsize=(20,6))
# Pass the column as x=...: recent seaborn releases treat the first positional
# argument as `data`, so the positional form no longer selects the x column.
sns.countplot(x='workclass', hue='label', data=adult_train_with_label)

In [None]:
# Row counts per education level, split by label.
fig = plt.figure(figsize=(20,6))
# Pass the column as x=...: recent seaborn releases treat the first positional
# argument as `data`, so the positional form no longer selects the x column.
sns.countplot(x='education', hue='label', data=adult_train_with_label)

In [None]:
# Persist the four split pieces as pickle files under data/.
pickle_targets = [
    (adult_train_data, 'data/adult_train_data.p'),
    (adult_test_data, 'data/adult_test_data.p'),
    (adult_train_labels, 'data/adult_train_labels.p'),
    (adult_test_labels, 'data/adult_test_labels.p'),
]
for split_obj, pickle_path in pickle_targets:
    split_obj.to_pickle(pickle_path)