In [1]:
import pandas as pd
pd.options.display.max_columns = 500
import numpy as np

import datetime
import warnings
from tqdm import tqdm_notebook as tqdm

# Load Remote Data

In [2]:
headers = ['age',
            'workclass',
            'fnlwgt',
            'education',
            'education-num',
            'marital-status',
            'occupation',
            'relationship',
            'race',
            'sex',
            'capital-gain',
            'capital-loss',
            'hours-per-week',
            'native-country',
            'above-50k']

In [12]:
ADULT = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                   names=headers, index_col=False, na_values=' ?')

In [23]:
print(f'{str(datetime.timedelta(seconds=(end - start)))}')

0:00:41.226311


In [13]:
ADULT.dropna(inplace=True)

In [14]:
ADULT.shape

(30162, 15)

In [15]:
ADULT.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,above-50k
16715,41,Private,183850,HS-grad,9,Divorced,Sales,Not-in-family,White,Male,0,0,50,United-States,<=50K
9444,50,Private,120914,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
8883,41,Self-emp-not-inc,200574,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1977,60,United-States,>50K
24206,43,Private,111949,HS-grad,9,Married-civ-spouse,Other-service,Wife,White,Female,0,0,35,United-States,<=50K
24284,57,Self-emp-not-inc,56480,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,1,United-States,<=50K


In [16]:
to_dummy = []
for col in range(ADULT.shape[1]):
    if ADULT.iloc[:, col].dtype != int:
        to_dummy.append(col)
        u = ADULT.iloc[:, col].unique()
        ADULT.iloc[:, col] = ADULT.iloc[:, col].replace(u, np.arange(len(u)))
        
to_dummy = to_dummy[:-1]

In [17]:
ADULT.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,above-50k
13297,39,2,29814,1,9,1,10,1,0,0,0,0,40,0,0
7333,21,2,68358,5,10,0,5,3,0,1,0,0,20,0,0
7293,34,2,118941,2,7,0,4,0,0,1,0,0,40,38,0
23807,25,2,95691,1,9,0,4,4,0,1,0,0,30,13,0
30469,27,1,259873,1,9,1,4,1,1,0,0,0,60,0,1


In [21]:
from sklearn.preprocessing import StandardScaler

In [27]:
ss = StandardScaler()

ADULT_ss = pd.DataFrame(ss.fit_transform(ADULT), columns=headers)
ADULT_ss.iloc[:, -1] = (ADULT_ss.iloc[:, -1] > 0).astype(int)

In [28]:
ADULT_ss.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,above-50k
29148,0.118931,2.022591,0.984515,-0.984162,1.128918,-0.06165,-0.47073,-0.365912,-0.345861,-0.692806,-0.147445,-0.218586,-1.747213,-0.239238,0
25609,-0.490154,2.022591,-1.613723,0.740115,0.736754,-0.06165,1.860179,-0.365912,-0.345861,-0.692806,-0.147445,-0.218586,1.257849,1.175764,1
12169,0.195067,2.022591,1.428172,-0.696782,-0.439738,0.75968,-1.34482,-1.064258,-0.345861,1.443405,-0.147445,-0.218586,-0.077734,-0.239238,0
16738,-0.185611,3.092374,2.813892,1.027494,-2.400559,-0.88298,1.568816,1.729123,1.286922,-0.692806,-0.147445,-0.218586,-0.077734,-0.239238,0
7869,-0.718561,-0.116973,0.163464,-0.696782,-0.439738,-0.06165,0.986088,-0.365912,-0.345861,-0.692806,-0.147445,-0.218586,-0.077734,-0.239238,0


In [29]:
ADULT.iloc[:, -1].mean()

0.24892248524633645

In [36]:
total = ADULT.shape[0]

shuffled = ADULT.sample(frac=1)

split = int(total * 0.2)

print(f'TRAINING SIZE: {split}, TESTING SIZE: {total - split}')

obj_cols = shuffled.iloc[:, to_dummy].astype(object)
shuffled_ohe = pd.get_dummies(obj_cols, prefix=np.array(headers)[to_dummy])
shuffled_ohe = pd.concat([shuffled.drop(shuffled.columns[to_dummy], axis=1).iloc[:, :-1], shuffled_ohe], axis=1)

ss = StandardScaler()

shuffled_ohe = ss.fit_transform(shuffled_ohe)

X_train_val = shuffled.iloc[:split, :-1]
X_train_val_ohe = shuffled_ohe[:split, :]
y_train_val = shuffled.iloc[:split, -1]

X_test = shuffled.iloc[split:, :-1]
X_test_ohe = shuffled_ohe[split:, :]
y_test = shuffled.iloc[split:, -1]

TRAINING SIZE: 6032, TESTING SIZE: 24130


In [37]:
X_train_val.shape, X_train_val_ohe.shape, y_train_val.shape, X_test.shape, X_test_ohe.shape, y_test.shape

((6032, 14), (6032, 104), (6032,), (24130, 14), (24130, 104), (24130,))

In [40]:
X_train_val_ohe[:5, :10]

array([[ 1.0325595 ,  1.16845945, -0.4397382 , -0.14744462, -0.21858598,
         1.17437511, -0.21043313,  3.32710503, -1.68214415, -0.17964832],
       [-0.94696807, -0.37141865,  1.12891838, -0.14744462, -0.21858598,
         0.75700537, -0.21043313, -0.3005616 , -1.68214415, -0.17964832],
       [ 0.11893139,  3.13098292, -0.04757405, -0.14744462, -0.21858598,
        -0.16120806, -0.21043313, -0.3005616 ,  0.59447937, -0.17964832],
       [ 0.34733841,  2.79284201, -0.04757405,  0.40128306, -0.21858598,
        -0.24468201, -0.21043313, -0.3005616 ,  0.59447937, -0.17964832],
       [-0.94696807, -0.84018749, -0.4397382 , -0.14744462, -0.21858598,
         0.59005747, -0.21043313, -0.3005616 ,  0.59447937, -0.17964832]])