# Download data

This notebook is a prerequisite for the subsequent labs. It downloads the [Census dataset](https://archive.ics.uci.edu/ml/datasets/adult) to a local directory or a Cloud Storage location.

In [None]:
import os
from tensorflow.io import gfile

In [None]:
# Root location for everything this notebook writes.
# It may also be set to a GCS location.
WORKSPACE = "workspace"

# The downloaded CSV files are placed in <WORKSPACE>/raw_data.
DATA_DIR = os.path.join(WORKSPACE, "raw_data")

### 1. Create workspace and download data

In [None]:
# Start from a clean slate: any workspace left by an earlier run is deleted
# (recursively) before a new one is created.
workspace_already_present = gfile.exists(WORKSPACE)
if workspace_already_present:
    print("Removing previous workspace...")
    gfile.rmtree(WORKSPACE)

# Create the workspace first, then the data directory beneath it.
for announcement, folder in (
    ("Creating new workspace...", WORKSPACE),
    ("Creating data directory...", DATA_DIR),
):
    print(announcement)
    gfile.mkdir(folder)

# Destination paths of the two CSV files inside the data directory.
TRAIN_DATA_FILE = os.path.join(DATA_DIR, 'train.csv')
EVAL_DATA_FILE = os.path.join(DATA_DIR, 'eval.csv')

# (source URI in Cloud Storage, destination path) for each file to fetch.
download_plan = (
    ('gs://cloud-samples-data/ml-engine/census/data/adult.data.csv', TRAIN_DATA_FILE),
    ('gs://cloud-samples-data/ml-engine/census/data/adult.test.csv', EVAL_DATA_FILE),
)

print("Downloading raw data...")
for source_uri, target_path in download_plan:
    gfile.copy(src=source_uri, dst=target_path)
print("Data downloaded.")


### 2. Show sample data

In [None]:
!pip install -q gcsfs # required if your data files in GCS

In [None]:
import pandas as pd

# Column names handed to read_csv through `names`, in file order.
HEADER = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income_bracket',
]

# Load the training file, labelling its columns with HEADER.
train_data = pd.read_csv(TRAIN_DATA_FILE, names=HEADER)
print(f"Instance count: {len(train_data)}")

# Show the first five records transposed, so each column name becomes a row.
train_data.head().transpose()

## Next Step: TFDV
Continue to the next lab: data analysis and schema generation with TensorFlow Data Validation (TFDV).