In [None]:
%%bash
pip install pandas-profiling

In [None]:
import os
import pandas as pd
from pandas_profiling import ProfileReport

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Keep rendered DataFrames short and show floats with one decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

In [None]:
# Cloud settings used by the cells below. The brace-wrapped values look like
# template placeholders to be replaced with a real project and bucket name
# (TODO confirm).
PROJECT = '{PROJECT}'
BUCKET = '{BUCKET}'
REGION = 'asia-east1'

# Cloud Setup
This section is only required if running on the cloud.

In [None]:
# Export the settings as environment variables so that shell cells can read them.
os.environ.update({
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
})

In [None]:
%%bash
# Abort on the first failing command or unset variable instead of continuing silently.
set -eu
# Quote the expansions so each value is passed as a single argument.
gcloud config set project "$PROJECT"
gcloud config set compute/region "$REGION"

# Data Profiling
Data profiling is done to better understand the data and to check for invalid data (e.g. out-of-bounds values, unexpected data types). No data preprocessing should be done here; it should be done in tf.transform so as to have a consistent data pipeline.

In [None]:
# Download the CSV and keep only the 'v1' and 'v2' columns.
data_url = 'https://dl.dropboxusercontent.com/s/y7lm7aton223abm/spam.csv'
df = pd.read_csv(data_url).loc[:, ['v1', 'v2']]
df

In [None]:
# Build the profiling report and show it as the cell output.
profile_report = ProfileReport(df)
profile_report

# Split Data
This example uses an 80-10-10 split for train, eval, and test; change the fractions if necessary.

In [None]:
RANDOM_SEED = 42

# 80% of the rows go to train; the remaining rows form the hold-out pool.
train = df.sample(frac=0.8, random_state=RANDOM_SEED)
holdout = df.drop(train.index)

# Halve the hold-out pool: one half becomes test, the other half becomes eval.
test = holdout.sample(frac=0.5, random_state=RANDOM_SEED)
# NOTE: this name shadows the built-in eval(); it is kept because
# export_datasets reads the global `eval`.
eval = holdout.drop(test.index)

In [None]:
def export_datasets(on_cloud=False):
    """Write the train, eval and test splits to CSV files.

    Reads the module-level ``train``, ``eval`` and ``test`` DataFrames (and
    ``BUCKET`` when writing to the cloud).

    Args:
        on_cloud: If True, write to
            ``gs://<BUCKET>/spam-classification/data/split``; otherwise write
            to the local ``data/split`` directory, which is created if missing.

    Returns:
        None.
    """
    if on_cloud:
        data_dir = 'gs://{bucket}/spam-classification/data/split'.format(bucket=BUCKET)
    else:
        data_dir = 'data/split'
        # Only the local target needs a directory on disk; makedirs also
        # creates the parent and tolerates the directory already existing.
        os.makedirs(data_dir, exist_ok=True)

    splits = {'train.csv': train, 'eval.csv': eval, 'test.csv': test}
    for filename, split in splits.items():
        # Join with '/' explicitly: gs:// URLs always use forward slashes,
        # whereas os.path.join would use the OS-specific separator.
        split.to_csv('{}/{}'.format(data_dir, filename))


export_datasets(on_cloud=False)