# Reading, inspecting, cleaning up, saving tabular data

I want to create ensembles of decision trees. Decision-tree ensembles are one of the two key techniques in machine learning (the other being neural networks trained with stochastic gradient descent).

In [14]:
# First, let's find a tabular dataset.
# I'll use the Titanic competition dataset from Kaggle.
import pathlib
import fastbook
import kaggle
import shutil

def get_kaggle_dataset(comp):
    """Download and unpack a Kaggle competition dataset, reusing a prior download.

    Args:
        comp: Kaggle competition name (e.g. "titanic"). It is also used as the
            name of the downloaded archive, ``{comp}.zip``.

    Returns:
        The directory (as returned by ``fastbook.URLs.path(comp)``) that holds
        the downloaded archive and its extracted files.

    Raises:
        Whatever the Kaggle client or ``shutil.unpack_archive`` raises. In that
        case the partially populated directory is removed first, so a later
        call retries the download instead of reporting "already exists".
    """
    # Authentication is left to the kaggle client; the credentials file is
    # deliberately not read here, so the secret never enters the notebook and
    # setups without ~/.kaggle/kaggle.json are not rejected by this helper.
    path = fastbook.URLs.path(comp)

    # An empty directory is what an interrupted earlier run leaves behind;
    # treat it as "not downloaded yet" rather than as a finished download.
    leftover_empty_dir = path.is_dir() and not any(path.iterdir())
    if path.exists() and not leftover_empty_dir:
        print(path, "already exists.")
        return path

    path.mkdir(parents=True, exist_ok=True)
    try:
        kaggle.api.competition_download_cli(comp, path=path)
        shutil.unpack_archive(str(path/f'{comp}.zip'), str(path))
    except BaseException:
        # Also covers Ctrl-C / SystemExit: never leave a half-filled directory
        # that the existence check above would mistake for a good download.
        shutil.rmtree(path, ignore_errors=True)
        raise
    return path
    
# Fetch the "titanic" competition data and print the directory the helper returns.
print(get_kaggle_dataset("titanic"))


Downloading titanic.zip to /home/john/.fastai/archive/titanic


100%|██████████| 34.1k/34.1k [00:00<00:00, 68.1MB/s]


/home/john/.fastai/archive/titanic





In [16]:
# Keep the dataset directory in `path` for the cells below, then list its contents.
# NOTE(review): Path.ls() is not part of stdlib pathlib — presumably patched in
# by fastcore through the fastbook import; confirm before running without fastbook.
path = get_kaggle_dataset("titanic")
print(path.ls())

/home/john/.fastai/archive/titanic already exists.
[Path('/home/john/.fastai/archive/titanic/gender_submission.csv'), Path('/home/john/.fastai/archive/titanic/titanic.zip'), Path('/home/john/.fastai/archive/titanic/train.csv'), Path('/home/john/.fastai/archive/titanic/test.csv')]


In [18]:
# Load the training set with pandas and inspect it.
import pandas as pd

# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column's dtype is inferred from all of its values at once.
df = pd.read_csv(path/'train.csv', low_memory=False)

In [19]:
# Display the column labels of the training frame.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [23]:
# Number of distinct values in each column; NaN is counted as its own value.
[(column, df[column].nunique(dropna=False)) for column in df.columns]


[('PassengerId', 891),
 ('Survived', 2),
 ('Pclass', 3),
 ('Name', 891),
 ('Sex', 2),
 ('Age', 89),
 ('SibSp', 7),
 ('Parch', 7),
 ('Ticket', 681),
 ('Fare', 248),
 ('Cabin', 148),
 ('Embarked', 4)]

In [24]:
# Show the actual values of the low-cardinality columns (fewer than 10 distinct
# values, NaN included). unique() is evaluated once per column and the result
# is reused for both the filter and the displayed value.
[
    (col, values)
    for col, values in ((name, df[name].unique()) for name in df.columns)
    if len(values) < 10
]

[('Survived', array([0, 1])),
 ('Pclass', array([3, 1, 2])),
 ('Sex', array(['male', 'female'], dtype=object)),
 ('SibSp', array([1, 0, 3, 4, 2, 5, 8])),
 ('Parch', array([0, 1, 2, 5, 3, 4, 6])),
 ('Embarked', array(['S', 'C', 'Q', nan], dtype=object))]