Skip to content

data preparation example

Khelil Sator edited this page Jun 30, 2019 · 5 revisions

Load the Titanic dataset

>>> import seaborn as sns
>>> titanic = sns.load_dataset('titanic')
>>> titanic.columns
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'], dtype='object')
>>> titanic.shape
(891, 15)

Check for missing values

>>> titanic.isnull().sum()
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

Fill the missing age values with the mean value

>>> titanic['age'].head(20)
0     22.0
1     38.0
2     26.0
3     35.0
4     35.0
5      NaN
6     54.0
7      2.0
8     27.0
9     14.0
10     4.0
11    58.0
12    20.0
13    39.0
14    14.0
15    55.0
16     2.0
17     NaN
18    31.0
19     NaN
Name: age, dtype: float64
>>> titanic['age'].mean()
29.69911764705882
>>> titanic.fillna(value={'age': titanic['age'].mean()}).age.head(20)
0     22.000000
1     38.000000
2     26.000000
3     35.000000
4     35.000000
5     29.699118
6     54.000000
7      2.000000
8     27.000000
9     14.000000
10     4.000000
11    58.000000
12    20.000000
13    39.000000
14    14.000000
15    55.000000
16     2.000000
17    29.699118
18    31.000000
19    29.699118
Name: age, dtype: float64
>>> titanic['age'] = titanic.fillna(value={'age': titanic['age'].mean()}).age
>>> titanic['age'].head(20)
0     22.000000
1     38.000000
2     26.000000
3     35.000000
4     35.000000
5     29.699118
6     54.000000
7      2.000000
8     27.000000
9     14.000000
10     4.000000
11    58.000000
12    20.000000
13    39.000000
14    14.000000
15    55.000000
16     2.000000
17    29.699118
18    31.000000
19    29.699118
Name: age, dtype: float64
>>> titanic['age'].isnull().sum()
0

Fill the missing embarked values with the most frequent value

>>> titanic['embarked'].mode()[0]
'S'
>>> titanic['embarked'] = titanic.fillna(value={'embarked': titanic['embarked'].mode()[0]}).embarked
>>> titanic['embarked'].isnull().sum()
0

Remove some columns (fare and embark_town)

>>> titanic = titanic.drop(columns=['fare'])
>>> titanic.pop('embark_town')
>>> titanic.shape
(891, 13)
>>> titanic.columns
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked', 'class', 'who', 'adult_male', 'deck', 'alive', 'alone'], dtype='object')

One-hot encode the sex column

>>> titanic['sex'].head(10) 
0      male
1    female
2    female
3    female
4      male
5      male
6      male
7      male
8    female
9    female
Name: sex, dtype: object
>>> import pandas as pd
>>> pd.get_dummies(titanic['sex'], prefix='sex').head(4)
   sex_female  sex_male
0           0         1
1           1         0
2           1         0
3           1         0
>>> titanic = pd.concat([titanic,pd.get_dummies(titanic['sex'], prefix='sex')],axis=1)
>>> titanic = titanic.drop(columns=['sex'])
>>> titanic.columns
Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'embarked', 'class', 'who', 'adult_male', 'deck', 'alive', 'alone', 'sex_female', 'sex_male'], dtype='object')

Find the columns that contain strings (object data type)

>>> titanic.dtypes
survived         int64
pclass           int64
age            float64
sibsp            int64
parch            int64
embarked        object
class         category
who             object
adult_male        bool
deck          category
alive           object
alone             bool
sex_female       uint8
sex_male         uint8
dtype: object
>>> titanic.select_dtypes(include ='object').head(3) 
  embarked    who alive
0        S    man    no
1        C  woman   yes
2        S  woman   yes
>>> for col in titanic.dtypes[titanic.dtypes == 'object'].index:
...     print(col)
... 
embarked
who
alive

One-hot encode all columns that use the object data type

>>> for col in titanic.dtypes[titanic.dtypes == 'object'].index:
...     titanic = pd.concat([titanic,pd.get_dummies(titanic[col], prefix=col)],axis=1)
...     titanic = titanic.drop(columns=[col])
... 
>>> titanic.columns
Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'class', 'adult_male', 'deck', 'alone', 'sex_female', 'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S', 'who_child', 'who_man', 'who_woman', 'alive_no', 'alive_yes'], dtype='object')
>>> titanic.dtypes
survived         int64
pclass           int64
age            float64
sibsp            int64
parch            int64
class         category
adult_male        bool
deck          category
alone             bool
sex_female       uint8
sex_male         uint8
embarked_C       uint8
embarked_Q       uint8
embarked_S       uint8
who_child        uint8
who_man          uint8
who_woman        uint8
alive_no         uint8
alive_yes        uint8
dtype: object
>>> titanic['embarked_C'].head()
0    0
1    1
2    0
3    0
4    0
Name: embarked_C, dtype: uint8
>>> titanic['embarked_Q'].head()
0    0
1    0
2    0
3    0
4    0
Name: embarked_Q, dtype: uint8
>>> titanic['embarked_S'].head()
0    1
1    0
2    1
3    1
4    1
Name: embarked_S, dtype: uint8

The column pclass (ticket class) uses integers:
1 = 1st = Upper
2 = 2nd = Middle
3 = 3rd = Lower

>>> titanic['pclass'].head()
0    3
1    1
2    3
3    1
4    3
Name: pclass, dtype: int64

Here's how to convert the pclass values to strings

>>> titanic['pclass'] = titanic['pclass'].apply(str)
>>> titanic['pclass'].head()
0    3
1    1
2    3
3    1
4    3
Name: pclass, dtype: object 
>>> titanic.dtypes
survived         int64
pclass          object
age            float64
sibsp            int64
parch            int64
class         category
adult_male        bool
deck          category
alone             bool
sex_female       uint8
sex_male         uint8
embarked_C       uint8
embarked_Q       uint8
embarked_S       uint8
who_child        uint8
who_man          uint8
who_woman        uint8
alive_no         uint8
alive_yes        uint8
dtype: object