In [1]:
import pandas as pd
import numpy as np

#### NA handling methods
##### Remove NA rows

In [2]:
from numpy import nan as NA

In [3]:
# Load the Kaggle Titanic training set (path is relative to the working directory).
titanic = pd.read_csv('data/kaggle_titanic/train.csv')

# Preview the first few passengers that have no cabin recorded.
cabin_missing = titanic['Cabin'].isna()
titanic.loc[cabin_missing].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


In [4]:
def check_null(dataset):
    """Print the columns of ``dataset`` that contain nulls, with their null counts.

    Only columns holding at least one null are listed, ordered from the most
    nulls to the fewest, framed by a banner line above and below.
    Nothing is returned.
    """
    banner = "=" * 35
    per_column = dataset.isnull().sum()
    with_nulls = per_column[per_column > 0]
    print("<{0} {1} {0}>".format(banner, "Dataset null value count"))
    print(with_nulls.sort_values(ascending=False))
    print("<{0}>".format(banner))

# Report which Titanic columns contain nulls, before any imputation is applied.
check_null(titanic)

Cabin       687
Age         177
Embarked      2
dtype: int64


In [5]:
# Count the missing values in each row by summing the null mask across columns.
null_mask = titanic.isna()
titanic['Nulls_Per_Row'] = null_mask.sum(axis='columns')

In [6]:
# Passengers with exactly two missing fields.
has_two_nulls = titanic['Nulls_Per_Row'].eq(2)
titanic.loc[has_two_nulls]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Nulls_Per_Row
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,2
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,2
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,2
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,2
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,2
29,30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S,2
32,33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.7500,,Q,2
36,37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C,2
42,43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C,2
45,46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.0500,,S,2


#### Drop Nulls - Not a wise option

In [7]:
# Share of rows that would be discarded if every row containing a null were dropped.
rows_total = titanic.shape[0]
rows_complete = titanic.dropna(axis=0).shape[0]
pct_lost = 100 - (rows_complete / rows_total) * 100
print("% Data lost due to dropping of nulls :{0}".format(pct_lost))

% Data lost due to dropping of nulls :79.46127946127946


#### Preferred method for imputing nulls
#### mode for categorical features -- Cabin, Embarked
#### mean/median for continuous features -- Age

In [8]:
# Impute missing embarkation ports with the most frequent port. The fill value
# is taken from the data rather than hard-coded; mode() returns a Series
# (ties are possible), so its first entry is used.
embarked_mode = titanic['Embarked'].mode().iloc[0]

# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which pandas deprecates and which
# leaves `titanic` unchanged once Copy-on-Write is enabled.
titanic['Embarked'] = titanic['Embarked'].fillna(embarked_mode)
check_null(titanic)

Cabin    687
Age      177
dtype: int64


In [9]:
# Impute missing ages with the median age, computed from the data rather than
# hard-coded so the cell stays correct if the input file changes.
age_median = titanic['Age'].median()

# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which pandas deprecates and which
# leaves `titanic` unchanged once Copy-on-Write is enabled.
titanic['Age'] = titanic['Age'].fillna(age_median)
check_null(titanic)

Cabin    687
dtype: int64


In [10]:
# Most frequent cabin value(s); mode() returns every value tied for the top count.
cabin_modes = titanic['Cabin'].mode()
cabin_modes

0        B96 B98
1    C23 C25 C27
2             G6
dtype: object

In [11]:
# Small example frame for the value-mapping cells: food names (with deliberately
# inconsistent capitalisation) and a weight in ounces for each.
food_items = [
    'bacon', 'pulled pork', 'bacon',
    'Pastrami', 'corned beef', 'Bacon',
    'pastrami', 'honey ham', 'nova lox',
]
weights_oz = [4, 3, 12, 6, 7.5, 8, 3, 5, 6]

data = pd.DataFrame({'food': food_items, 'ounces': weights_oz})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [12]:
# Lower-case meat name -> source animal.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

# The food column mixes cases ('Bacon', 'Pastrami') while the mapping keys are
# all lower-case, so normalise the names before looking them up.
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [13]:
def _animal_for(food):
    """Return the source animal for ``food``, looked up by its lower-cased name."""
    return meat_to_animal[food.lower()]


# Same lookup as the 'animal' column, done with a per-element function passed to
# map instead of passing the dict itself.
data['animal_map'] = data['food'].map(_animal_for)
data

Unnamed: 0,food,ounces,animal,animal_map
0,bacon,4.0,pig,pig
1,pulled pork,3.0,pig,pig
2,bacon,12.0,pig,pig
3,Pastrami,6.0,cow,cow
4,corned beef,7.5,cow,cow
5,Bacon,8.0,pig,pig
6,pastrami,3.0,cow,cow
7,honey ham,5.0,pig,pig
8,nova lox,6.0,salmon,salmon


#### Discretization and Binning 

In [16]:
# Bin ages into labelled groups. pd.cut builds right-closed intervals, so with a
# lowest edge of 18 every passenger aged 18 or under (e.g. the 2-year-old in the
# data) would fall outside all bins and get NaN. The added 0-18 'Child' bin
# covers them; the existing bin edges and labels are unchanged.
age_bins = [0, 18, 25, 35, 60, 100]
agebin_names = ['Child', 'Youth', 'YoungAdult', 'MiddleAged', 'Senior']
titanic['Age_Bins'] = pd.cut(titanic['Age'], bins=age_bins, labels=agebin_names)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Nulls_Per_Row,Age_Bins
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,Youth
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,MiddleAged
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,YoungAdult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,YoungAdult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,YoungAdult
