# Creating bins with pandas

### Imports

In [1]:
% matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### Example: Creating age groups for the Titanic data set

In [2]:
# Raw string so the Windows backslashes are taken literally: in a normal string
# "\U" starts a unicode escape and Python 3 rejects the literal with a SyntaxError.
# NOTE: this is a machine-specific absolute path; point it at your copy of train.csv.
path = r"C:\Users\JC\Desktop\dscamp\dscamp\CS109\HW\Data sets\train.csv"

# Load the Titanic training data and preview the first rows
titanic = pd.read_csv(path)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


#### 1. Mapping "manually"

In [3]:
# Age column with the missing entries removed
ages = titanic["Age"].dropna()

# Nine evenly spaced edges from 0 up to the largest observed age (i.e. eight bins)
oldest = ages.max()
bins = np.linspace(0, oldest, num=9)

#The mapping function
def map_bin(x, bins):
    """Return the label '[lower-upper]' of the bin that contains ``x``.

    Parameters
    ----------
    x : number
        Value to classify.
    bins : sequence of numbers
        Bin edges, expected in increasing order (e.g. from ``np.linspace``).

    Returns
    -------
    str or float
        ``'[lower-upper]'`` built from the two edges surrounding ``x``. Bins
        are closed on the left and open on the right, except the last one,
        which also includes the largest edge. ``np.nan`` is returned when
        ``x`` is missing or lies outside the edges, mirroring what ``pd.cut``
        does for such values.
    """
    lowest, highest = min(bins), max(bins)
    # NaN fails both comparisons, so missing values are caught here as well.
    # Without this guard np.digitize returns 0 or len(bins) for such values,
    # which either wraps around to the last edge or indexes past the end.
    if not (lowest <= x <= highest):
        return np.nan
    # right=True only for the maximum, so it joins the last bin instead of
    # being placed one position past it.
    idx = np.digitize([x], bins, right=bool(x == highest))[0]
    return '[{0}-{1}]'.format(bins[idx - 1], bins[idx])

# Keep only fully populated rows.
# NOTE(review): dropna() without `subset` removes rows with a missing value in
# ANY column, not just Age -- confirm that this is intended.
titanic = titanic.dropna()

# Label every remaining passenger with the age bin they fall into
age_bin_labels = titanic["Age"].apply(lambda age: map_bin(age, bins))
titanic["AgeBin"] = age_bin_labels
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBin
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,[30.0-40.0]
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,[30.0-40.0]
6,7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S,[50.0-60.0]
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S,[0.0-10.0]
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S,[50.0-60.0]


#### 2. Using the `pd.cut` method (more convenient)

In [4]:
# Raw string so the Windows backslashes are taken literally: in a normal string
# "\U" starts a unicode escape and Python 3 rejects the literal with a SyntaxError.
# NOTE: this is a machine-specific absolute path; point it at your copy of train.csv.
path = r"C:\Users\JC\Desktop\dscamp\dscamp\CS109\HW\Data sets\train.csv"

# Re-read the CSV so this section works on a freshly loaded frame
titanic = pd.read_csv(path)

# Creating the labels, one per age group (despite its name, `bins` holds the labels here)
bins = ["0-9","10-19","20-29","30-39","40-49","50-59","60-69","70-79","80-89"]

# The edges 0, 10, ..., 90 are ten values delimiting nine intervals, so pd.cut
# needs exactly one label fewer than there are edges.
# right=False makes every interval closed on the left and open on the right
# ([0, 10), [10, 20), ...), which is what the "0-9", "10-19", ... labels describe.
titanic["Age Groups"] = pd.cut(titanic.Age, np.arange(0, 91, 10), right=False, labels=bins)

# Check that an 80-year-old passenger lands in the "80-89" group
titanic[titanic.Age == 80]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Groups
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S,80-89
