# Pandas tutorial, part 1: basic indexing

## 0. Load data

In [1]:
import pandas as pd

In [2]:
# Load the Titanic passenger table; `survived` is parsed as bool and `body` is kept as text.
df = pd.read_csv(
    './datasets/titanic3.csv',
    dtype={'survived': bool, 'body': str},
)

In [3]:
# Preview the first five rows of the table.
df.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,True,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,True,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,False,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,False,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,False,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## 1. Indexing by integer location 

### 1.1. Index row(s)

In [4]:
# Positional lookup: the row at integer position 55, all columns (comes back as a Series).
df.iloc[55, :]

pclass                               1
survived                          True
name         Carter, Miss. Lucile Polk
sex                             female
age                                 14
sibsp                                1
parch                                2
ticket                          113760
fare                               120
cabin                          B96 B98
embarked                             S
boat                                 4
body                               NaN
home.dest                Bryn Mawr, PA
Name: 55, dtype: object

Note that slicing via `iloc` is ***exclusive*** of the stop position (Python style), so `3:6` returns positions 3, 4 and 5:

In [5]:
# Positional slice 3:6 -> positions 3, 4 and 5; the stop position 6 is left out.
df.iloc[3:6, :]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
3,1,False,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,False,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1,True,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3.0,,"New York, NY"


Non-contiguous subsetting, using a list of positions. Note that this subsetting is ***inclusive***: every listed position is returned, in the order given:

In [6]:
# Hand-picked row positions; rows come back in exactly the order listed.
row_positions = [714, 945, 946, 949, 858, 674, 972]
df.iloc[row_positions, :]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
714,3,True,"Chip, Mr. Chang",male,32.0,0,0,1601,56.4958,,S,C,,"Hong Kong New York, NY"
945,3,True,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S,C,,
946,3,False,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S,,,
949,3,True,"Lang, Mr. Fang",male,26.0,0,0,1601,56.4958,,S,14,,
858,3,True,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S,C,,
674,3,True,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,C,,"Hong Kong New York, NY"
972,3,False,"Ling, Mr. Lee",male,28.0,0,0,1601,56.4958,,S,,,


### 1.2. Index column(s)

Exclusive column indexing (a slice of positions; the stop position is excluded):

In [7]:
row_positions = [714, 945, 946, 949, 858, 674, 972]
# Column slice 2:5 -> column positions 2, 3 and 4 (name, sex, age); position 5 is excluded.
df.iloc[row_positions, 2:5]

Unnamed: 0,name,sex,age
714,"Chip, Mr. Chang",male,32.0
945,"Lam, Mr. Ali",male,
946,"Lam, Mr. Len",male,
949,"Lang, Mr. Fang",male,26.0
858,"Hee, Mr. Ling",male,
674,"Bing, Mr. Lee",male,32.0
972,"Ling, Mr. Lee",male,28.0


Inclusive column indexing (a list of positions; every listed position is returned):

In [8]:
row_positions = [714, 945, 946, 949, 858, 674, 972]
column_positions = [0, 2, 4]  # pclass, name, age
# With lists on both axes, every listed position is returned.
df.iloc[row_positions, column_positions]

Unnamed: 0,pclass,name,age
714,3,"Chip, Mr. Chang",32.0
945,3,"Lam, Mr. Ali",
946,3,"Lam, Mr. Len",
949,3,"Lang, Mr. Fang",26.0
858,3,"Hee, Mr. Ling",
674,3,"Bing, Mr. Lee",32.0
972,3,"Ling, Mr. Lee",28.0


## 3. Indexing by labels

### 3.1. Index row(s)

In [9]:
# Label lookup: the row whose index label is 985, all columns.
df.loc[985, :]

pclass                               3
survived                          True
name         Madsen, Mr. Fridtjof Arne
sex                               male
age                                 24
sibsp                                0
parch                                0
ticket                         C 17369
fare                            7.1417
cabin                              NaN
embarked                             S
boat                                13
body                               NaN
home.dest                          NaN
Name: 985, dtype: object

Note that slicing via `loc` is ***inclusive*** (MATLAB/R style), because we are essentially subsetting "row names":

In [10]:
# Label slice: both end labels (400 and 403) are part of the result.
df.loc[400:403, :]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
400,2,True,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34.0,1,1,28220,32.5,,S,10.0,,"Greenport, NY"
401,2,True,"Duran y More, Miss. Asuncion",female,27.0,1,0,SC/PARIS 2149,13.8583,,C,12.0,,"Barcelona, Spain / Havana, Cuba"
402,2,True,"Duran y More, Miss. Florentina",female,30.0,1,0,SC/PARIS 2148,13.8583,,C,12.0,,"Barcelona, Spain / Havana, Cuba"
403,2,False,"Eitemiller, Mr. George Floyd",male,23.0,0,0,29751,13.0,,S,,,"England / Detroit, MI"


Similarly, we can use a list to subset "row names" with the `.reindex` method (https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html). Note how `-10` and `-1` give NaNs, because they are not row names:

In [11]:
# -10 and -1 are not labels in the index, so their rows come back filled with NaN.
row_labels = [0, 6, 76, -10, -1]
df.reindex(row_labels)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,True,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
6,1.0,True,"Andrews, Miss. Kornelia Theodosia",female,63.0,1.0,0.0,13502,77.9583,D7,S,10.0,,"Hudson, NY"
76,1.0,True,"Compton, Miss. Sara Rebecca",female,39.0,1.0,1.0,PC 17756,83.1583,E49,C,14.0,,"Lakewood, NJ"
-10,,,,,,,,,,,,,,
-1,,,,,,,,,,,,,,


### 3.2. Index column(s)

Continuous column indexing using colon (`:`):

In [12]:
# Label slices on both axes: rows 379 through 382 and columns 'pclass' through 'age', end labels included.
df.loc[379:382, 'pclass':'age']

Unnamed: 0,pclass,survived,name,sex,age
379,2,True,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0
380,2,True,"Cook, Mrs. (Selena Rogers)",female,22.0
381,2,False,"Corbett, Mrs. Walter H (Irene Colvin)",female,30.0
382,2,False,"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Mi...",female,


Discrete column indexing using a list:

In [13]:
# Select rows and columns by explicit label lists; unknown row labels (-10, -1) yield NaN rows.
row_labels = [0, 6, 76, -10, -1]
column_labels = ['survived', 'name', 'age']
df.reindex(row_labels, columns=column_labels)

Unnamed: 0,survived,name,age
0,True,"Allen, Miss. Elisabeth Walton",29.0
6,True,"Andrews, Miss. Kornelia Theodosia",63.0
76,True,"Compton, Miss. Sara Rebecca",39.0
-10,,,
-1,,,


To index a single column as a DataFrame:

In [14]:
# The inner brackets form a one-element list of labels, so the result is a DataFrame, not a Series.
df[['name']].head(n=6)

Unnamed: 0,name
0,"Allen, Miss. Elisabeth Walton"
1,"Allison, Master. Hudson Trevor"
2,"Allison, Miss. Helen Loraine"
3,"Allison, Mr. Hudson Joshua Creighton"
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)"
5,"Anderson, Mr. Harry"


## 4. Mixed indexing (by label and by integer location)

Here are some possible methods:

In [15]:
# Two-step read: take the row at position 350, then pick the 'name' entry from that row by label.
df.iloc[350].loc['name']

'Brown, Miss. Edith Eileen'

In [16]:
# Two-step read: select the 'name' column by label, then take the element at position 379.
df.loc[:, 'name'].iloc[379]

'Collyer, Mrs. Harvey (Charlotte Annie Tate)'

In [17]:
# Two-step read: bracket lookup of the 'name' column, then the element at position 469.
df['name'].iloc[469]

'Keane, Miss. Nora A'

In [18]:
# Columns by label range, then rows by negative positions, which count back from the end of the frame.
df.loc[:, 'pclass':'age'].iloc[-702:-698]

Unnamed: 0,pclass,survived,name,sex,age
607,3,True,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0
608,3,False,"Adahl, Mr. Mauritz Nils Martin",male,30.0
609,3,False,"Adams, Mr. John",male,26.0
610,3,False,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40.0


However, assigning through "chained indexing" is not recommended, because it can give unpredictable results: the value may be written to a temporary copy instead of to `df` itself:

In [19]:
# Deliberate anti-pattern: df['fare'] is looked up first, then [:7] assigns into that intermediate result.
df['fare'][:7] = -9

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [20]:
# Deliberate anti-pattern: column lookup followed by a separate .iloc assignment on the result.
df['age'].iloc[4] = -11

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [21]:
# Deliberate anti-pattern: .loc selects the column, then a second indexer (.iloc) does the assignment.
df.loc[:,'fare'].iloc[-10:-5] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [22]:
# Deliberate anti-pattern: .loc selects the column, then a plain slice on the result does the assignment.
df.loc[:,'age'][-10:-5] = -99

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Instead, use only one indexing command:

In [23]:
# One indexer, one assignment: row position 34, column position 3 (`sex`), written directly on df.
df.iloc[34, 3] = -1

or:

In [24]:
# Single-step assignment that mixes a row *position* with a column *label*:
# columns.get_loc('fare') turns the label into its integer position, so the
# setter has to be the positional one (.iloc). Handing that integer to .loc
# would treat it as a column label instead of as the position of 'fare'.
df.iloc[997, df.columns.get_loc('fare')] = -999