# Manipulating Data Frames with Pandas

In [40]:
import pandas as pd
import numpy as np

In [5]:
# Load the iris measurements from a CSV file in the working directory.
df = pd.read_csv('iris.csv')
# Alternative: pass index_col to use one of the file's columns as the row index, e.g.
# df = pd.read_csv(filename, index_col='column-name')
# Summarize the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal.length    150 non-null float64
sepal.width     150 non-null float64
petal.length    150 non-null float64
petal.width     150 non-null float64
variety         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [6]:
# Preview the first rows of the frame (head() shows 5 rows by default).
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


## Indexing

In [11]:
# Select the first entry in the first column by label
df.loc[0, 'sepal.length'] # 0 is the row label, 'sepal.length' is the column label

5.1

The code below will give the same output.

In [9]:
# Select the first entry in the first column by position
df.iloc[0, 0] # both 0's are integer positions: row 0, column 0

5.1

In [49]:
# Select two columns by passing a list of column names
df[['sepal.length', 'sepal.width']].head()

Unnamed: 0,sepal.length,sepal.width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


## Slicing

In [17]:
# Label-based slices include BOTH endpoints: rows 0 through 5 and
# columns 'sepal.length' through 'petal.length'.
df.loc[0:5, 'sepal.length':'petal.length']

Unnamed: 0,sepal.length,sepal.width,petal.length
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4
5,5.4,3.9,1.7


In [25]:
# Print data in reverse column order: slice from 'petal.length' back to
# 'sepal.length' using a step of -1.
df.loc[0:5, 'petal.length':'sepal.length':-1]

Unnamed: 0,petal.length,sepal.width,sepal.length
0,1.4,3.5,5.1
1,1.4,3.0,4.9
2,1.3,3.2,4.7
3,1.5,3.1,4.6
4,1.4,3.6,5.0
5,1.7,3.9,5.4


In [31]:
# Slice columns: keep all rows and every column up to and including 'petal.length'
df.loc[:,:'petal.length'].head()

Unnamed: 0,sepal.length,sepal.width,petal.length
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4


## Filtering

In [38]:
# Create a boolean Series that is True where sepal.length < 4.8
bool_arr = df['sepal.length']<4.8
bool_arr.head()

0    False
1    False
2     True
3     True
4    False
Name: sepal.length, dtype: bool

In [39]:
# Filter the df with the boolean Series: only rows where it is True are kept
df[bool_arr]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
6,4.6,3.4,1.4,0.3,Setosa
8,4.4,2.9,1.4,0.2,Setosa
13,4.3,3.0,1.1,0.1,Setosa
22,4.6,3.6,1.0,0.2,Setosa
29,4.7,3.2,1.6,0.2,Setosa
38,4.4,3.0,1.3,0.2,Setosa
41,4.5,2.3,1.3,0.3,Setosa
42,4.4,3.2,1.3,0.2,Setosa


In [43]:
# The condition can also be written inline: keep rows whose variety is 'Setosa'
df[df.variety=='Setosa'].head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


## Filtering with NaNs

For the purpose of data manipulation, we'll now assign `np.nan` to the _sepal.length_ column where the length is exactly 4.8.

In [44]:
# Select rows with a boolean mask and the column by label, then assign NaN.
# NOTE: this modifies df in place, so every later cell sees the NaN values.
df.loc[df['sepal.length']==4.8, 'sepal.length'] = np.nan

In [45]:
# Check the non-null counts: sepal.length should now have fewer than the other columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal.length    145 non-null float64
sepal.width     150 non-null float64
petal.length    150 non-null float64
petal.width     150 non-null float64
variety         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [50]:
# (rows, columns) before dropping anything, for comparison with the results below
df.shape

(150, 5)

The method `.dropna()` is used to remove rows with missing data.

In [51]:
# Remove rows where any of the columns contains missing data
df.dropna(how='any').shape

(145, 5)

In [52]:
# Remove only rows where all of the columns contain missing data
df.dropna(how='all').shape

(150, 5)

We can specify a threshold `thresh` and an axis (here, columns) to drop a whole column if it has fewer non-missing values than the threshold.

In [55]:
# Drop columns with fewer than 150 non-missing values
df.dropna(thresh=150, axis='columns').shape

(150, 4)

## Transforming

### Apply

The `.apply()` method can be used to apply a function along an axis of a DataFrame, or to every element of a Series.

The lengths in the dataset are in centimeters. First, let's write a function which converts centimeters to inches.

In [58]:
def cm_to_inch(cm):
    """Convert a length from centimeters to inches.

    Parameters
    ----------
    cm
        Length in centimeters.

    Returns
    -------
    The length in inches, computed as ``cm`` multiplied by 0.393701.
    """
    inches_per_cm = 0.393701  # 1 / 2.54, rounded to six decimals
    inches = cm * inches_per_cm
    return inches

We can now apply this function over 'sepal.width' column of the iris dataset with the `.apply()` method.

In [60]:
# apply() calls cm_to_inch on each value and returns a new Series; df is not modified.
df['sepal.width'].apply(cm_to_inch).head()

0    1.377954
1    1.181103
2    1.259843
3    1.220473
4    1.417324
Name: sepal.width, dtype: float64

In [61]:
# The original column is unchanged (values are still in centimeters).
df['sepal.width'].head()

0    3.5
1    3.0
2    3.2
3    3.1
4    3.6
Name: sepal.width, dtype: float64

### Map

`.map()` method is used for substituting each value in a Series with another value, that may be derived from a function, a dict or a Series.

We'll create a dictionary to use as a look-up.