# Manipulating Data Frames with Pandas

In [75]:
# Import modules
import pandas as pd
import numpy as np

In [76]:
# Read the iris dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv('iris.csv')
# Alternative form: pass index_col to use one of the file's columns as the row index
# df = pd.read_csv(filename, index_col='column-name')

# Summarize the frame: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal.length    150 non-null float64
sepal.width     150 non-null float64
petal.length    150 non-null float64
petal.width     150 non-null float64
variety         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [77]:
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


## Indexing

In [78]:
# Select the first entry in the first column by label
df.loc[0, 'sepal.length'] # 0 is the row label, 'sepal.length' is the column label

5.1

The code below will give the same output.

In [79]:
# Select the first entry in the first column by integer position
df.iloc[0, 0] # both 0's are positions: (row, column)

5.1

In [80]:
# Select two columns by passing a list of column names
df[['sepal.length', 'sepal.width']].head()

Unnamed: 0,sepal.length,sepal.width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


## Slicing

In [81]:
# Label-based slice: unlike plain Python slices, .loc includes both endpoints
# (row 5 and the 'petal.length' column are part of the result)
df.loc[0:5, 'sepal.length':'petal.length']

Unnamed: 0,sepal.length,sepal.width,petal.length
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4
5,5.4,3.9,1.7


In [82]:
# Reverse the column order: slice from 'petal.length' back to 'sepal.length' with step -1
df.loc[0:5, 'petal.length':'sepal.length':-1]

Unnamed: 0,petal.length,sepal.width,sepal.length
0,1.4,3.5,5.1
1,1.4,3.0,4.9
2,1.3,3.2,4.7
3,1.5,3.1,4.6
4,1.4,3.6,5.0
5,1.7,3.9,5.4


In [83]:
# Slice columns: all rows, every column from the first one up to and including 'petal.length'
df.loc[:,:'petal.length'].head()

Unnamed: 0,sepal.length,sepal.width,petal.length
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4


## Filtering

In [84]:
# Create a boolean array of the condition where sepal.length < 4.8
bool_arr = df['sepal.length']<4.8
bool_arr.head()

0    False
1    False
2     True
3     True
4    False
Name: sepal.length, dtype: bool

In [85]:
# Filter the df with a boolean array
df[bool_arr]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
6,4.6,3.4,1.4,0.3,Setosa
8,4.4,2.9,1.4,0.2,Setosa
13,4.3,3.0,1.1,0.1,Setosa
22,4.6,3.6,1.0,0.2,Setosa
29,4.7,3.2,1.6,0.2,Setosa
38,4.4,3.0,1.3,0.2,Setosa
41,4.5,2.3,1.3,0.3,Setosa
42,4.4,3.2,1.3,0.2,Setosa


In [86]:
# Filter with an inline boolean condition: keep rows whose 'variety' equals 'Setosa'
df[df.variety=='Setosa'].head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


## Filtering with NaNs

For the purpose of data manipulation, we'll now assign `np.nan` to the _sepal.length_ column where the length is exactly 4.8.

In [87]:
# The boolean mask picks the rows, the label picks the column; those cells are set to NaN.
# Note: this modifies df in place.
df.loc[df['sepal.length']==4.8, 'sepal.length'] = np.nan

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal.length    145 non-null float64
sepal.width     150 non-null float64
petal.length    150 non-null float64
petal.width     150 non-null float64
variety         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [89]:
df.shape

(150, 5)

The method `.dropna()` is used to remove rows with missing data.

In [90]:
# Drop rows in which at least one column has missing data (how='any');
# .dropna() returns a new frame, so df itself is not changed
df.dropna(how='any').shape

(145, 5)

In [91]:
# Drop only rows in which every column is missing (how='all');
# there are no such rows here, so the shape is unchanged
df.dropna(how='all').shape

(150, 5)

We can specify a threshold `thresh` and an axis (here, columns) to drop a whole column if it has fewer non-missing values than the threshold.

In [92]:
# Drop columns with fewer than 150 non-missing values;
# 'sepal.length' now has only 145, so it is the one column removed
df.dropna(thresh=150, axis='columns').shape

(150, 4)

## Transforming

### Apply

The `.apply()` method applies a function along an axis of a DataFrame, or to each element of a Series.

The lengths in the dataset are in centimeters. First, let's write a function that converts centimeters to inches.

In [93]:
# Conversion helper: centimeters -> inches
def cm_to_inch(cm):
    """Convert a length from centimeters to inches.

    Parameters
    ----------
    cm : number or array-like (e.g. a pandas Series)
        Length(s) expressed in centimeters.

    Returns
    -------
    The result of ``cm * 0.393701`` (a scalar for scalar input,
    element-wise for array-like input).
    """
    inches_per_cm = 0.393701  # rounded inches-per-centimeter factor
    inches = cm * inches_per_cm
    return inches

We can now apply this function over 'sepal.width' column of the iris dataset with the `.apply()` method.

In [94]:
df['sepal.width'].apply(cm_to_inch).head()

0    1.377954
1    1.181103
2    1.259843
3    1.220473
4    1.417324
Name: sepal.width, dtype: float64

In [95]:
df['sepal.width'].head()

0    3.5
1    3.0
2    3.2
3    3.1
4    3.6
Name: sepal.width, dtype: float64

### Map

The `.map()` method substitutes each value in a Series with another value, which may be derived from a function, a dict or a Series.

We'll create a dictionary for a look-up.

In [96]:
# Unique values in 'variety' column
df.variety.unique()

array(['Setosa', 'Versicolor', 'Virginica'], dtype=object)

In [97]:
# Look-up table: variety name -> color name
colors = {'Setosa':'red', 'Virginica':'blue', 'Versicolor':'green'}

# Map each value of the 'variety' column through the dictionary
# and store the result in a new 'colors' column
df['colors'] = df['variety'].map(colors)
df.colors.unique()

array(['red', 'green', 'blue'], dtype=object)

In [98]:
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,colors
0,5.1,3.5,1.4,0.2,Setosa,red
1,4.9,3.0,1.4,0.2,Setosa,red
2,4.7,3.2,1.3,0.2,Setosa,red
3,4.6,3.1,1.5,0.2,Setosa,red
4,5.0,3.6,1.4,0.2,Setosa,red


### Vectorized Functions

It is better to use vectorized functions instead of `.apply()` and `.map()` to achieve better performance. NumPy, SciPy and pandas come with a variety of fast, vectorized element-wise array functions. In NumPy these are called Universal Functions (ufuncs).

>The efficiency of several methodologies for applying a function to a Pandas DataFrame, from slowest to fastest:
1. Crude looping over DataFrame rows using indices
2. Looping with `iterrows()`
3. Looping with `apply()`
4. Vectorization with Pandas series
5. Vectorization with NumPy arrays
[source](https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6)

In [111]:
# A DataFrame column is a pandas Series; its .values attribute exposes the underlying NumPy array
type(df['sepal.length']), type(df['sepal.length'].values)

(pandas.core.series.Series, numpy.ndarray)

## Advanced Indexing

In [100]:
df.index

RangeIndex(start=0, stop=150, step=1)

Indexes are immutable objects. That is, if we want to change an index, we'll need to change the whole index. We can do that with a list comprehension.

In [101]:
# Replace the whole index so that the first observation is labelled 1 instead of 0:
# build a new list of labels (old label + 1) and assign it to df.index
df.index = [i + 1 for i in df.index]

In [102]:
df.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            141, 142, 143, 144, 145, 146, 147, 148, 149, 150],
           dtype='int64', length=150)

We can also use `range` to create an index from a range object.

In [103]:
# Build the 1-based index from the frame's own length instead of hard-coding
# 150 rows, so the cell keeps working if the dataset size changes
df.index = range(1, len(df) + 1)

In [104]:
df.index

RangeIndex(start=1, stop=151, step=1)

In [105]:
df.head(3)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,colors
1,5.1,3.5,1.4,0.2,Setosa,red
2,4.9,3.0,1.4,0.2,Setosa,red
3,4.7,3.2,1.3,0.2,Setosa,red


In [106]:
# Name the row index 'Obs #' and the column index 'Vars'
df.index.name = 'Obs #'
df.columns.name = 'Vars'

In [120]:
df.head(3)

Vars,sepal.length,sepal.width,petal.length,petal.width,variety,colors
Obs #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,5.1,3.5,1.4,0.2,Setosa,red
2,4.9,3.0,1.4,0.2,Setosa,red
3,4.7,3.2,1.3,0.2,Setosa,red


### Hierarchical Indexing - Multi-Index

In [118]:
# Select the first observation by its label; a single label returns the row as a Series
df.loc[1]

Vars
sepal.length       5.1
sepal.width        3.5
petal.length       1.4
petal.width        0.2
variety         Setosa
colors             red
Name: 1, dtype: object

In [119]:
# Select the first observation with a one-element list of labels; the result stays a DataFrame
df.loc[[1]]

Vars,sepal.length,sepal.width,petal.length,petal.width,variety,colors
Obs #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,5.1,3.5,1.4,0.2,Setosa,red


In [124]:
# Print first and third observations
df.loc[[1, 3]]

Vars,sepal.length,sepal.width,petal.length,petal.width,variety,colors
Obs #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,5.1,3.5,1.4,0.2,Setosa,red
3,4.7,3.2,1.3,0.2,Setosa,red


In [123]:
# Print the first through third observations (label slice, both ends included)
df.loc[1:3]

Vars,sepal.length,sepal.width,petal.length,petal.width,variety,colors
Obs #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,5.1,3.5,1.4,0.2,Setosa,red
2,4.9,3.0,1.4,0.2,Setosa,red
3,4.7,3.2,1.3,0.2,Setosa,red


In [131]:
# Create a new column numbering the observations from 1 to the number of rows
# (taken from the frame itself rather than hard-coding 150)
df['#'] = range(1, len(df) + 1)

# Build a two-level (hierarchical) index from the columns ['#', 'colors'];
# the result is assigned to df_multi, df keeps its own index
df_multi = df.set_index(['#', 'colors'])
# Sort the MultiIndex before doing label-based lookups on it
df_multi = df_multi.sort_index()

df_multi.head(3)

Unnamed: 0_level_0,Vars,sepal.length,sepal.width,petal.length,petal.width,variety
#,colors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,red,5.1,3.5,1.4,0.2,Setosa
2,red,4.9,3.0,1.4,0.2,Setosa
3,red,4.7,3.2,1.3,0.2,Setosa


In [136]:
# Look up the rows whose '#' level is 1 or 2 and whose 'colors' level is 'red'
# (the first two rows)

df_multi.loc[((1,2), 'red'),:]

Unnamed: 0_level_0,Vars,sepal.length,sepal.width,petal.length,petal.width,variety
#,colors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,red,5.1,3.5,1.4,0.2,Setosa
2,red,4.9,3.0,1.4,0.2,Setosa


In [140]:
# Look up all observations whose 'colors' level is 'blue' (Virginica);
# slice(None) selects every value of the '#' level
df_multi.loc[(slice(None), 'blue'),:].head(3)

Unnamed: 0_level_0,Vars,sepal.length,sepal.width,petal.length,petal.width,variety
#,colors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
101,blue,6.3,3.3,6.0,2.5,Virginica
102,blue,5.8,2.7,5.1,1.9,Virginica
103,blue,7.1,3.0,5.9,2.1,Virginica
