In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
# helper to display Pandas Table output side by side

from IPython.display import display_html

def highlight(data):
    return ['background-color: yellow' for x in data]

def display_side_by_side(subset, *args):
    html_str=''
    for i, df in enumerate(args):
        if i:
            html_str+=df.style.render()
        else:
            df.style.apply(highlight, subset)
            html_str+=df.style.render()
            
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

# Data selection & Indexing

## Series

In [3]:
series = pd.Series([3, 62, 75, 83, 47, 43, 39, 16, 19, 2])

In [4]:
series

0     3
1    62
2    75
3    83
4    47
5    43
6    39
7    16
8    19
9     2
dtype: int64

### Access by Position / Slice

In [5]:
series[0]

3

In [6]:
series[3:6]

3    83
4    47
5    43
dtype: int64

In [7]:
# series[3:6]
series.iloc[3:6]
# note [] not ()!

3    83
4    47
5    43
dtype: int64

### Access by label

In [8]:
len(series)

10

In [9]:
# set alpha label as new index for the series
series.index = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"][:len(series)]

In [10]:
series

A     3
B    62
C    75
D    83
E    47
F    43
G    39
H    16
I    19
J     2
dtype: int64

In [11]:
series[3:6]
# position, pythonic

D    83
E    47
F    43
dtype: int64

In [12]:
series['D':'F']
# by label: slice includes end! 

D    83
E    47
F    43
dtype: int64

In [13]:
series[['D':'F', 'I':'J']]
# cannot combine multiple ranges

SyntaxError: invalid syntax (1019855344.py, line 1)

In [14]:
pd.concat([series['D':'F'], series['I':'J']])
# concat to combine multiple ranges

D    83
E    47
F    43
I    19
J     2
dtype: int64

In [15]:
# set alpha label as new index for the series
series.index = [x for x in "GATTACAXYZ"][:len(series)]

In [16]:
series

G     3
A    62
T    75
T    83
A    47
C    43
A    39
X    16
Y    19
Z     2
dtype: int64

In [17]:
series.loc['G']

3

In [18]:
series.loc['A']

A    62
A    47
A    39
dtype: int64

In [19]:
series.loc['G':'A']
# non-unique values breaks slicing

KeyError: "Cannot get right slice bound for non-unique label: 'A'"

In [None]:
series.loc['X':'Z']
# while unique values are still slicable in a non-unique index

## DataFrames, 2D Data

In [20]:
df = pd.read_json('./data/sampledf.json')

In [21]:
from IPython import display

In [22]:
# visualisation of below - for presentation
display.display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, 2]))

# column
df[2]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25


0     21
1     89
2     31
3    100
4     83
5     73
6     18
7     22
8     89
9     36
Name: 2, dtype: int64

In [23]:
df[2:4]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51


In [24]:
# visualisation of below - for presentation
display.display_html(df.style.apply(highlight, 
                                    subset=pd.IndexSlice[range(2, 4), :]))

# column
df[2]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25


0     21
1     89
2     31
3    100
4     83
5     73
6     18
7     22
8     89
9     36
Name: 2, dtype: int64

In [25]:
df.iloc[2:, 2]

2     31
3    100
4     83
5     73
6     18
7     22
8     89
9     36
Name: 2, dtype: int64

In [26]:
# visualisation of below - for presentation
display.display_html(df.style.apply(highlight, subset=pd.IndexSlice[range(2, 4), range(2, 4)]))


# segment
df.iloc[2, :]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25


0    37
1    64
2    31
3    69
4    61
5    97
6     5
7    11
8    76
9    57
Name: 2, dtype: int64

In [27]:
# visualisation of below - for presentation
display.display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, range(2, 4)]))

# column slice
df.iloc[:, 2:4]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25


Unnamed: 0,2,3
0,21,99
1,89,66
2,31,69
3,100,6
4,83,85
5,73,73
6,18,98
7,22,4
8,89,13
9,36,54


In [28]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25


In [29]:
df.index = ["R{:02d}".format(i) for i in range(len(df))]

In [30]:
df.columns = ["C{:02d}".format(i) for i in range(len(df.columns))]

In [31]:
df

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R00,79,19,21,99,35,59,44,25,75,58
R01,25,39,89,66,9,41,6,69,63,3
R02,37,64,31,69,61,97,5,11,76,57
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12
R06,12,29,18,98,62,68,92,29,74,96
R07,36,32,22,4,66,25,63,51,59,14
R08,55,53,89,13,84,87,74,3,2,64
R09,46,74,36,54,21,12,68,33,80,25


In [32]:
# visualisation of below - for presentation
display.display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, 'C05']))

df['C05']

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R00,79,19,21,99,35,59,44,25,75,58
R01,25,39,89,66,9,41,6,69,63,3
R02,37,64,31,69,61,97,5,11,76,57
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12
R06,12,29,18,98,62,68,92,29,74,96
R07,36,32,22,4,66,25,63,51,59,14
R08,55,53,89,13,84,87,74,3,2,64
R09,46,74,36,54,21,12,68,33,80,25


R00    59
R01    41
R02    97
R03    80
R04     5
R05    60
R06    68
R07    25
R08    87
R09    12
Name: C05, dtype: int64

In [33]:
# visualisation of below - for presentation
#display(df.style.apply(highlight, subset=pd.IndexSlice[:, 'C05']))

df['R05']

KeyError: 'R05'

In [34]:
# visualisation of below - for presentation
display.display_html(df.style.apply(highlight, subset=pd.IndexSlice['R02':'R05', :]))


df['R02':'R05']

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R00,79,19,21,99,35,59,44,25,75,58
R01,25,39,89,66,9,41,6,69,63,3
R02,37,64,31,69,61,97,5,11,76,57
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12
R06,12,29,18,98,62,68,92,29,74,96
R07,36,32,22,4,66,25,63,51,59,14
R08,55,53,89,13,84,87,74,3,2,64
R09,46,74,36,54,21,12,68,33,80,25


Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R02,37,64,31,69,61,97,5,11,76,57
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12


In [35]:
df[['C04', 'C05']]

Unnamed: 0,C04,C05
R00,35,59
R01,9,41
R02,61,97
R03,58,80
R04,16,5
R05,100,60
R06,62,68
R07,66,25
R08,84,87
R09,21,12


In [36]:
# visualisation of below - for presentation
display.display_html(df.style.apply(highlight, subset=pd.IndexSlice['R02':'R05', 'C04':'C05']))


# segment
df.loc['R02':'R05', 'C04':'C05']

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R00,79,19,21,99,35,59,44,25,75,58
R01,25,39,89,66,9,41,6,69,63,3
R02,37,64,31,69,61,97,5,11,76,57
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12
R06,12,29,18,98,62,68,92,29,74,96
R07,36,32,22,4,66,25,63,51,59,14
R08,55,53,89,13,84,87,74,3,2,64
R09,46,74,36,54,21,12,68,33,80,25


Unnamed: 0,C04,C05
R02,61,97
R03,58,80
R04,16,5
R05,100,60


In [37]:
df.loc['R02':'R05', 'C04':'C05']

Unnamed: 0,C04,C05
R02,61,97
R03,58,80
R04,16,5
R05,100,60


### Excercise

In [38]:
sales_data = pd.read_excel('./data/blooth_sales_data_clean.xlsx')
sales_data.head(5)

Unnamed: 0,name,birthday,customer,orderdate,product,units,unitprice
0,Pasquale,1967-09-02,Electronics Inc,2016-07-17 13:48:03.157,Thriller record,2,13.27
1,India,1968-12-13,Electronics Resource Group,2016-07-06 13:48:03.157,Corolla,26,24458.69
2,Wayne,1992-09-10,East Application Contract Inc,2016-07-22 13:48:03.157,Rubik’s Cube,41,15.79
3,Cori,1986-11-05,Signal Industries,2016-07-23 13:48:03.157,iPhone,16,584.01
4,Chang,1972-04-23,Star Alpha Industries,2016-07-16 13:48:03.157,Harry Potter book,4,25.69


In [39]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   name       1000 non-null   object        
 1   birthday   1000 non-null   datetime64[ns]
 2   customer   1000 non-null   object        
 3   orderdate  1000 non-null   datetime64[ns]
 4   product    1000 non-null   object        
 5   units      1000 non-null   int64         
 6   unitprice  1000 non-null   float64       
dtypes: datetime64[ns](2), float64(1), int64(1), object(3)
memory usage: 54.8+ KB


Select columns two to four (three columns in total)

In [40]:
# Your code here


Select the columns *birthday and name* (together)

In [41]:
# Your code here


Select the rows 2 to 4 (three rows)

In [42]:
# Your code here

Select the rows 55, 77

In [43]:
# Your code here


## Boolean Index

A boolean index is an array of true/false values: [1, 0, 1, 1, 0, 0, 1, …]

! though the index name it's not one of the Pandas Index Types.

In [44]:
df['C04']

R00     35
R01      9
R02     61
R03     58
R04     16
R05    100
R06     62
R07     66
R08     84
R09     21
Name: C04, dtype: int64

In [45]:
df['C04'] > 60

R00    False
R01    False
R02     True
R03    False
R04    False
R05     True
R06     True
R07     True
R08     True
R09    False
Name: C04, dtype: bool

In [46]:
df[df['C04'] > 60]

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R02,37,64,31,69,61,97,5,11,76,57
R05,45,26,73,73,100,60,21,19,95,12
R06,12,29,18,98,62,68,92,29,74,96
R07,36,32,22,4,66,25,63,51,59,14
R08,55,53,89,13,84,87,74,3,2,64


In [47]:
df[(df['C04'] < 60) | (df['C04'] > 80)]  # multiple OR

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R00,79,19,21,99,35,59,44,25,75,58
R01,25,39,89,66,9,41,6,69,63,3
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12
R08,55,53,89,13,84,87,74,3,2,64
R09,46,74,36,54,21,12,68,33,80,25


In [48]:
df[(df['C04'] < 60) & (df['C04'] % 2 == 0)]  # multiple AND

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20


### Excercise

In [49]:
sales_data = pd.read_excel('./data/blooth_sales_data_clean.xlsx')
sales_data.head(5)

Unnamed: 0,name,birthday,customer,orderdate,product,units,unitprice
0,Pasquale,1967-09-02,Electronics Inc,2016-07-17 13:48:03.157,Thriller record,2,13.27
1,India,1968-12-13,Electronics Resource Group,2016-07-06 13:48:03.157,Corolla,26,24458.69
2,Wayne,1992-09-10,East Application Contract Inc,2016-07-22 13:48:03.157,Rubik’s Cube,41,15.79
3,Cori,1986-11-05,Signal Industries,2016-07-23 13:48:03.157,iPhone,16,584.01
4,Chang,1972-04-23,Star Alpha Industries,2016-07-16 13:48:03.157,Harry Potter book,4,25.69


In [50]:
sales_data.info(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   name       1000 non-null   object        
 1   birthday   1000 non-null   datetime64[ns]
 2   customer   1000 non-null   object        
 3   orderdate  1000 non-null   datetime64[ns]
 4   product    1000 non-null   object        
 5   units      1000 non-null   int64         
 6   unitprice  1000 non-null   float64       
dtypes: datetime64[ns](2), float64(1), int64(1), object(3)
memory usage: 54.8+ KB


Find all rows with exactly 50 units

In [51]:
# Your code here


Find all rows with exactly 50 playstations

In [52]:
# Your code here


## filter

Filter by label or index

In [53]:
df.columns

Index(['C00', 'C01', 'C02', 'C03', 'C04', 'C05', 'C06', 'C07', 'C08', 'C09'], dtype='object')

In [54]:
df.filter(like='R0', axis=0)  # , axis=1 per default

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R00,79,19,21,99,35,59,44,25,75,58
R01,25,39,89,66,9,41,6,69,63,3
R02,37,64,31,69,61,97,5,11,76,57
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12
R06,12,29,18,98,62,68,92,29,74,96
R07,36,32,22,4,66,25,63,51,59,14
R08,55,53,89,13,84,87,74,3,2,64
R09,46,74,36,54,21,12,68,33,80,25


In [55]:
df.filter(regex='.0[2-4]', axis=0)

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R02,37,64,31,69,61,97,5,11,76,57
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20


### Transpose with .T

In [56]:
df.iloc[2:3]

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R02,37,64,31,69,61,97,5,11,76,57


In [57]:
df.iloc[2:3].T

Unnamed: 0,R02
C00,37
C01,64
C02,31
C03,69
C04,61
C05,97
C06,5
C07,11
C08,76
C09,57


### Formatting with Styler

In [58]:
df = pd.read_json('./data/sampledf.json')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25


In [59]:
df.style.highlight_min()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25


In [60]:
def odd_or_even(data):
    return [('background-color: green; color:white;' if x%2==0 else 'background-color: orange') 
            for x in data]
df.style.apply(odd_or_even)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25
