<h3>Data Indexing</h3>

In [95]:
# import dependences
import pandas as pd
import numpy as np

In [96]:
# Data Selections in Series
# A Series object acts in many ways like a one-dimensional NumPy array, and in
# many ways like a standard Python dictionary

# Series as a Dictionary
data = pd.Series(
    [0.24, .5, .54, 2.0],
    index = ['a', 'b', 'c', 'd']
)
data

a    0.24
b    0.50
c    0.54
d    2.00
dtype: float64

In [97]:
data['d']

np.float64(2.0)

In [98]:
# Using dictionary-like expressions
'a' in data

True

In [99]:
data.a

np.float64(0.24)

In [100]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [101]:
list(data.items())

[('a', 0.24), ('b', 0.5), ('c', 0.54), ('d', 2.0)]

In [102]:
# Modifying series objects with a dictionary-like syntax
data.d = 90
data

a     0.24
b     0.50
c     0.54
d    90.00
dtype: float64

In [103]:
data.e = 100
data['a'] = .34
data

a     0.34
b     0.50
c     0.54
d    90.00
dtype: float64

In [104]:
# Series as one-Dimensional Array: selection mechanisms such as slices, masking, fancy indexing,
# Slicing
# Final index is included in the slice unlike Python indexing
data['a':'c']

a    0.34
b    0.50
c    0.54
dtype: float64

In [105]:
# data[0:2] # deprecated
data.iloc[0:2]

a    0.34
b    0.50
dtype: float64

In [106]:
# Masking
data[(data > .2) & (data < 1)]

a    0.34
b    0.50
c    0.54
dtype: float64

In [107]:
# Fancy indexing
data[['a', 'c']]

a    0.34
c    0.54
dtype: float64

In [108]:
# Using implicit indexing
data[1:3]

b    0.50
c    0.54
dtype: float64

In [109]:
# Using explicit indexing
data[2]

  data[2]


np.float64(0.54)

In [110]:
# Indexers: loc and iloc
data = pd.Series(
    ['Kampala',
     'Mukono',
     'Jinja'],
    index=[1, 2, 3]
)
data

1    Kampala
2     Mukono
3      Jinja
dtype: object

In [111]:
# loc attribute allows indexing and slicing. Always references the explicit index
# data.loc[1]
# loc gets rows (and/or columns) with particular labels (explicit index)
# iloc gets rows (and/or columns) at integer locations (implicit index), Python-Style index
data.loc[1]

'Kampala'

In [112]:
data.iloc[1]

'Mukono'

In [113]:
# Conclusions
# Explicit is better than implicit.
# loc and iloc are explicit.
# Learn from the Zen of Python
import this

In [114]:
# Data Selection in DataFrames
# A dataframe acts in many ways like a two-dimensional or structured array.
# And in other ways like a dictionary of series structures sharing the same index
hivPrevalence = pd.Series(
    {
    'Central 1': 8.6,
    'Central 2': 7.6,
    'Kampala': 6.9,
    'East-Central': 4.7,
    'Mid-Central': 5.1,
}
)

population = pd.Series(
    {
    'Central 1': 6_904_035,
    'Central 2': 5_485_890,
    'Kampala': 8_000_234,
    'East-Central': 3_000_000,
    'Mid-Central': 4_904_342,
}
)

data = pd.DataFrame({'HIV Prevalence': hivPrevalence,
                     'pop': population
                    })
data

Unnamed: 0,HIV Prevalence,pop
Central 1,8.6,6904035
Central 2,7.6,5485890
Kampala,6.9,8000234
East-Central,4.7,3000000
Mid-Central,5.1,4904342


In [115]:
# Individual Series that make up the DataFrame can be accessed via dictionary-style indexing
data['HIV Prevalence']

Central 1       8.6
Central 2       7.6
Kampala         6.9
East-Central    4.7
Mid-Central     5.1
Name: HIV Prevalence, dtype: float64

In [116]:
# Using attribute-styel access
# This does not work in all cases. Think ...
# NOTE: You should avoid the temptation to try column assignment via attributes.
data.pop is data['pop']

False

In [117]:
# Using dictionary-like syntax to modify data
data['Number of Patients'] = data['pop'] * data['HIV Prevalence']
data

Unnamed: 0,HIV Prevalence,pop,Number of Patients
Central 1,8.6,6904035,59374701.0
Central 2,7.6,5485890,41692764.0
Kampala,6.9,8000234,55201614.6
East-Central,4.7,3000000,14100000.0
Mid-Central,5.1,4904342,25012144.2


In [118]:
# DataFrames as Two-Dimensional Array
# Underlaying data array
data.values

array([[8.60000000e+00, 6.90403500e+06, 5.93747010e+07],
       [7.60000000e+00, 5.48589000e+06, 4.16927640e+07],
       [6.90000000e+00, 8.00023400e+06, 5.52016146e+07],
       [4.70000000e+00, 3.00000000e+06, 1.41000000e+07],
       [5.10000000e+00, 4.90434200e+06, 2.50121442e+07]])

In [119]:
# Accessing a row
data.values[0]

array([8.6000000e+00, 6.9040350e+06, 5.9374701e+07])

In [120]:
# Many array-like operations can be done on the dataframe itself
data.T

Unnamed: 0,Central 1,Central 2,Kampala,East-Central,Mid-Central
HIV Prevalence,8.6,7.6,6.9,4.7,5.1
pop,6904035.0,5485890.0,8000234.0,3000000.0,4904342.0
Number of Patients,59374701.0,41692764.0,55201614.6,14100000.0,25012144.2


In [121]:
# Accessing a column
data['pop']

Central 1       6904035
Central 2       5485890
Kampala         8000234
East-Central    3000000
Mid-Central     4904342
Name: pop, dtype: int64

In [122]:
# Dictionary-like indexing inhibits our abilitity to treat a DataFrame as NumPy array.
data['pop']

Central 1       6904035
Central 2       5485890
Kampala         8000234
East-Central    3000000
Mid-Central     4904342
Name: pop, dtype: int64

In [123]:
data

Unnamed: 0,HIV Prevalence,pop,Number of Patients
Central 1,8.6,6904035,59374701.0
Central 2,7.6,5485890,41692764.0
Kampala,6.9,8000234,55201614.6
East-Central,4.7,3000000,14100000.0
Mid-Central,5.1,4904342,25012144.2


In [124]:
# Using loc and iloc
data.iloc[:3, :1]

Unnamed: 0,HIV Prevalence
Central 1,8.6
Central 2,7.6
Kampala,6.9


In [125]:
data.loc['Central 1': 'East-Central', :'pop']

Unnamed: 0,HIV Prevalence,pop
Central 1,8.6,6904035
Central 2,7.6,5485890
Kampala,6.9,8000234
East-Central,4.7,3000000


In [126]:
# More NumPy-like data access
# data.loc[data['HIV Prevalence'] > 7.5, 'HIV Prevalence'] = 10

In [129]:
# Any of these indexing conventions may also be used to set or modify values
# data.iloc[0, 1] = 500_000
# data

In [130]:
# To build your fluency in Pandas Data Manipulation, spend more time with
# a simple DataFrame and exploring the types of indexing, slicing masking and
# fancy indexing that are allowed by these various approaches

In [131]:
# Additional indexing conventions
# While indexing refers to columns, slicing refers to rows
data['Central 1': 'East-Central']

Unnamed: 0,HIV Prevalence,pop,Number of Patients
Central 1,8.6,500000,59374701.0
Central 2,7.6,5485890,41692764.0
Kampala,6.9,8000234,55201614.6
East-Central,4.7,3000000,14100000.0


In [132]:
data[1:3]

Unnamed: 0,HIV Prevalence,pop,Number of Patients
Central 2,7.6,5485890,41692764.0
Kampala,6.9,8000234,55201614.6


In [135]:
# Masking operations are also interpreted as row-wise
data[data['pop'] > 1_000_000]

Unnamed: 0,HIV Prevalence,pop,Number of Patients
Central 2,7.6,5485890,41692764.0
Kampala,6.9,8000234,55201614.6
East-Central,4.7,3000000,14100000.0
Mid-Central,5.1,4904342,25012144.2
