# Indexing and selecting data

#### Importing libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
import warnings
# Silence every warning for the rest of the notebook so cell output stays clean.
# NOTE(review): this also hides any pandas warnings that later cells may trigger
# (e.g. from attribute assignment or chained assignment) - confirm this is intended.
warnings.filterwarnings('ignore')

#### Data

In [None]:
# Eight consecutive daily timestamps starting 2000-01-01, used as the row index
dates = pd.date_range('1/1/2000', periods=8)

In [None]:
# 8x4 frame of standard-normal random values (unseeded, so values differ per run)
df = pd.DataFrame(np.random.randn(8, 4),
                  index=dates, columns=['A', 'B', 'C', 'D'])
df

In [None]:
# [] on a DataFrame with a single column label returns that column as a Series
s = df['A']

In [None]:
# [] on a Series with an index label returns the scalar stored at that label
s[dates[5]]

In [None]:
df

In [None]:
# Assigning through a list of column labels swaps A and B:
# the right-hand columns are written into B and A in list order
df[['B', 'A']] = df[['A', 'B']]
df

In [None]:
df[['A', 'B']]

In [None]:
 df.loc[:, ['B', 'A']] = df[['A', 'B']]

In [None]:
df[['A', 'B']]

In [None]:
# Passing raw values (to_numpy) bypasses label alignment, so this does swap A and B
df.loc[:, ['B', 'A']] = df[['A', 'B']].to_numpy()

In [None]:
df[['A', 'B']]

#### Attribute access

In [None]:
sa = pd.Series([1, 2, 3], index=list('abc'))

In [None]:
# Copy of df so the attribute-assignment demos below leave df untouched
dfa = df.copy()

In [None]:
# Index labels that are valid identifiers can be read as attributes
sa.b

In [None]:
dfa.A

In [None]:
# Attribute assignment updates the existing element labelled 'a'
sa.a = 5

In [None]:
sa

In [None]:
dfa.A = list(range(len(dfa.index)))  # ok if A already exists
dfa

In [None]:
dfa['A'] = list(range(len(dfa.index)))  # use this form to create a new column

In [None]:
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
x

In [None]:
# A dict can be assigned to a row through iloc
x.iloc[1] = {'x': 9, 'y': 99}
x

In [None]:
# NOTE: rebinds df; the 8x4 frame built earlier is no longer reachable via this name
df = pd.DataFrame({'one': [1., 2., 3.]})
df

In [None]:
# Pitfall: 'two' is not an existing column, so this sets a plain Python attribute
# on the object instead of creating a column; use df['two'] = ... to add one
df.two = [4, 5, 6]
df

#### Slicing ranges

In [None]:
# s was taken from the original 8-row frame before df was rebound;
# an integer slice inside [] selects by position
s[:5]

In [None]:
# Every second element
s[::2]

In [None]:
# Reversed order
s[::-1]

In [None]:
s2 = s.copy()

In [None]:
# Slice assignment on the copy
s2[:5] = 0
s2

In [None]:
# df is now the three-row 'one' frame; a slice inside [] selects rows
df[:3]

In [None]:
df[::-1]

#### Selection by label

In [None]:
dfl = pd.DataFrame(np.random.randn(5, 4),
                   columns=list('ABCD'),
                   index=pd.date_range('20130101', periods=5))


In [None]:
dfl

In [None]:
# Label slices with .loc include both endpoints; the strings are matched
# against the DatetimeIndex
dfl.loc['20130102':'20130104']

In [None]:
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))
s1

In [None]:
# Open-ended label slice: from 'c' through the last label
s1.loc['c':]

In [None]:
# A single label returns a scalar
s1.loc['b']

In [None]:
# .loc can also be used for assignment
s1.loc['c':] = 0
s1

In [None]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list('abcdef'),
                   columns=list('ABCD'))
df1

In [None]:
# List of row labels, all columns
df1.loc[['a', 'b', 'd'], :]

In [None]:
# Row and column label slices; both 'd' and 'C' are included
df1.loc['d':, 'A':'C']

In [None]:
# A single row label returns that row as a Series
df1.loc['a']

In [None]:
# Boolean Series indexed by column label
df1.loc['a'] > 0

In [None]:
# Use a boolean Series over the columns to pick columns
df1.loc[:, df1.loc['a'] > 0.05]

In [None]:
# Nullable boolean mask containing one missing value
mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean")
mask

In [None]:
# NA in a boolean mask is treated as False when indexing
df1[mask]

In [None]:
# Row label plus column label returns a single value
df1.loc['a', 'A']

#### Slicing with labels

In [None]:
# Integer labels in unsorted order
s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])

In [None]:
# Both labels exist, so everything located between 3 and 5 (inclusive) is returned
s.loc[3:5]

In [None]:
s.sort_index()

In [None]:
# On a sorted index the bounds need not be present: labels ranking
# between 1 and 6 are selected
s.sort_index().loc[1:6]

In [None]:
# Same idea, but label 2 now appears twice
s = pd.Series(list('abcdef'), index=[0, 3, 2, 5, 4, 2])

In [None]:
# 3 and 5 are each unique, so this slice still works despite the duplicate label
s.loc[3:5]

#### Selection by position

In [None]:
# Even integer labels, so label and position differ
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
s1

In [None]:
# iloc is purely positional: first three elements; the stop bound is excluded
s1.iloc[:3]

In [None]:
s1.iloc[:3] = 0
s1

In [None]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list(range(0, 12, 2)),
                   columns=list(range(0, 8, 2)))
df1

In [None]:
df1.iloc[:3]

In [None]:
# Row positions 1-4 and column positions 2-3
df1.iloc[1:5, 2:4]

In [None]:
# Lists of integer positions
df1.iloc[[1, 3, 5], [1, 3]]

In [None]:
df1.iloc[1:3, :]

In [None]:
df1.iloc[:, 1:3]

In [None]:
# Two scalar positions return a single value
df1.iloc[1, 1]

In [None]:
# One scalar position returns that row as a Series
df1.iloc[1]

In [None]:
# Plain Python list, to compare out-of-range slicing with pandas
x = list('abcdef')
x

In [None]:
# Slices that run past the end are truncated
x[4:10]

In [None]:
# A slice entirely past the end is empty
x[8:10]

In [None]:
s = pd.Series(x)
s

In [None]:
# iloc slices follow the same out-of-range rules as the list above
s.iloc[4:10]

In [None]:
s.iloc[8:10]

In [None]:
dfl = pd.DataFrame(np.random.randn(5, 2), columns=list('AB'))
dfl

In [None]:
# Column slice entirely out of range: no columns are selected
dfl.iloc[:, 2:3]

In [None]:
dfl.iloc[:, 1:3]

In [None]:
dfl.iloc[4:6]

#### Selection by callable

In [None]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list('abcdef'),
                   columns=list('ABCD'))
df1

In [None]:
# The callable receives the frame being indexed and returns a boolean row mask
df1.loc[lambda df: df['A'] > 0, :]

In [None]:
# The callable may return any valid indexer, here a list of column positions
df1.iloc[:, lambda df: [0, 1]]

In [None]:
# With [], the callable returns the first column label
df1[lambda df: df.columns[0]]

In [None]:
# Works on a Series as well
df1['A'].loc[lambda s: s > 0]

In [None]:
bb = pd.read_html("https://github.com/pandas-dev/pandas/blob/master/doc/data/baseball.csv")

In [None]:
bb = pd.DataFrame(np.array(bb).reshape(100,24))

In [None]:
bb = bb.dropna(axis='columns')

In [None]:
bb.columns = ["id", "player","year", "stint", "team", "lg",	"g", 	
                    "ab", "r", "h", "X2b", "X3b", "hr","rbi","sb","cs", "bb", 
                    "so", "ibb", "hbp", "sh", "sf", "gidp"]

In [None]:
# Overwrite the id column with the row index
bb['id'] = bb.index
bb

In [None]:
# Callable row filter chained straight into a group-wise sum
bb.loc[lambda df: df['r'] > 6, :].groupby(['year', 'team']).sum()

#### Combining positional and label-based indexing

In [None]:
dfd = pd.DataFrame({'A': [1, 2, 3],
                    'B': [4, 5, 6]},
                   index=list('abc'))
dfd

In [None]:
# Positions -> labels: translate row positions 0 and 2 into labels for .loc
dfd.loc[dfd.index[[0, 2]], 'A']

In [None]:
# Labels -> positions: get_loc gives the integer position of column 'A' for .iloc
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]

In [None]:
# get_indexer does the same for several column labels at once
dfd.iloc[[0, 2], dfd.columns.get_indexer(['A', 'B'])]

#### Indexing with list with missing labels is deprecated

In [None]:
s = pd.Series([1, 2, 3])
s

In [None]:
# Every requested label exists, so list-based .loc selection succeeds
s.loc[[1, 2]]

#### Reindexing

In [None]:
# reindex accepts labels that are absent (3 here) and fills them with NaN
s.reindex([1, 2, 3])

In [None]:
labels = [1, 2, 3]

In [None]:
# Keep only the labels that actually exist before selecting
s.loc[s.index.intersection(labels)]

In [None]:
# Index with a duplicated label ('a')
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])

In [None]:
labels = ['c', 'd']

In [None]:
# Intersect first so only existing, requested labels remain, then reindex to
# the requested labels; 'd' is absent and becomes NaN
s.loc[s.index.intersection(labels)].reindex(labels)

#### Selecting random samples

In [None]:
s = pd.Series([0, 1, 2, 3, 4, 5])

In [None]:
# With no arguments, sample returns a single randomly chosen row
s.sample()

In [None]:
# n gives the number of rows
s.sample(n=3)

In [None]:
# frac gives the fraction of rows instead
s.sample(frac=0.5)

In [None]:
s = pd.Series([0, 1, 2, 3, 4, 5])

In [None]:
# Without replacement each row is drawn at most once
s.sample(n=6, replace=False)

In [None]:
# With replacement the same row may be drawn repeatedly
s.sample(n=6, replace=True)

In [None]:
s = pd.Series([0, 1, 2, 3, 4, 5])

In [None]:
# One weight per row; rows with weight 0 are never drawn
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]

In [None]:
s.sample(n=3, weights=example_weights)

In [None]:
# Weights that do not sum to 1 are re-normalized by sample
example_weights2 = [0.5, 0, 0, 0, 0, 0]

In [None]:
s.sample(n=1, weights=example_weights2)

In [None]:
df2 = pd.DataFrame({'col1': [9, 8, 7, 6],
                    'weight_column': [0.5, 0.4, 0.1, 0]})

In [None]:
# A column name may be given as the weights
df2.sample(n=3, weights='weight_column')

In [None]:
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})

In [None]:
# axis=1 samples columns instead of rows
df3.sample(n=1, axis=1)

In [None]:
df4 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})

In [None]:
# A fixed random_state makes the draw reproducible ...
df4.sample(n=2, random_state=2)

In [None]:
# ... so repeating the call returns the same rows
df4.sample(n=2, random_state=2)

#### Setting with enlargement

In [None]:
se = pd.Series([1, 2, 3])
se

In [None]:
# Assigning to a label that does not exist appends it
se[5] = 5.
se

In [None]:
dfi = pd.DataFrame(np.arange(6).reshape(3, 2),
                   columns=['A', 'B'])
dfi

In [None]:
# .loc with a new column label adds the column
dfi.loc[:, 'C'] = dfi.loc[:, 'A']
dfi

In [None]:
# .loc with a new row label adds a row; the scalar fills every column
dfi.loc[3] = 5
dfi

#### Fast scalar value getting and setting

In [None]:
# iat: positional scalar access. s is the six-element Series from the
# sampling section, so position 5 is its last element
s.iat[5]

In [None]:
# Set the value at row position 3, column position 0
dfi.iat[3, 0] = 7
dfi

#### Boolean indexing

In [None]:
s = pd.Series(range(-3, 4))
s

In [None]:
s[s > 0]

In [None]:
# Combine conditions with | and &; each comparison needs its own parentheses
s[(s < -1) | (s > 0.5)]

In [None]:
# ~ negates the mask
s[~(s < 0)]

In [None]:
dfi[dfi['A'] > 0]

In [None]:
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                    'c': np.random.randn(7)})
df2

In [None]:
# Boolean Series: True where column 'a' starts with 't'
criterion = df2['a'].map(lambda x: x.startswith('t'))

In [None]:
df2[criterion]

In [None]:
# Same selection using a plain list of booleans
df2[[x.startswith('t') for x in df2['a']]]

In [None]:
df2[criterion & (df2['b'] == 'x')]

In [None]:
# Boolean row mask combined with a column label slice
df2.loc[criterion & (df2['b'] == 'x'), 'b':'c']

#### Indexing with isin

In [None]:
# Values 0..4 with the index labels in reverse order (4..0)
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s

In [None]:
# Element-wise membership test on the values
s.isin([2, 4, 6])

In [None]:
s[s.isin([2, 4, 6])]

In [None]:
# Same test applied to the index labels instead
s[s.index.isin([2, 4, 6])]

In [None]:
# Unlike the isin filter, reindex keeps the missing label 6 (as NaN)
s.reindex([2, 4, 6])

In [None]:
s_mi = pd.Series(np.arange(6),
                 index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi

In [None]:
# Match whole index tuples
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]

In [None]:
# Match against a single level of the MultiIndex
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

In [None]:
df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
                   'ids2': ['a', 'n', 'c', 'n']})
df

In [None]:
values = ['a', 'b', 1, 3]

In [None]:
# A list is checked against every column
df.isin(values)

In [None]:
# A dict gives a separate list of values per column
values = {'ids': ['a', 'b'], 'vals': [1, 3]}

In [None]:
df.isin(values)

In [None]:
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}

In [None]:
# True only for rows where every column matched
row_mask = df.isin(values).all(1)

In [None]:
df[row_mask]

#### The where() Method and Masking

In [None]:
# s is the reversed-index Series defined at the start of the isin section
s[s > 0]

In [None]:
# where keeps the original shape; entries failing the condition become NaN
s.where(s > 0)

In [None]:
s2 = s.copy()

In [None]:
# Boolean-mask assignment on the copy
s2[s2 < 0] = 0

In [None]:
df3 = pd.DataFrame({'A': [1, 2, 3],
                    'B': [4, 5, 6],
                    'C': [7, 8, 9]})
df3

In [None]:
# Both the condition and the replacement may be callables
df3.where(lambda x: x > 4, lambda x: x + 10)

#### Mask

In [None]:
# mask is the inverse of where: entries where the condition is True become NaN
s.mask(s >= 0)

#### Setting with enlargement conditionally using numpy()

In [None]:
df = pd.DataFrame({'col1': list('ABBC'), 'col2': list('ZZXY')})
df

In [None]:
# New column: 'green' where col2 is 'Z', otherwise 'red'
df['color'] = np.where(df['col2'] == 'Z', 'green', 'red')
df

In [None]:
# Conditions are checked in order; np.select uses the first one that is True
conditions = [
              (df['col2'] == 'Z') & (df['col1'] == 'A'), 
              (df['col2'] == 'Z') & (df['col1'] == 'B'),   
              (df['col1'] == 'B')          
]
conditions

In [None]:
# One choice per condition, in the same order
choices = ['yellow', 'blue', 'purple']

In [None]:
# Rows matching no condition get the default
df['color'] = np.select(conditions, choices, default='black')
df

#### The query() Method

In [None]:
n = 10

In [None]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df

In [None]:
# Plain boolean indexing ...
df[(df['a'] < df['b']) & (df['b'] < df['c'])]

In [None]:
# ... and the equivalent query() expression, where column names are used directly
df.query('(a < b) & (b < c)')

In [None]:
# randint expects an integer upper bound, so use floor division (n / 2 is a float)
df = pd.DataFrame(np.random.randint(n // 2, size=(n, 2)), columns=list('bc'))

In [None]:
# Naming the index lets query() refer to it by that name
df.index.name = 'a'
df

In [None]:
# 'a' is the index here; 'and' can be used in place of '&'
df.query('a < b and b < c')

In [None]:
df = pd.DataFrame(np.random.randint(n, size=(n, 2)), columns=list('bc'))
df

In [None]:
# An unnamed index is available in query() as 'index'; comparisons can be chained
df.query('index < b < c')

In [None]:
df.query('index > 2')

#### MultiIndex query() Syntax
https://pandas.pydata.org/docs/user_guide/indexing.html#multiindex-query-syntax


In [None]:
n = 10

In [None]:
colors = np.random.choice(['red', 'green'], size=n)
colors

In [None]:
foods = np.random.choice(['eggs', 'ham'], size=n)
foods

In [None]:
# Two-level index with named levels
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])

In [None]:
df = pd.DataFrame(np.random.randn(n, 2), index=index)
df

In [None]:
# Named index levels can be used in query() like columns
df.query('color == "red"')

In [None]:
# Remove the level names
df.index.names = [None, None]
df

In [None]:
# Unnamed levels are addressed as ilevel_<n>, n being the level number
df.query('ilevel_0 == "red"')

#### query() Use Cases

In [None]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df

In [None]:
# Second frame with the same columns but a different number of rows
df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns)
df2

In [None]:
# One expression string, reusable on any frame that has columns a and c
expr = '0.0 <= a <= c <= 0.5'

In [None]:
# Apply the same expression to both frames. A list comprehension evaluates the
# queries immediately; a bare map() would only display a lazy map object.
[frame.query(expr) for frame in (df, df2)]

#### The in and not in operators

In [None]:
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                   'c': np.random.randint(5, size=12),
                   'd': np.random.randint(9, size=12)})
df

In [None]:
# query(): rows whose 'a' value occurs somewhere in column 'b'
df.query('a in b')

In [None]:
# Same selection with isin
df[df['a'].isin(df['b'])]

In [None]:
df.query('a not in b')

In [None]:
df[~df['a'].isin(df['b'])]

In [None]:
# in / not in can be combined with other conditions
df.query('a in b and c < d')

In [None]:
# Plain-pandas equivalent of 'a in b and c < d': the values of 'a' are tested
# for membership in column 'b' (not the other way round)
df[df['a'].isin(df['b']) & (df['c'] < df['d'])]

#### Special use of the == operator with list objects

In [None]:
# Comparing a column with a list via == behaves like 'in'
df.query('b == ["a", "b", "c"]')

In [None]:
df[df['b'].isin(["a", "b", "c"])]

In [None]:
df.query('c == [1, 2]')

In [None]:
# != with a list behaves like 'not in'
df.query('c != [1, 2]')

In [None]:
df.query('[1, 2] in c')

In [None]:
df.query('[1, 2] not in c')

In [None]:
# Plain-pandas form of the 'c == [1, 2]' query
df[df['c'].isin([1, 2])]

#### Boolean operators

In [None]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df

In [None]:
# Random boolean column
df['bools'] = np.random.rand(len(df)) > 0.5

In [None]:
# Negation with ~ ...
df.query('~bools')

In [None]:
# ... or with the word 'not'
df.query('not bools')

In [None]:
# Element-wise comparison of the query result with plain boolean indexing
df.query('not bools') == df[~df['bools']]

In [None]:
shorter = df.query('a < b < c and (not bools) or bools > 2')
shorter

In [None]:
# The same filter written with plain boolean indexing
longer = df[(df['a'] < df['b'])
            & (df['b'] < df['c'])
            & (~df['bools'])
            | (df['bools'] > 2)]
longer

In [None]:
shorter == longer

#### Duplicate data

In [None]:
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
                    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
                    'c': np.random.randn(7)})
df2

In [None]:
# Default keep='first': every occurrence except the first is flagged
df2.duplicated('a')

In [None]:
# keep='last': every occurrence except the last is flagged
df2.duplicated('a', keep='last')

In [None]:
# keep=False: all occurrences of a repeated value are flagged
df2.duplicated('a', keep=False)

In [None]:
# drop_duplicates removes the rows that duplicated would flag
df2.drop_duplicates('a')

In [None]:
df2.drop_duplicates('a', keep='last')

In [None]:
df2.drop_duplicates('a', keep=False)

In [None]:
# A list of columns treats the combination of values as the key
df2.duplicated(['a', 'b'])

In [None]:
# Frame with repeated index labels
df3 = pd.DataFrame({'a': np.arange(6),
                    'b': np.random.randn(6)},
                   index=['a', 'a', 'b', 'c', 'b', 'a'])
df3

In [None]:
# Index.duplicated works on the labels
df3.index.duplicated()

In [None]:
# Negate the flags to keep the first row of each label
df3[~df3.index.duplicated()]

In [None]:
df3[~df3.index.duplicated(keep='last')]

In [None]:
df3[~df3.index.duplicated(keep=False)]

#### Looking up values by index/column labels

In [None]:
# 'col' names, per row, which of the columns A / B to look the value up in
df = pd.DataFrame({'col': ["A", "A", "B", "B"],
                   'A': [80, 23, np.nan, 22],
                   'B': [80, 55, 76, 67]})
df

In [None]:
# Long format with 'col' kept as the identifier column
melt = df.melt('col')
melt

In [None]:
# Keep the value whose variable name equals the row's 'col' entry
melt = melt.loc[melt['col'] == melt['variable'], 'value']
melt

In [None]:
# Discard the melted index and renumber from 0
melt.reset_index(drop=True)

#### Index objects

In [None]:
index = pd.Index(['e', 'd', 'a', 'b'])
index

In [None]:
# Same labels with the dtype given explicitly
pd.Index(['e', 'd', 'a', 'b'], dtype='object')

In [None]:
# Membership test on the labels
'd' in index

In [None]:
# An Index can carry a name
index = pd.Index(['e', 'd', 'a', 'b'], name='something')

In [None]:
index.name

In [None]:
index = pd.Index(list(range(5)), name='rows')

In [None]:
columns = pd.Index(['A', 'B', 'C'], name='cols')

In [None]:
# Named Index objects passed as the row and column axes
df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns)
df

In [None]:
df['A']

#### Setting metadata

In [None]:
ind = pd.Index([1, 2, 3])

In [None]:
# rename returns a new Index; ind itself is unchanged (see the next cell)
ind.rename("apple")

In [None]:
ind

In [None]:
# inplace=True changes the name on ind itself
ind.set_names(["apple"], inplace=True)

In [None]:
# The name can also be set through the attribute
ind.name = "bob"

In [None]:
# NOTE(review): repeats the previous cell; the assignment has no further effect
ind.name = "bob"

In [None]:
ind

In [None]:
index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
index

In [None]:
index.levels[1]

In [None]:
# Returns a new MultiIndex with level 1 replaced; index is not modified
index.set_levels(["a", "b"], level=1)

#### Set operations on Index objects

In [None]:
a = pd.Index(['c', 'b', 'a'])

In [None]:
b = pd.Index(['c', 'e', 'd'])

In [None]:
# Labels in a that are not in b
a.difference(b)

In [None]:
idx1 = pd.Index([1, 2, 3, 4])

In [None]:
idx2 = pd.Index([2, 3, 4, 5])

In [None]:
# Labels found in exactly one of the two indexes
idx1.symmetric_difference(idx2)

In [None]:
idx1 = pd.Index([0, 1, 2])

In [None]:
idx2 = pd.Index([0.5, 1.5])

In [None]:
# Union of an integer index with a float index
idx1.union(idx2)

#### Missing values

In [None]:
idx1 = pd.Index([1, np.nan, 3, 4])
idx1

In [None]:
# Replace the missing label with 2
idx1.fillna(2)

In [None]:
idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'),
                         pd.NaT,
                         pd.Timestamp('2011-01-03')])
idx2

In [None]:
# For a DatetimeIndex the fill value is a Timestamp
idx2.fillna(pd.Timestamp('2011-01-02'))

#### Returning a view versus a copy

In [None]:
# 4x4 frame of letters with two-level column labels
dfmi = pd.DataFrame([list('abcd'),
                     list('efgh'),
                     list('ijkl'),
                     list('mnop')],
                    columns=pd.MultiIndex.from_product([['one', 'two'],
                                                        ['first', 'second']]))
dfmi

In [None]:
# Chained indexing: two separate [] operations, one after the other
dfmi['one']['second']

In [None]:
# The same selection as a single .loc call with a tuple key
dfmi.loc[:, ('one', 'second')]

#### Evaluation order matters

In [None]:
dfb = pd.DataFrame({'a': ['one', 'one', 'two',
                          'three', 'two', 'one', 'six'],
                    'c': np.arange(7)})
dfb

In [None]:
# Chained assignment: dfb['c'] is evaluated first, then the mask is applied to
# that intermediate object.
# NOTE(review): pandas documents that this pattern may operate on a copy (and
# normally warns about it); the single-call form dfb.loc[mask, 'c'] = 42 is preferred
dfb['c'][dfb['a'].str.startswith('o')] = 42
dfb

In [None]:
dfc = pd.DataFrame({'a': ['one', 'one', 'two',
                          'three', 'two', 'one', 'six'],
                    'c': np.arange(7)})
dfc

In [None]:
# Work on a copy so dfc stays available as a clean starting point
dfd = dfc.copy()

In [None]:
mask = dfd['a'].str.startswith('o')

In [None]:
# Single .loc call with row mask and column label
dfd.loc[mask, 'c'] = 42
dfd

In [None]:
dfd = dfc.copy()

In [None]:
# Single-cell assignment through .loc
dfd.loc[2, 'a'] = 11
dfd

In [None]:
dfd = dfc.copy()

In [None]:
# Chained form of the previous assignment (same caveat as the dfb cell above)
dfd['a'][2] = 111
dfd