In [183]:
import pandas as pd
import numpy as np

# Data Indexing and Selection

## Part 1: Data Selection in Series

### `Series` as a Dictionary
A `Series` maps an index (keys) to values.

In [184]:
# Build the demo Series from parallel lists of labels and values
s_labels = ['a', 'b', 'c', 'd']
s_values = [0.25, 0.5, 0.75, 1.0]
s = pd.Series(s_values, index=s_labels)
print("Original Series:\n", s)

Original Series:
 a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64


In [185]:
# Dictionary-style lookup: fetch one value through its index label
value_at_b = s['b']
print("\ns['b']:", value_at_b)


s['b']: 0.5


In [186]:
# `in` tests membership against the index labels, like dict keys
has_label_a = 'a' in s
print("'a' in s:", has_label_a)

'a' in s: True


In [187]:
# .get() does not raise for a missing label: it yields None, or the
# fallback supplied as the second argument
missing_value = s.get('f')
missing_with_default = s.get('f', 0)
print("s.get('f'):", missing_value)
print("s.get('f', 0):", missing_with_default)

s.get('f'): None
s.get('f', 0): 0


In [188]:
# Dict-like views: keys() is the index, items() yields (label, value) pairs
index_keys = s.keys()
label_value_pairs = list(s.items())
print("Index (Keys):\n", index_keys)
print("Items (key-value pairs):\n", label_value_pairs)

Index (Keys):
 Index(['a', 'b', 'c', 'd'], dtype='object')
Items (key-value pairs):
 [('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]


In [189]:
# Assigning to a label that is not in the index yet extends the Series
# (note: this mutates `s`, so every later cell sees five entries)
new_label, new_value = 'e', 1.25
s[new_label] = new_value
print("Series after adding 'e':\n", s)

Series after adding 'e':
 a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64


### `Series` as a 1D Array
A `Series` also supports NumPy-style indexing: slicing, masking, and fancy indexing.

In [190]:
# Slice by label: the end label 'c' is part of the result
label_slice = s['a':'c']
print("Explicit slice s['a':'c']:\n", label_slice)

Explicit slice s['a':'c']:
 a    0.25
b    0.50
c    0.75
dtype: float64


In [191]:
# Slice by integer position: Python-style, so the stop position is excluded
positional_slice = s[0:2]
print("Implicit slice s[0:2]:\n", positional_slice)

Implicit slice s[0:2]:
 a    0.25
b    0.50
dtype: float64


In [224]:
# Boolean mask: keep only the entries strictly between 0.3 and 0.8
between_bounds = (s > 0.3) & (s < 0.8)
print("Masking (s > 0.3) & (s < 0.8):\n", s[between_bounds])

Masking (s > 0.3) & (s < 0.8):
 b    0.50
c    0.75
dtype: float64


In [193]:
# Fancy indexing: a list of labels picks out exactly those entries
wanted_labels = ['a', 'e']
print("Fancy indexing s[['a', 'e']]:\n", s[wanted_labels])

Fancy indexing s[['a', 'e']]:
 a    0.25
e    1.25
dtype: float64


### Indexers: `loc` and `iloc`
Use `loc` for label-based indexing and `iloc` for integer-based indexing to avoid ambiguity.

In [194]:
# Integer labels (1, 3, 5) that deliberately differ from the positions (0, 1, 2),
# so label-based and position-based access give different answers
s_int = pd.Series(data=['a', 'b', 'c'], index=[1, 3, 5])
print("Series with integer index:\n", s_int)

Series with integer index:
 1    a
3    b
5    c
dtype: object


In [195]:
# A scalar inside plain [] is matched against the index labels: 1 is a label here
item_by_label = s_int[1]
print("\ns_int[1]:", item_by_label)


s_int[1]: a


In [196]:
# A slice inside plain [] is positional instead: it takes positions 1 and 2
slice_by_position = s_int[1:3]
print("\ns_int[1:3]:\n", slice_by_position)


s_int[1:3]:
 3    b
5    c
dtype: object


#### Using `loc` (Label-based)

In [197]:
# .loc interprets its argument as an index label
loc_item = s_int.loc[1]
print("s_int.loc[1]:", loc_item)

s_int.loc[1]: a


In [198]:
# Label slice through .loc: both endpoint labels (1 and 3) are included
loc_slice = s_int.loc[1:3]
print("\ns_int.loc[1:3]:\n", loc_slice)


s_int.loc[1:3]:
 1    a
3    b
dtype: object


In [199]:
# Overwrite the entry whose label is 1 (mutates `s_int` for all later cells)
label_to_update = 1
s_int.loc[label_to_update] = 'Z'
print("\nAfter s_int.loc[1] = 'Z':\n", s_int)


After s_int.loc[1] = 'Z':
 1    Z
3    b
5    c
dtype: object


#### Using `iloc` (Integer-position based)

In [200]:
# .iloc interprets its argument as a 0-based position, so 1 is the second entry
iloc_item = s_int.iloc[1]
print("s_int.iloc[1]:", iloc_item)

s_int.iloc[1]: b


In [201]:
# Positional slice through .iloc: the stop position is excluded
iloc_slice = s_int.iloc[1:3]
print("\ns_int.iloc[1:3]:\n", iloc_slice)


s_int.iloc[1:3]:
 3    b
5    c
dtype: object


In [202]:
# Overwrite the entry at position 2, i.e. the third one (mutates `s_int`)
position_to_update = 2
s_int.iloc[position_to_update] = 'Y'
print("\nAfter s_int.iloc[2] = 'Y':\n", s_int)


After s_int.iloc[2] = 'Y':
 1    Z
3    b
5    Y
dtype: object


## Part 2: Data Selection in DataFrame

In [203]:
# Per-state area and population; the dict keys become the index labels
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})

# Align the two Series on their shared index, then derive density = pop / area
df = pd.DataFrame({'area': area, 'pop': pop}).assign(
    density=lambda frame: frame['pop'] / frame['area']
)
print("Original DataFrame:\n", df)

Original DataFrame:
               area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


### `DataFrame` as a Dictionary
You can access, modify, and add columns like you would with a dictionary.

In [204]:
# A single column name in [] returns that column as a Series
area_column = df['area']
print("df['area']:\n", area_column)

df['area']:
 California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [205]:
# A list of column names in [] returns a DataFrame with just those columns
chosen_columns = ['area', 'density']
print("\ndf[['area', 'density']]:\n", df[chosen_columns])


df[['area', 'density']]:
               area     density
California  423967   90.413926
Texas       695662   38.018740
New York    141297  139.076746
Florida     170312  114.806121
Illinois    149995   85.883763


In [206]:
# Attribute-style column access. It only works when the column name is a valid
# Python identifier AND does not clash with an existing DataFrame attribute:
# `df.pop`, for instance, is the DataFrame.pop method, not the 'pop' column.
density_column = df.density
print("\ndf.density:\n", density_column)


df.density:
 California     90.413926
Texas          38.018740
New York      139.076746
Florida       114.806121
Illinois       85.883763
Name: density, dtype: float64


In [207]:
# Assigning to a column name that does not exist yet appends a new column.
# The flat per-person figure is purely illustrative, not real GDP data.
assumed_gdp_per_person = 50000
df['gdp'] = df['pop'] * assumed_gdp_per_person
print("\nDataFrame with new 'gdp' column:\n", df)


DataFrame with new 'gdp' column:
               area       pop     density            gdp
California  423967  38332521   90.413926  1916626050000
Texas       695662  26448193   38.018740  1322409650000
New York    141297  19651127  139.076746   982556350000
Florida     170312  19552860  114.806121   977643000000
Illinois    149995  12882135   85.883763   644106750000


In [None]:
# Get the underlying NumPy array.
# The columns have mixed dtypes (int64 and float64, see df.dtypes below), so the
# single 2D array is promoted to a common float type.
# `df.to_numpy()` is the spelling recommended by the pandas docs for new code.
print("df.values:\n", df.values)
# Per-column dtypes of the frame itself, for comparison with the promoted array
print(df.dtypes)

df.values:
 [[4.23967000e+05 3.83325210e+07 9.50000000e+01 1.91662605e+12]
 [6.95662000e+05 2.64481930e+07 3.80187404e+01 1.32240965e+12]
 [1.41297000e+05 1.96511270e+07 1.39076746e+02 9.82556350e+11]
 [1.70312000e+05 1.95528600e+07 1.14806121e+02 9.77643000e+11]
 [1.49995000e+05 1.28821350e+07 8.58837628e+01 6.44106750e+11]]
area         int64
pop          int64
density    float64
gdp          int64
dtype: object


In [209]:
# Swap rows and columns: states become the columns, measures become the rows
transposed = df.T
print("\ndf.T:\n", transposed)


df.T:
            California         Texas      New York       Florida      Illinois
area     4.239670e+05  6.956620e+05  1.412970e+05  1.703120e+05  1.499950e+05
pop      3.833252e+07  2.644819e+07  1.965113e+07  1.955286e+07  1.288214e+07
density  9.041393e+01  3.801874e+01  1.390767e+02  1.148061e+02  8.588376e+01
gdp      1.916626e+12  1.322410e+12  9.825564e+11  9.776430e+11  6.441068e+11


### `DataFrame` as a 2D Array
Use `loc` and `iloc` for array-style indexing.

#### Using `iloc` (Integer-position based)

In [211]:
# Positional 2D slice: rows 0-2 and columns 0-1 (stop positions excluded)
top_left_block = df.iloc[:3, :2]
print("df.iloc[:3, :2]:\n", top_left_block)

df.iloc[:3, :2]:
               area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127


In [212]:
# A single position returns that row as a Series named after its index label
second_row = df.iloc[1]
print("\nSingle row (df.iloc[1]):\n", second_row)


Single row (df.iloc[1]):
 area       6.956620e+05
pop        2.644819e+07
density    3.801874e+01
gdp        1.322410e+12
Name: Texas, dtype: float64


In [213]:
# Row position and column position together return one scalar
first_cell = df.iloc[0, 0]
print("\nSingle value (df.iloc[0, 0]):", first_cell)


Single value (df.iloc[0, 0]): 423967


#### Using `loc` (Label-based)

In [214]:
# Label-based 2D slice: the end labels 'Illinois' and 'pop' are both included
labelled_block = df.loc[:'Illinois', :'pop']
print("df.loc[:'Illinois', :'pop']:\n", labelled_block)

df.loc[:'Illinois', :'pop']:
               area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135


In [215]:
# A single row label returns that row as a Series
texas_row = df.loc['Texas']
print("\nSingle row (df.loc['Texas']):\n", texas_row)


Single row (df.loc['Texas']):
 area       6.956620e+05
pop        2.644819e+07
density    3.801874e+01
gdp        1.322410e+12
Name: Texas, dtype: float64


In [216]:
# Row label and column label together return one scalar
florida_density = df.loc['Florida', 'density']
print("\nSingle value (df.loc['Florida', 'density']):", florida_density)


Single value (df.loc['Florida', 'density']): 114.80612053173


In [217]:
# Overwrite one cell addressed by (row label, column label).
# This mutates `df` in place, so every later cell sees 95.0 for California.
row_label, column_label = 'California', 'density'
df.loc[row_label, column_label] = 95.0
print("\nAfter setting density for California:\n", df.head(1))


After setting density for California:
               area       pop  density            gdp
California  423967  38332521     95.0  1916626050000


### Combining Indexing Methods

In [218]:
# .loc accepts a boolean row mask together with a list of column labels
is_dense = df.density > 100
shown_columns = ['pop', 'density']
print("df.loc[df.density > 100, ['pop', 'density']]:\n", df.loc[is_dense, shown_columns])

df.loc[df.density > 100, ['pop', 'density']]:
                pop     density
New York  19651127  139.076746
Florida   19552860  114.806121


In [219]:
# Boolean row mask combined with a single column label: because only one
# column is named (not a list), the result is a Series rather than a DataFrame.
# Note: the old hybrid `.ix` indexer, which mixed labels and integer positions,
# was removed in pandas 1.0. Use .loc (labels/masks) or .iloc (positions).
dense_states = df.density > 100
print("\ndf.loc[df.density > 100, 'pop']:\n", df.loc[dense_states, 'pop'])


df.loc[df.density > 100, 'pop']:
 New York    19651127
Florida     19552860
Name: pop, dtype: int64


### Additional Indexing Conventions

In [231]:
# Show the full frame first for reference
print(df)
# A slice in plain [] selects ROWS (by label here, end label included),
# whereas a single name in [] selects a column
rows_florida_to_illinois = df['Florida':'Illinois']
print("\nDirect slice df['Florida':'Illinois']:\n", rows_florida_to_illinois)

              area       pop     density            gdp
California  423967  38332521   95.000000  1916626050000
Texas       695662  26448193   38.018740  1322409650000
New York    141297  19651127  139.076746   982556350000
Florida     170312  19552860  114.806121   977643000000
Illinois    149995  12882135   85.883763   644106750000

Direct slice df['Florida':'Illinois']:
             area       pop     density           gdp
Florida   170312  19552860  114.806121  977643000000
Illinois  149995  12882135   85.883763  644106750000


In [228]:
# Show the full frame first for reference
print(df)
# An integer slice in plain [] also selects rows, by position (stop excluded)
rows_one_and_two = df[1:3]
print("\nDirect integer slice df[1:3]:\n", rows_one_and_two)

              area       pop     density            gdp
California  423967  38332521   95.000000  1916626050000
Texas       695662  26448193   38.018740  1322409650000
New York    141297  19651127  139.076746   982556350000
Florida     170312  19552860  114.806121   977643000000
Illinois    149995  12882135   85.883763   644106750000

Direct integer slice df[1:3]:
             area       pop     density            gdp
Texas     695662  26448193   38.018740  1322409650000
New York  141297  19651127  139.076746   982556350000


In [229]:
# Show the full frame first for reference
print(df)
# A boolean Series in plain [] filters rows: keep those with density above 100
above_100 = df.density > 100
print("\nDirect masking df[df.density > 100]:\n", df[above_100])

              area       pop     density            gdp
California  423967  38332521   95.000000  1916626050000
Texas       695662  26448193   38.018740  1322409650000
New York    141297  19651127  139.076746   982556350000
Florida     170312  19552860  114.806121   977643000000
Illinois    149995  12882135   85.883763   644106750000

Direct masking df[df.density > 100]:
             area       pop     density           gdp
New York  141297  19651127  139.076746  982556350000
Florida   170312  19552860  114.806121  977643000000
