# Chapter 6: Index Alignment
## Recipes
* [Examining the Index object](#Examining-the-index)
* [Producing Cartesian products](#Producing-Cartesian-products)
* [Exploding indexes](#Exploding-Indexes)
* [Filling values with unequal indexes](#Filling-values-with-unequal-indexes)
* [Appending columns from different DataFrames](#Appending-columns-from-different-DataFrames)
* [Highlighting the maximum value from each column](#Highlighting-maximum-value-from-each-column)
* [Replicating idxmax with method chaining](#Replicating-idxmax-with-method-chaining)
* [Finding the most common maximum](#Finding-the-most-common-maximum)

In [None]:
import pandas as pd
import numpy as np

# Examining the index

In [None]:
# Load the college dataset and keep a handle on its column Index.
college = pd.read_csv('data/college.csv')
columns = college.columns
columns

In [None]:
# .values exposes the labels as the underlying array.
columns.values

In [None]:
# A scalar position returns a single label.
columns[5]

In [None]:
# A list of positions returns a new Index holding just those labels.
columns[[1,8,10]]

In [None]:
# Negative slice: the labels at positions -7, -6 and -5.
columns[-7:-4]

In [None]:
# Reductions and null checks are called directly on the Index.
columns.min(), columns.max(), columns.isnull().sum()

In [None]:
# Adding a string is applied to every label.
columns + '_A'

In [None]:
# Comparison is element-wise as well and yields a boolean array.
columns > 'G'

In [None]:
# Intentional failure demo: pandas Index objects are immutable, so item
# assignment is expected to raise a TypeError rather than rename the column.
columns[1] = 'city'

In [None]:
# Two overlapping slices of the column Index, combined below with set operations.
c1 = columns[:4]
c1

In [None]:
c2 = columns[2:5]
c2

In [None]:
# Labels found in either c1 or c2.
c1.union(c2)

In [None]:
# Operator spelling of the union above.
# NOTE(review): recent pandas releases reportedly treat `|` on an Index as an
# element-wise logical OR instead of a set union -- confirm against the
# pandas version this notebook targets.
c1 | c2

In [None]:
# Labels found in exactly one of c1 and c2.
c1.symmetric_difference(c2)

In [None]:
# Operator spelling of the symmetric difference above (same caveat about
# operator semantics in recent pandas releases -- confirm).
c1 ^ c2

# Producing Cartesian products

In [None]:
# Four consecutive integers under a repeated-label index: three 'a's and one 'b'.
s1 = pd.Series(np.arange(4), index=['a', 'a', 'a', 'b'])
s1

In [None]:
# Six consecutive integers under an unordered index with repeats and one
# label ('c') that the first Series does not have.
s2 = pd.Series(np.arange(6), index=['c', 'a', 'b', 'a', 'b', 'b'])
s2

In [None]:
# Index alignment with repeated labels: every 'a' in s1 is paired with every
# 'a' in s2 (likewise for 'b'), while 'c', present only in s2, becomes NaN.
s1 + s2

## There's more...

In [None]:
# Both operands carry the same labels in the same order, so the sum lines up
# position by position and keeps the original length.
s1 = pd.Series(np.arange(5), index=list('aaabb'))
s2 = pd.Series(np.arange(5), index=list('aaabb'))
s1 + s2

In [None]:
# Same labels as before, but s2 lists them in a different order: the indexes
# are no longer identical, so the sum pairs up repeated labels again.
s1 = pd.Series(np.arange(5), index=list('aaabb'))
s2 = pd.Series(np.arange(5), index=list('bbaaa'))
s1 + s2

# Exploding Indexes

In [None]:
# Load the employee data with RACE as the index.
# NOTE(review): the recipe relies on these index labels repeating many times
# -- confirm against the data file.
employee = pd.read_csv('data/employee.csv', index_col='RACE')
employee.head()

In [None]:
# Select the same column twice and check whether both names refer to the
# very same object.
salary1 = employee['BASE_SALARY']
salary2 = employee['BASE_SALARY']
salary1 is salary2

In [None]:
# Each .copy() call builds its own Series, so the two names now refer to
# distinct objects.
salary1 = employee['BASE_SALARY'].copy()
salary2 = employee['BASE_SALARY'].copy()
salary1 is salary2

In [None]:
# Sort only salary1 so the two Series hold the same labels in a different order.
salary1 = salary1.sort_index()
salary1.head()

In [None]:
salary2.head()

In [None]:
# The indexes now differ in order; with repeated labels, alignment pairs every
# occurrence of a label in salary1 with every occurrence in salary2.
salary_add = salary1 + salary2

In [None]:
salary_add.head()

In [None]:
# salary1 + salary1 has the identical index on both sides; compare the lengths
# of the two inputs with the lengths of the two sums.
salary_add1 = salary1 + salary1
len(salary1), len(salary2), len(salary_add), len(salary_add1)

## There's more...

In [None]:
# Count how often each index label occurs (missing labels included).
index_vc = salary1.index.value_counts(dropna=False)
index_vc

In [None]:
# A label occurring k times contributes k * k rows to a Cartesian product, so
# this total should match len(salary_add) -- compare with the lengths shown in
# the preceding section.
index_vc.pow(2).sum()

# Filling values with unequal indexes

In [None]:
# Three batting tables, each indexed by playerID (the file names suggest the
# 2014, 2015 and 2016 seasons).
baseball_14 = pd.read_csv('data/baseball14.csv', index_col='playerID')
baseball_15 = pd.read_csv('data/baseball15.csv', index_col='playerID')
baseball_16 = pd.read_csv('data/baseball16.csv', index_col='playerID')
baseball_14.head()

In [None]:
# playerIDs present in baseball_14 but absent from baseball_15.
baseball_14.index.difference(baseball_15.index)

In [None]:
# playerIDs present in baseball_14 but absent from baseball_16. The preceding
# cell already covers baseball_15, so this cell checks the remaining table
# instead of repeating that comparison.
baseball_14.index.difference(baseball_16.index)

In [None]:
# Pull the 'H' column out of each table.
hits_14 = baseball_14['H']
hits_15 = baseball_15['H']
hits_16 = baseball_16['H']
hits_14.head()

In [None]:
# Plain addition: a playerID missing from either Series ends up as NaN.
(hits_14 + hits_15).head()

In [None]:
# add() with fill_value=0 treats a playerID missing from one side as 0.
hits_14.add(hits_15, fill_value=0).head()

In [None]:
# Chain a second add() to total all three tables.
hits_total = hits_14.add(hits_15, fill_value=0).add(hits_16, fill_value=0)
hits_total.head()

In [None]:
# Check whether any NaN survived the fills.
hits_total.hasnans

## How it works...

In [None]:
# Small example in which the data itself already contains missing values.
s = pd.Series(index=['a', 'b', 'c', 'd'], data=[np.nan, 3, np.nan, 1])
s

In [None]:
# Shorter Series: there is no 'd' label, and 'a' is NaN here as well.
s1 = pd.Series(index=['a', 'b', 'c'], data=[np.nan, 6, 10])
s1

In [None]:
# fill_value stands in for a value that is missing on one side only ('c' in s,
# 'd' in s1); 'a' is missing on both sides and therefore stays NaN.
s.add(s1, fill_value=5)

In [None]:
# Same operands in the opposite order.
s1.add(s, fill_value=5)

## There's more...

In [None]:
# Column subsets that only partly overlap: 'G' is selected only for df_14 and
# 'HR' only for df_15.
df_14 = baseball_14[['G','AB', 'R', 'H']]
df_14.head()

In [None]:
df_15 = baseball_15[['AB', 'R', 'H', 'HR']]
df_15.head()

In [None]:
# DataFrame addition aligns on both rows and columns; highlight the nulls
# that the alignment produces.
(df_14 + df_15).head(10).style.highlight_null('yellow')

In [None]:
# With fill_value=0 a cell stays null only when it is missing from both frames.
df_14.add(df_15, fill_value=0).head(10).style.highlight_null('yellow')

# Appending columns from different DataFrames

In [None]:
# Reload without a custom index and keep just department and salary.
employee = pd.read_csv('data/employee.csv')
dept_sal = employee[['DEPARTMENT', 'BASE_SALARY']]

In [None]:
# Order by department, with the highest salary first inside each department.
dept_sal = dept_sal.sort_values(['DEPARTMENT', 'BASE_SALARY'],
                                ascending=[True, False])

In [None]:
# drop_duplicates keeps the first row per department, which after the sort
# above is that department's top salary.
max_dept_sal = dept_sal.drop_duplicates(subset='DEPARTMENT')
max_dept_sal.head()

In [None]:
# Put DEPARTMENT in the index of both frames so the assignment below can
# align on it.
max_dept_sal = max_dept_sal.set_index('DEPARTMENT')
employee = employee.set_index('DEPARTMENT')

In [None]:
# max_dept_sal holds one row per department, so every employee row picks up
# its department's maximum through index alignment.
employee['MAX_DEPT_SALARY'] = max_dept_sal['BASE_SALARY']

In [None]:
pd.options.display.max_columns = 6

In [None]:
employee.head()

In [None]:
# Sanity check: list any rows whose salary exceeds their department maximum.
employee.query('BASE_SALARY > MAX_DEPT_SALARY')

## How it works...

In [None]:
# Seed NumPy's global generator, then draw 10 rows and index them by department.
np.random.seed(1234)
random_salary = dept_sal.sample(n=10).set_index('DEPARTMENT')
random_salary

In [None]:
# NOTE(review): if the sample contains any department more than once, the
# right-hand index has duplicates and this assignment is expected to raise
# instead of aligning -- presumably that failure is the point of this cell;
# confirm.
employee['RANDOM_SALARY'] = random_salary['BASE_SALARY']

## There's more...

In [None]:
# Assign from only the first three rows of max_dept_sal; employee rows whose
# department is not among them have nothing to align with.
employee['MAX_SALARY2'] = max_dept_sal['BASE_SALARY'].head(3)

In [None]:
# Distribution of the values that were actually assigned.
employee.MAX_SALARY2.value_counts()

In [None]:
# Fraction of rows left missing by the partial assignment.
employee.MAX_SALARY2.isnull().mean()

# Highlighting maximum value from each column

In [None]:
pd.options.display.max_rows = 8

In [None]:
# Index the college data by institution name and inspect the column dtypes.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college.dtypes

In [None]:
# Look at one raw value from each of the two columns that are converted with
# pd.to_numeric further down.
college.MD_EARN_WNE_P10.iloc[0]

In [None]:
college.GRAD_DEBT_MDN_SUPP.iloc[0]

In [None]:
# Sort descending to see which values rank highest under the column's
# current dtype.
college.MD_EARN_WNE_P10.sort_values(ascending=False).head()

In [None]:
# Convert both columns to numeric in one pass; errors='coerce' turns any
# value that cannot be parsed into NaN.
cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
college[cols] = college[cols].apply(pd.to_numeric, errors='coerce')

college.dtypes.loc[cols]

In [None]:
# Restrict the frame to its numeric columns.
college_n = college.select_dtypes(include=[np.number])
college_n.head() # only numeric columns

In [None]:
# Flag the columns that hold exactly two distinct values (treated as binary).
criteria = college_n.nunique() == 2
criteria.head()

In [None]:
# Names of the flagged columns.
binary_cols = college_n.columns[criteria].tolist()
binary_cols

In [None]:
# Drop the binary columns.
college_n2 = college_n.drop(labels=binary_cols, axis='columns')
college_n2.head()

In [None]:
# For every remaining column, the index label (institution) holding its maximum.
max_cols = college_n2.idxmax()
max_cols

In [None]:
# One institution can hold the maximum of several columns; de-duplicate.
unique_max_cols = max_cols.unique()
unique_max_cols[:5]

In [None]:
# Show just those institutions and highlight each column's maximum.
college_n2.loc[unique_max_cols].style.highlight_max()

## There's more...

In [None]:
# Reload the raw data and keep the first five rows of the columns whose name
# contains 'UGDS_'.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_').head()
# axis='columns' highlights the largest value within each row.
college_ugds.style.highlight_max(axis='columns')

In [None]:
# NOTE(review): this cell does not use anything from the highlighting recipe
# and looks like a leftover; the 'Y' unit may also be rejected by newer pandas
# versions -- confirm whether the cell is still needed.
pd.Timedelta(1, unit='Y')

# Replicating idxmax with method chaining

In [None]:
# Rebuild the numeric, non-binary view of the college data used by this recipe.
cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
college = pd.read_csv('data/college.csv', index_col='INSTNM')

# errors='coerce' turns any value that cannot be parsed into NaN.
college[cols] = college[cols].apply(pd.to_numeric, errors='coerce')

# Keep numeric columns, then discard those holding exactly two distinct values.
college_n = college.select_dtypes(include=[np.number])
criteria = college_n.nunique().eq(2)
binary_cols = criteria[criteria].index.tolist()
college_n = college_n.drop(binary_cols, axis='columns')

In [None]:
# Column-wise maxima.
college_n.max().head()

In [None]:
# Compare every value with the maximum of its own column.
college_n.eq(college_n.max()).head()

In [None]:
# True for each row that contains at least one column maximum.
has_row_max = college_n.eq(college_n.max()).any(axis='columns')
has_row_max.head()

In [None]:
college_n.shape

In [None]:
# Number of such rows; a maximum shared by several rows marks all of them,
# so this can exceed the column count shown above.
has_row_max.sum()

In [None]:
pd.options.display.max_rows=6

In [None]:
# After two cumulative sums, only the first occurrence of each column's
# maximum is exactly 1: earlier rows are 0 and every later row is at least 2.
college_n.eq(college_n.max()).cumsum().cumsum()

In [None]:
# Flag rows holding the first occurrence of some column's maximum: mark the
# maxima, accumulate twice, and keep the cells that equal exactly 1.
has_row_max2 = (
    college_n.eq(college_n.max())
             .cumsum()
             .cumsum()
             .eq(1)
             .any(axis='columns')
)
has_row_max2.head()

In [None]:
# Number of rows flagged after keeping only first occurrences.
has_row_max2.sum()

In [None]:
# Index labels of the flagged rows (boolean selection of the Series by itself).
idxmax_cols = has_row_max2[has_row_max2].index
idxmax_cols

In [None]:
# Order-insensitive check that the chained approach finds the same labels as idxmax.
set(college_n.idxmax().unique()) == set(idxmax_cols)

## There's more...

In [None]:
# IPython magic: time the built-in idxmax.
%timeit college_n.idxmax().values

In [None]:
# IPython magic: time the method-chained equivalent; the trailing lambda
# selects the True rows so that .index returns their labels.
%timeit college_n.eq(college_n.max())\
                              .cumsum()\
                              .cumsum()\
                              .eq(1)\
                              .any(axis='columns')\
                              [lambda x: x].index

# Finding the most common maximum

In [None]:
pd.options.display.max_rows= 40

In [None]:
# Reload the data and keep only the columns whose name contains 'UGDS_'.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')
college_ugds.head()

In [None]:
# For each institution (row), the name of the column holding its largest value.
highest_percentage_race = college_ugds.idxmax(axis='columns')
highest_percentage_race.head()

In [None]:
# Share of institutions for which each column is the row maximum.
highest_percentage_race.value_counts(normalize=True)

## There's more...

In [None]:
# Among institutions whose largest UGDS_ value is UGDS_BLACK, remove that
# column and report how often each remaining column is then the row maximum.
is_black_top = highest_percentage_race.eq('UGDS_BLACK')
college_black = college_ugds.loc[is_black_top].drop('UGDS_BLACK', axis='columns')
college_black.idxmax(axis='columns').value_counts(normalize=True)