# Chapter 2: Essential DataFrame Operations

## Recipes
* [Selecting multiple DataFrame columns](#Selecting-multiple-DataFrame-columns)
* [Selecting columns with methods](#Selecting-columns-with-methods)
* [Ordering column names sensibly](#Ordering-column-names-sensibly)
* [Operating on the entire DataFrame](#Operating-on-the-entire-DataFrame)
* [Chaining DataFrame methods together](#Chaining-DataFrame-methods-together)
* [Working with operators on a DataFrame](#Working-with-operators-on-a-DataFrame)
* [Comparing missing values](#Comparing-missing-values)
* [Transposing the direction of a DataFrame operation](#Transposing-the-direction-of-a-DataFrame-operation)
* [Determining college campus diversity](#Determining-college-campus-diversity)

In [None]:
import pandas as pd
import numpy as np
# Let pandas display up to 40 columns before truncating wide DataFrames.
pd.options.display.max_columns = 40

# Selecting multiple DataFrame columns

In [None]:
# Load the movie data, then pass a list of names to the indexing operator to
# get a DataFrame holding just the three actor columns and the director column.
movie = pd.read_csv('data/movie.csv')
movie_actor_director = movie[
    ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
]
movie_actor_director.head()

In [None]:
# A one-item list still returns a DataFrame (a bare string would return a Series).
movie[['director_name']].head()

In [None]:
# Without the inner list the names form a single tuple key, which pandas looks
# up as one column label and fails with KeyError.
# NOTE(review): presumably an intentional demonstration of that error - confirm,
# since it stops a "Run All" at this cell.
movie['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']

## There's more...

In [None]:
# Keep the column names in a list variable; indexing with the variable selects
# the same columns as writing the list inline.
cols = [
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'director_name',
]
movie_actor_director = movie[cols]

# Selecting columns with methods

In [None]:
# Reload the data using the movie title as the index, then count how many
# columns there are of each dtype.
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in
# 1.0; dtypes.value_counts() reports the same per-dtype column counts.
movie.dtypes.value_counts()

In [None]:
# Keep only the integer-dtype columns.
movie.select_dtypes(include=['int']).head()

In [None]:
# 'number' selects every numeric column (integers and floats alike).
movie.select_dtypes(include=['number']).head()

In [None]:
# Keep the columns whose name contains the substring 'facebook'.
movie.filter(like='facebook').head()

In [None]:
# Keep the columns whose name contains a digit. The pattern is a raw string so
# Python passes the backslash through to the regex engine instead of treating
# '\d' as an (invalid) string escape sequence.
movie.filter(regex=r'\d').head()

In [None]:
# `items` takes exact column names; names that do not exist (here 'asdf') are
# silently skipped rather than raising KeyError.
movie.filter(items=['actor_1_name', 'asdf']).head()

# Ordering column names sensibly

In [None]:
# Reload the movie data with the default integer index.
movie = pd.read_csv('data/movie.csv')

In [None]:
# Preview the first five rows.
movie.head()

In [None]:
# Show the column labels in their current order.
movie.columns

In [None]:
# Sort every column name into a named group. The disc_/cont_ prefixes
# presumably stand for discrete and continuous data - confirm with the author.
disc_core = [
    'movie_title',
    'title_year',
    'content_rating',
    'genres',
]
disc_people = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
]
disc_other = [
    'color',
    'country',
    'language',
    'plot_keywords',
    'movie_imdb_link',
]
cont_fb = [
    'director_facebook_likes',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'cast_total_facebook_likes',
    'movie_facebook_likes',
]
cont_finance = [
    'budget',
    'gross',
]
cont_num_reviews = [
    'num_voted_users',
    'num_user_for_reviews',
    'num_critic_for_reviews',
]
cont_other = [
    'imdb_score',
    'duration',
    'aspect_ratio',
    'facenumber_in_poster',
]

In [None]:
# Join the groups into the full desired column order, then check that it names
# the same set of columns as the original DataFrame.
new_col_order = (
    disc_core + disc_people + disc_other
    + cont_fb + cont_finance + cont_num_reviews + cont_other
)
set(movie.columns) == set(new_col_order)

In [None]:
# Indexing with the ordered list returns a DataFrame whose columns follow that order.
movie2 = movie[new_col_order]
movie2.head()

# Operating on the entire DataFrame

In [None]:
# Cap displayed output at 8 rows, reload the data, and report its
# (rows, columns) shape.
pd.options.display.max_rows = 8
movie = pd.read_csv('data/movie.csv')
movie.shape

In [None]:
# Total number of elements (rows times columns).
movie.size

In [None]:
# Number of dimensions (2 for a DataFrame).
movie.ndim

In [None]:
# len() of a DataFrame is its number of rows.
len(movie)

In [None]:
# Number of non-missing values in each column.
movie.count()

In [None]:
# Minimum of each column.
# NOTE(review): with string columns that contain missing values, newer pandas
# releases may raise TypeError here unless numeric_only=True is passed -
# confirm against the pandas version in use.
movie.min()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
movie.describe()

In [None]:
# Raise the display limit to 10 rows - presumably so the longer describe()
# output in the next cell is shown without truncation.
pd.options.display.max_rows = 10

In [None]:
# Report the 1st, 30th and 99th percentiles instead of the default quartiles;
# pandas always adds the median (50%) as well.
movie.describe(percentiles=[.01, .3, .99])

In [None]:
# Put the display limit back to 8 rows.
pd.options.display.max_rows = 8

In [None]:
# Number of missing values in each column (True counts as 1 when summed).
movie.isnull().sum()

## There's more...

In [None]:
# Compute each column's minimum without skipping missing values.
movie.min(skipna=False)

# Chaining DataFrame methods together

In [None]:
# Reload the data; isnull() returns a same-shaped boolean DataFrame that is
# True wherever a value is missing.
movie = pd.read_csv('data/movie.csv')
movie.isnull().head()

In [None]:
# Chain sum() to count missing values per column; head() shows the first five.
movie.isnull().sum().head()

In [None]:
# A second sum() adds the per-column counts into one total for the whole DataFrame.
movie.isnull().sum().sum()

In [None]:
# True if at least one value anywhere in the DataFrame is missing.
movie.isnull().any().any()

## How it works...

In [None]:
# Count the columns of the isnull() result by dtype.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in
# 1.0; dtypes.value_counts() reports the same per-dtype column counts.
movie.isnull().dtypes.value_counts()

## There's more...

In [None]:
# Select the listed columns and take the maximum of each.
# NOTE(review): 'color' appears twice in the list - confirm whether the
# repetition is intended.
movie[['color', 'movie_title', 'color']].max()

In [None]:
# Take the object-dtype columns, replace missing values with empty strings,
# then find each column's maximum - presumably so that only strings are
# compared with one another.
movie.select_dtypes(['object']).fillna('').max()

# Working with operators on a DataFrame

## Getting ready...

In [None]:
# Load the college data and try to add 5 to every value.
# NOTE(review): this likely raises TypeError if the frame contains string
# columns - presumably an intentional demonstration; confirm.
college = pd.read_csv('data/college.csv')
college + 5

In [None]:
# Reload with the INSTNM column as the index and keep only the columns whose
# name contains 'UGDS_'.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')

In [None]:
# Element-wise comparison of every value against the string 'asdf'.
college == 'asdf'

In [None]:
# Preview the first five rows of the UGDS_ columns.
college_ugds_.head()

In [None]:
# Arithmetic with a scalar applies to every element: add .00501 to each value.
college_ugds_.head() + .00501

In [None]:
# Floor-dividing by .01 after the small addition leaves each value as a whole
# number of hundredths, i.e. rounded to the nearest whole percent.
(college_ugds_.head() + .00501) // .01

In [None]:
# Round every value to two decimals using only operators: add a bit more than
# half a hundredth, floor-divide by .01, then divide by 100 to return to a
# fraction.
college_ugds_op_round = (college_ugds_ + .00501) // .01 / 100
college_ugds_op_round.head()

In [None]:
# Same rounding with the round() method. The tiny .00001 offset is presumably
# there to push values sitting on a .xx5 boundary upward so the result agrees
# with the operator version - confirm.
college_ugds_round = (college_ugds_ + .00001).round(2)
college_ugds_round.head()

In [None]:
# Floating-point arithmetic is inexact: this sum is not expected to come out
# as exactly 0.05.
.045 + .005

In [None]:
# equals() is True only when both frames have the same shape and elements;
# missing values in the same position count as equal.
college_ugds_op_round.equals(college_ugds_round)

## There's more...

In [None]:
# The same computation written with the method equivalents of +, // and /.
college_ugds_op_round_methods = college_ugds_.add(.00501).floordiv(.01).div(100)

# Comparing missing values

In [None]:
# NaN is never equal to anything, including itself: this is False.
np.nan == np.nan

In [None]:
# Unlike NaN, None does compare equal to itself: this is True.
None == None

In [None]:
# Ordering comparisons against NaN are always False.
5 > np.nan

In [None]:
# False as well - NaN is neither greater nor less than any number.
np.nan > 5

In [None]:
# Inequality is the one comparison with NaN that is True.
5 != np.nan

In [None]:
# Load the college data indexed by INSTNM and keep only the columns whose name
# contains 'UGDS_'.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')

In [None]:
# Comparing with a scalar returns a boolean DataFrame, one result per element.
college_ugds_.head() == .0019

In [None]:
# Compare the DataFrame with itself element by element. Positions holding a
# missing value come out False because NaN != NaN.
college_self_compare = college_ugds_ == college_ugds_
college_self_compare.head()

In [None]:
# Per column: True only if every self-comparison was True, i.e. the column has
# no missing values.
college_self_compare.all()

In [None]:
# Counting missing values with == np.nan does not work: nothing equals NaN, so
# every column sums to 0.
(college_ugds_ == np.nan).sum()

In [None]:
# The correct way to count missing values per column.
college_ugds_.isnull().sum()

In [None]:
from pandas.testing import assert_frame_equal

In [None]:
# Returns None silently when the two frames match and raises AssertionError
# otherwise; missing values in the same position are treated as equal.
assert_frame_equal(college_ugds_, college_ugds_)

## There's more...

In [None]:
# eq() is the method equivalent of the == operator.
college_ugds_.eq(.0019).head()

# Transposing the direction of a DataFrame operation

In [None]:
# Load the college data indexed by INSTNM, keep only the columns whose name
# contains 'UGDS_', and preview the result.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')
college_ugds_.head()

In [None]:
# By default count() works down the rows, giving one non-missing count per column.
college_ugds_.count()

In [None]:
# axis=0 is the default, so this matches the call without an axis argument.
college_ugds_.count(axis=0)

In [None]:
# 'index' is the string alias for axis 0.
college_ugds_.count(axis='index')

In [None]:
# axis='columns' works across the columns instead, giving one non-missing
# count per row.
college_ugds_.count(axis='columns').head()

In [None]:
# Sum across the columns to get one total per row.
college_ugds_.sum(axis='columns').head()

In [None]:
# Median of each column.
college_ugds_.median(axis='index')

## There's more...

In [None]:
# Running total across the columns (left to right) within each row.
college_ugds_cumsum = college_ugds_.cumsum(axis=1)
college_ugds_cumsum.head()

In [None]:
# Order the rows by the cumulative 'UGDS_HISP' column, largest first.
college_ugds_cumsum.sort_values('UGDS_HISP', ascending=False)

# Determining college campus diversity

In [None]:
# Display the diversity data, indexed by its 'School' column (the result is
# shown, not stored).
pd.read_csv('data/college_diversity.csv', index_col='School')

In [None]:
# Load the college data indexed by INSTNM, keep only the columns whose name
# contains 'UGDS_', and preview the result.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')
college_ugds_.head()

In [None]:
# Count the missing values in each row and show the five rows with the most.
college_ugds_.isnull().sum(axis=1).sort_values(ascending=False).head()

In [None]:
# Drop only the rows in which every value is missing.
college_ugds_ = college_ugds_.dropna(how='all')

In [None]:
# Check how many missing values remain in each column.
college_ugds_.isnull().sum()

In [None]:
# ge() is the method form of >=: True wherever a value is at least .15.
college_ugds_.ge(.15).head()

In [None]:
# For each row, count how many columns have a value of at least .15.
diversity_metric = college_ugds_.ge(.15).sum(axis='columns')
diversity_metric.head()

In [None]:
# How many rows have each value of the metric.
diversity_metric.value_counts()

In [None]:
# The five rows with the highest metric.
diversity_metric.sort_values(ascending=False).head()

In [None]:
# Pull the full rows for two schools by their index labels.
college_ugds_.loc[
    ['Regency Beauty Institute-Austin', 'Central Texas Beauty College-Temple']
]

In [None]:
# Index labels of five schools to look up in the diversity metric.
us_news_top = [
    'Rutgers University-Newark',
    'Andrews University',
    'Stanford University',
    'University of Houston',
    'University of Nevada-Las Vegas',
]

In [None]:
# Look up the metric for the schools named in us_news_top.
diversity_metric.loc[us_news_top]

## There's more...

In [None]:
# Largest single value in each row, sorted to show the ten highest.
college_ugds_.max(axis=1).sort_values(ascending=False).head(10)

In [None]:
# Show every UGDS_ value for one school, selected by its index label.
college_ugds_.loc['Talmudical Seminary Oholei Torah']

In [None]:
# True if at least one row has every column above .01.
(college_ugds_ > .01).all(axis=1).any()