# Chapter 1: Pandas Foundations

## Recipes
* [Dissecting the anatomy of a DataFrame](#Dissecting-the-anatomy-of-a-DataFrame)
* [Accessing the main DataFrame components](#Accessing-the-main-DataFrame-components)
* [Understanding data types](#Understanding-data-types)
* [Selecting a single column of data as a Series](#Selecting-a-single-column-of-data-as-a-Series)
* [Calling Series methods](#Calling-Series-methods)
* [Working with operators on a Series](#Working-with-operators-on-a-Series)
* [Chaining Series methods together](#Chaining-Series-methods-together)
* [Making the index meaningful](#Making-the-index-meaningful)
* [Renaming row and column names](#Renaming-row-and-column-names)
* [Creating and deleting columns](#Creating-and-deleting-columns)

In [None]:
import pandas as pd
import numpy as np

# Dissecting the anatomy of a DataFrame

#### Change options to get specific output for book

In [None]:
# Use fully qualified option names: since pandas 1.4 the bare patterns
# 'max_columns' and 'max_rows' also match the 'styler.render.*' options,
# so set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 10)

In [None]:
movie = pd.read_csv('data/movie.csv')
movie.head()

![dataframe anatomy](./images/ch01_dataframe_anatomy.png)

# Accessing the main DataFrame components

In [None]:
# Pull the three main components out of the DataFrame
columns = movie.columns  # column labels
index = movie.index      # row labels
data = movie.values      # underlying data as a NumPy array

In [None]:
columns

In [None]:
index

In [None]:
data

In [None]:
type(index)

In [None]:
type(columns)

In [None]:
type(data)

In [None]:
issubclass(pd.RangeIndex, pd.Index)

## There's more

In [None]:
index.values

In [None]:
columns.values

# Understanding data types

In [None]:
movie = pd.read_csv('data/movie.csv')

In [None]:
movie.dtypes

In [None]:
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# counting the values of .dtypes gives the same per-dtype column tally.
movie.dtypes.value_counts()

# Selecting a single column of data as a Series

In [None]:
movie = pd.read_csv('data/movie.csv')

In [None]:
movie['director_name']

In [None]:
movie.director_name

In [None]:
type(movie['director_name'])

## There's more

In [None]:
director = movie['director_name'] # save Series to variable
director.name

In [None]:
director.to_frame().head()

# Calling Series methods

## Getting ready...

In [None]:
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

In [None]:
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

In [None]:
len(s_attr_methods & df_attr_methods)

## How to do it...

In [None]:
movie = pd.read_csv('data/movie.csv')
director = movie['director_name']
actor_1_fb_likes = movie['actor_1_facebook_likes']

In [None]:
director.head()

In [None]:
actor_1_fb_likes.head()

In [None]:
# Qualified name: the bare 'max_rows' pattern is ambiguous on pandas >= 1.4
# (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 8)
director.value_counts()

In [None]:
actor_1_fb_likes.value_counts()

In [None]:
director.size

In [None]:
director.shape

In [None]:
len(director)

In [None]:
director.count()

In [None]:
actor_1_fb_likes.count()

In [None]:
actor_1_fb_likes.quantile()

In [None]:
# min, max, mean, median, standard deviation and sum, displayed as one tuple
(
    actor_1_fb_likes.min(),
    actor_1_fb_likes.max(),
    actor_1_fb_likes.mean(),
    actor_1_fb_likes.median(),
    actor_1_fb_likes.std(),
    actor_1_fb_likes.sum(),
)

In [None]:
actor_1_fb_likes.describe()

In [None]:
director.describe()

In [None]:
actor_1_fb_likes.quantile(.2)

In [None]:
actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

In [None]:
director.isnull()

In [None]:
actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)
actor_1_fb_likes_filled.count()

In [None]:
actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()
actor_1_fb_likes_dropped.size

## There's more...

In [None]:
director.value_counts(normalize=True)

In [None]:
director.hasnans

In [None]:
director.notnull()

# Working with operators on a Series

In [None]:
pd.options.display.max_rows = 6

In [None]:
5 + 9    # plus operator example. Adds 5 and 9

In [None]:
4 ** 2   # exponentiation operator. Raises 4 to the second power

In [None]:
a = 10   # assignment operator.

In [None]:
5 <= 9   # less than or equal to operator

In [None]:
'abcde' + 'fg'    # plus operator for strings. Concatenates the two strings

In [None]:
not (5 <= 9)      # not is an operator that is a reserved keyword and reverses a boolean

In [None]:
7 in [1, 2, 6]    # in operator checks for membership of a list

In [None]:
{1, 2, 3} & {2, 3, 4}    # ampersand operator on sets. Returns their intersection

In [None]:
[1, 2, 3] - 3     # intentionally fails: the minus operator is not defined for a list and an int (TypeError)

In [None]:
a = set([1,2,3])
a[0]                 # the indexing operator does not work with sets; intentionally raises TypeError

## Getting ready...

In [None]:
movie = pd.read_csv('data/movie.csv')
imdb_score = movie['imdb_score']
imdb_score

In [None]:
imdb_score + 1

In [None]:
imdb_score * 2.5

In [None]:
imdb_score // 7

In [None]:
imdb_score > 7

In [None]:
director = movie['director_name']

In [None]:
director == 'James Cameron'

## There's more...

In [None]:
imdb_score.add(1)              # imdb_score + 1

In [None]:
imdb_score.mul(2.5)            # imdb_score * 2.5

In [None]:
imdb_score.floordiv(7)         # imdb_score // 7

In [None]:
imdb_score.gt(7)               # imdb_score > 7

In [None]:
director.eq('James Cameron')   # director == 'James Cameron'

In [None]:
imdb_score.astype(int).mod(5)

In [None]:
a = type(1)

In [None]:
type(a)

In [None]:
a = type(imdb_score)

In [None]:
a([1,2,3])

# Chaining Series methods together

In [None]:
movie = pd.read_csv('data/movie.csv')
actor_1_fb_likes = movie['actor_1_facebook_likes']
director = movie['director_name']

In [None]:
director.value_counts().head(3)

In [None]:
actor_1_fb_likes.isnull().sum()

In [None]:
actor_1_fb_likes.dtype

In [None]:
# Each trailing backslash continues the statement onto the next line,
# so the chain can be written one method per line
actor_1_fb_likes.fillna(0)\
                .astype(int)\
                .head()

## There's more...

In [None]:
actor_1_fb_likes.isnull().mean()

In [None]:
(actor_1_fb_likes.fillna(0)
                 .astype(int)
                 .head())

# Making the index meaningful

In [None]:
movie = pd.read_csv('data/movie.csv')

In [None]:
movie.shape

In [None]:
movie2 = movie.set_index('movie_title')
movie2

In [None]:
pd.read_csv('data/movie.csv', index_col='movie_title')

## There's more...

In [None]:
movie2.reset_index()

# Renaming row and column names

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')

In [None]:
# old label -> new label for the rows (index)
idx_rename = {
    'Avatar': 'Ratava',
    'Spectre': 'Ertceps',
}

# old label -> new label for the columns
col_rename = {
    'director_name': 'Director Name',
    'num_critic_for_reviews': 'Critical Reviews',
}

In [None]:
movie.rename(index=idx_rename, 
             columns=col_rename).head()

## There's more...

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
index = movie.index
columns = movie.columns

# Index objects are immutable, so work on plain Python list copies of the labels
index_list = index.tolist()
column_list = columns.tolist()

# Overwrite individual labels by position.
# NOTE(review): these positions presumably correspond to the 'Avatar' and 'Spectre'
# rows and the 'director_name' and 'num_critic_for_reviews' columns; this depends
# on the order of the CSV, so confirm against the data.
index_list[0] = 'Ratava'
index_list[2] = 'Ertceps'
column_list[1] = 'Director Name'
column_list[2] = 'Critical Reviews'

In [None]:
print(index_list[:5])

In [None]:
print(column_list)

In [None]:
movie.index = index_list
movie.columns = column_list

In [None]:
movie.head()

# Creating and deleting columns

In [None]:
movie = pd.read_csv('data/movie.csv')

In [None]:
movie['has_seen'] = 0

In [None]:
movie.columns

In [None]:
# Element-wise sum of the three actor like counts and the director like count.
# A missing value in any operand makes that row's total NaN.
movie['actor_director_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                              movie['actor_2_facebook_likes'] + 
                                              movie['actor_3_facebook_likes'] + 
                                              movie['director_facebook_likes'])

In [None]:
movie['actor_director_facebook_likes'].isnull().sum()

In [None]:
movie['actor_director_facebook_likes'] = movie['actor_director_facebook_likes'].fillna(0)

In [None]:
movie['is_cast_likes_more'] = (movie['cast_total_facebook_likes'] >= 
                                  movie['actor_director_facebook_likes'])

In [None]:
movie['is_cast_likes_more'].all()

In [None]:
movie = movie.drop('actor_director_facebook_likes', axis='columns')

In [None]:
movie['actor_total_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                       movie['actor_2_facebook_likes'] + 
                                       movie['actor_3_facebook_likes'])

movie['actor_total_facebook_likes'] = movie['actor_total_facebook_likes'].fillna(0)

In [None]:
# Flag the rows where the cast total is at least the summed actor likes
movie['is_cast_likes_more'] = (
    movie['cast_total_facebook_likes'] >= movie['actor_total_facebook_likes']
)

# True only when the comparison holds for every row
movie['is_cast_likes_more'].all()

In [None]:
movie['pct_actor_cast_like'] = (movie['actor_total_facebook_likes'] / 
                                movie['cast_total_facebook_likes'])

In [None]:
movie['pct_actor_cast_like'].min(), movie['pct_actor_cast_like'].max() 

In [None]:
movie.set_index('movie_title')['pct_actor_cast_like'].head()

## There's more...

In [None]:
profit_index = movie.columns.get_loc('gross') + 1
profit_index

In [None]:
# Insert 'profit' (gross minus budget) at the column position held in profit_index
movie.insert(
    loc=profit_index,
    column='profit',
    value=movie['gross'] - movie['budget'],
)

In [None]:
movie.head()