# Pandas

* Pandas is an open source library built on top of NumPy
* It allows for fast data analysis, cleaning, and preparation
* It excels in performance and productivity
* It has built-in visualization features
* It can work with data from a wide variety of sources

### Installing pandas 
`conda install pandas`

In [1]:
#import 
import numpy as np
import pandas as pd

### Pandas Series

#####  Creating a Pandas Series

In [2]:
# Index labels for the Series (labels may repeat, e.g. several 'apartment' entries)
labels = [
    'apartment', 'apartment', 'house', 'dorm', 'dorm', 'townhouse',
    'apartment', 'townhouse', 'apartment', 'compound', 'compound',
]
# Values to store, one per label
my_data = [
    'Rawda1', 'Rawda2', 'Naim', 'Rieber', 'Hedrick', 'Palazzo',
    'Atrium1', 'AtriumFoy', 'Atrium3', 'Venice', 'Soma',
]

# Build the Series: the values first, then the index that labels them
pd.Series(my_data, index=labels)

apartment       Rawda1
apartment       Rawda2
house             Naim
dorm            Rieber
dorm           Hedrick
townhouse      Palazzo
apartment      Atrium1
townhouse    AtriumFoy
apartment      Atrium3
compound        Venice
compound          Soma
dtype: object

In [3]:
# NumPy arrays are accepted for both the values and the index
labels = np.array(['first', 'Second', 'Third', 'Fourth'])
my_data = np.array(['Rayan', 'Sara', 'Salma', 'Omar'])

# Keyword form of the positional call pd.Series(values, index)
pd.Series(data=my_data, index=labels)

first     Rayan
Second     Sara
Third     Salma
Fourth     Omar
dtype: object

In [4]:
# A dictionary converts directly: each key becomes an index label,
# and each value becomes the data point stored under that label
labeled_data = {
    'Snoop': 'Dogg',
    'Wiz': 'Khalifa',
    'Mac': 'Miller',
    'Lil': 'Wayne',
    'Young': 'Thug',
}
pd.Series(data=labeled_data)

Snoop       Dogg
Wiz      Khalifa
Mac       Miller
Lil        Wayne
Young       Thug
dtype: object

##### Pandas Series can hold a variety of data types

In [5]:
# Elements need not share a type: an int, a string, three built-in functions and a list
pd.Series(
    data=[
        1,
        'This is a string',
        sum,
        print,
        len,
        ['l1', 12, 'ok'],
    ]
)

0                            1
1             This is a string
2      <built-in function sum>
3    <built-in function print>
4      <built-in function len>
5                 [l1, 12, ok]
dtype: object

###### Getting data from a Pandas Series

In [6]:
# Individual data points are retrieved through their index label

# Build the Series from a region -> people mapping
crimesAgainstHumanity = {
    'East Asia': 'Uighurs',
    'South Asia': 'Rohingyas',
    'Levant': 'Palestinians',
    'Arabian Gulf': 'Yemenis',
}
refugees = pd.Series(data=crimesAgainstHumanity)

# Look up the value stored under the 'South Asia' label
refugees['South Asia']

'Rohingyas'

##### Operations with Pandas Series

In [7]:
# Arithmetic between numeric Series pairs up values by index label

# First Series
learn = pd.Series({'podcasts': 1, 'books': 3, 'audiobooks': 2, 'videos': 5, 'courses': 10})
# Second Series has no 'podcasts' entry, so that label has no partner.
# An unmatched label generally produces NaN; the power result is the exception
# here, because 1 raised to NaN evaluates to 1.0.
learn2 = pd.Series({'books': 1, 'audiobooks': 4, 'videos': 15, 'courses': 3})

element_wise_results = (
    learn + learn2,   # sum
    learn - learn2,   # difference
    learn * learn2,   # product
    learn / learn2,   # quotient
    learn % learn2,   # remainder
    learn ** learn2,  # power
)
# Print each result in the order listed above
for outcome in element_wise_results:
    print(outcome)
# ...

audiobooks     6.0
books          4.0
courses       13.0
podcasts       NaN
videos        20.0
dtype: float64
audiobooks    -2.0
books          2.0
courses        7.0
podcasts       NaN
videos       -10.0
dtype: float64
audiobooks     8.0
books          3.0
courses       30.0
podcasts       NaN
videos        75.0
dtype: float64
audiobooks    0.500000
books         3.000000
courses       3.333333
podcasts           NaN
videos        0.333333
dtype: float64
audiobooks    2.0
books         0.0
courses       1.0
podcasts      NaN
videos        5.0
dtype: float64
audiobooks    1.600000e+01
books         3.000000e+00
courses       1.000000e+03
podcasts      1.000000e+00
videos        3.051758e+10
dtype: float64


In [8]:
# String Series support + as element-wise concatenation

# First Series
ai = pd.Series({
    'podcast': 'Lex Fridman',
    'book': 'Homo Deus',
    'competition': 'XPRIZE',
    'deep learning': 'Geoffrey Hinton',
})
# Second Series, with the same index labels as the first
ai2 = pd.Series({
    'podcast': 'Talking Machines',
    'book': 'The Deep Learning Revolution',
    'competition': 'AI for GOOD',
    'deep learning': 'Yoshua Bengio',
})
# Join the entries that share a label, separated by a comma and a space
ai + ', ' + ai2

podcast                    Lex Fridman, Talking Machines
book             Homo Deus, The Deep Learning Revolution
competition                          XPRIZE, AI for GOOD
deep learning             Geoffrey Hinton, Yoshua Bengio
dtype: object

### Pandas Dataframes

##### Creating a pandas df

In [9]:
from numpy.random import randn

# Seed NumPy's global generator so the random values below are the same on
# every Restart & Run All. Later draws from np.random in the same run are
# fixed by this seed as well.
np.random.seed(42)

# Row labels (statistics) and column labels (players) of the frame
stat_names = ['Batting Avg', 'Hit Percentage', 'Slugging Percentage', 'On Base Percentage', 'Home Runs']
player_names = ['Christian Yelich', 'Mookie Betts', 'Mike Trout', 'Cody Bellinger']

# abs(...) / 2 turns standard-normal draws into non-negative placeholder
# numbers; these are random demo values, not real baseball statistics
moneyball = pd.DataFrame(
    np.abs(randn(len(stat_names), len(player_names))) / 2,
    index=stat_names,
    columns=player_names,
)
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger
Batting Avg,0.550215,0.424384,0.464826,0.591292
Hit Percentage,0.203347,0.038523,0.858152,0.233722
Slugging Percentage,0.492254,0.693949,0.032256,0.05907
On Base Percentage,0.104318,0.36276,0.503211,0.431501
Home Runs,0.047997,0.344066,0.24336,0.966386


###### Getting data from a pandas df

In [10]:
# Getting a column returns a Series
# (indexed by the DataFrame's row labels and named after the column)

# Get the stats for Mike Trout
moneyball['Mike Trout']

Batting Avg            0.464826
Hit Percentage         0.858152
Slugging Percentage    0.032256
On Base Percentage     0.503211
Home Runs              0.243360
Name: Mike Trout, dtype: float64

In [11]:
# Indexing with a list of column labels returns a DataFrame holding just those columns

# Get the stats for Yelich and Bellinger
selected_players = ['Christian Yelich', 'Cody Bellinger']
moneyball[selected_players]  # the labels must be passed as a list (written inline: moneyball[[...]])

Unnamed: 0,Christian Yelich,Cody Bellinger
Batting Avg,0.550215,0.591292
Hit Percentage,0.203347,0.233722
Slugging Percentage,0.492254,0.05907
On Base Percentage,0.104318,0.431501
Home Runs,0.047997,0.966386


In [12]:
# Getting one row's data with .loc and its row label returns a Series
# (indexed by the DataFrame's column labels and named after the row)

moneyball.loc['Hit Percentage']

Christian Yelich    0.203347
Mookie Betts        0.038523
Mike Trout          0.858152
Cody Bellinger      0.233722
Name: Hit Percentage, dtype: float64

In [13]:
# Getting multiple rows with .loc and a list of row labels returns a DataFrame,
# with the rows in the order the labels were requested

row_labels = ['Hit Percentage', 'Batting Avg']
moneyball.loc[row_labels]

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger
Hit Percentage,0.203347,0.038523,0.858152,0.233722
Batting Avg,0.550215,0.424384,0.464826,0.591292


In [14]:
# .iloc selects rows by zero-based integer position instead of by label
row_positions = [1, 3]
moneyball.iloc[row_positions]

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger
Hit Percentage,0.203347,0.038523,0.858152,0.233722
On Base Percentage,0.104318,0.36276,0.503211,0.431501


###### Adding new data to a dataframe

In [15]:
# Add a new column by assigning to a column label that does not exist yet

# Draw one value per existing row instead of hard-coding the row count, so the
# assignment still lines up if the frame has gained or lost rows.
# abs(...) / 2 keeps the placeholder values non-negative.
moneyball['Jose Ramirez'] = np.abs(np.random.randn(len(moneyball))) / 2
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
Batting Avg,0.550215,0.424384,0.464826,0.591292,0.928418
Hit Percentage,0.203347,0.038523,0.858152,0.233722,-0.838135
Slugging Percentage,0.492254,0.693949,0.032256,0.05907,-1.606647
On Base Percentage,0.104318,0.36276,0.503211,0.431501,0.394476
Home Runs,0.047997,0.344066,0.24336,0.966386,-0.016065


In [16]:
# Add a new row by assigning to a row label that does not exist yet
on_base = moneyball.loc['On Base Percentage']
slugging = moneyball.loc['Slugging Percentage']
# The two rows are Series indexed by player, so + adds them player by player
moneyball.loc['On base plus slug'] = on_base + slugging
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
Batting Avg,0.550215,0.424384,0.464826,0.591292,0.928418
Hit Percentage,0.203347,0.038523,0.858152,0.233722,-0.838135
Slugging Percentage,0.492254,0.693949,0.032256,0.05907,-1.606647
On Base Percentage,0.104318,0.36276,0.503211,0.431501,0.394476
Home Runs,0.047997,0.344066,0.24336,0.966386,-0.016065
On base plus slug,0.596573,1.056709,0.535467,0.490572,-1.21217


In [17]:
# New columns can be computed from existing ones

# Element-wise average of the two players' columns
combined = moneyball['Mookie Betts'].add(moneyball['Cody Bellinger'])
moneyball['Dodgers'] = combined.div(2)
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.550215,0.424384,0.464826,0.591292,0.928418,0.507838
Hit Percentage,0.203347,0.038523,0.858152,0.233722,-0.838135,0.136123
Slugging Percentage,0.492254,0.693949,0.032256,0.05907,-1.606647,0.37651
On Base Percentage,0.104318,0.36276,0.503211,0.431501,0.394476,0.397131
Home Runs,0.047997,0.344066,0.24336,0.966386,-0.016065,0.655226
On base plus slug,0.596573,1.056709,0.535467,0.490572,-1.21217,0.77364


##### Removing data from a dataframe

In [18]:
# Disregard a row: drop returns a frame without it; the result is only displayed here, not stored
moneyball.drop(index='Home Runs')

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.550215,0.424384,0.464826,0.591292,0.928418,0.507838
Hit Percentage,0.203347,0.038523,0.858152,0.233722,-0.838135,0.136123
Slugging Percentage,0.492254,0.693949,0.032256,0.05907,-1.606647,0.37651
On Base Percentage,0.104318,0.36276,0.503211,0.431501,0.394476,0.397131
On base plus slug,0.596573,1.056709,0.535467,0.490572,-1.21217,0.77364


In [19]:
# Note: the dropped row is still in the actual dataframe,
# because drop returned a new frame and left moneyball itself unchanged
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.550215,0.424384,0.464826,0.591292,0.928418,0.507838
Hit Percentage,0.203347,0.038523,0.858152,0.233722,-0.838135,0.136123
Slugging Percentage,0.492254,0.693949,0.032256,0.05907,-1.606647,0.37651
On Base Percentage,0.104318,0.36276,0.503211,0.431501,0.394476,0.397131
Home Runs,0.047997,0.344066,0.24336,0.966386,-0.016065,0.655226
On base plus slug,0.596573,1.056709,0.535467,0.490572,-1.21217,0.77364


In [20]:
# drop leaves the original untouched unless told otherwise (a pandas safeguard against losing data);
# inplace=True applies the removal to moneyball itself
moneyball.drop(index='Home Runs', inplace=True)
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.550215,0.424384,0.464826,0.591292,0.928418,0.507838
Hit Percentage,0.203347,0.038523,0.858152,0.233722,-0.838135,0.136123
Slugging Percentage,0.492254,0.693949,0.032256,0.05907,-1.606647,0.37651
On Base Percentage,0.104318,0.36276,0.503211,0.431501,0.394476,0.397131
On base plus slug,0.596573,1.056709,0.535467,0.490572,-1.21217,0.77364


In [21]:
# Disregard a column

# columns=... targets column labels (the keyword form of passing axis=1)
moneyball.drop(columns='Dodgers')

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
Batting Avg,0.550215,0.424384,0.464826,0.591292,0.928418
Hit Percentage,0.203347,0.038523,0.858152,0.233722,-0.838135
Slugging Percentage,0.492254,0.693949,0.032256,0.05907,-1.606647
On Base Percentage,0.104318,0.36276,0.503211,0.431501,0.394476
On base plus slug,0.596573,1.056709,0.535467,0.490572,-1.21217


In [22]:
# The dropped column is still in the dataframe: the drop was not applied in place

moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.550215,0.424384,0.464826,0.591292,0.928418,0.507838
Hit Percentage,0.203347,0.038523,0.858152,0.233722,-0.838135,0.136123
Slugging Percentage,0.492254,0.693949,0.032256,0.05907,-1.606647,0.37651
On Base Percentage,0.104318,0.36276,0.503211,0.431501,0.394476,0.397131
On base plus slug,0.596573,1.056709,0.535467,0.490572,-1.21217,0.77364


In [23]:
# Remove the column from moneyball itself rather than from a returned copy
moneyball.drop(columns='Dodgers', inplace=True)
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
Batting Avg,0.550215,0.424384,0.464826,0.591292,0.928418
Hit Percentage,0.203347,0.038523,0.858152,0.233722,-0.838135
Slugging Percentage,0.492254,0.693949,0.032256,0.05907,-1.606647
On Base Percentage,0.104318,0.36276,0.503211,0.431501,0.394476
On base plus slug,0.596573,1.056709,0.535467,0.490572,-1.21217


In [24]:
# Get the dataframe's shape as a (rows, cols) tuple
moneyball.shape

(5, 5)

##### Getting specific subsets of the dataframe

In [25]:
# Get Cody Bellinger's hit percentage: one cell addressed by (row label, column label)
moneyball.at['Hit Percentage', 'Cody Bellinger']

0.2337219488281241

In [26]:
# Get the hit, slugging and on-base percentages for Mike Trout, Jose Ramirez and Mookie Betts
# Syntax: df.loc[rows, cols]
stat_rows = ['Hit Percentage', 'Slugging Percentage', 'On Base Percentage']
player_cols = ['Mike Trout', 'Jose Ramirez', 'Mookie Betts']
moneyball.loc[stat_rows, player_cols]

Unnamed: 0,Mike Trout,Jose Ramirez,Mookie Betts
Hit Percentage,0.858152,-0.838135,0.038523
Slugging Percentage,0.032256,-1.606647,0.693949
On Base Percentage,0.503211,0.394476,0.36276
