# Pandas

* Pandas is an open source library built on top of NumPy
* It allows for fast data analysis, cleaning, and preparation
* It excels in performance and productivity
* It has built-in visualization features
* It can work with data from a wide variety of sources

### Installing pandas 
`conda install pandas`

In [1]:
#import 
import numpy as np
import pandas as pd

### Pandas Series

#####  Creating a Pandas Series

In [2]:
# Index labels, one per value. Labels may repeat: a Series index need not be unique.
labels = [
    'apartment', 'apartment', 'house', 'dorm', 'dorm', 'townhouse',
    'apartment', 'townhouse', 'apartment', 'compound', 'compound',
]
# The values themselves, listed in the same order as their labels
my_data = [
    'Rawda1', 'Rawda2', 'Naim', 'Rieber', 'Hedrick', 'Palazzo',
    'Atrium1', 'AtriumFoy', 'Atrium3', 'Venice', 'Soma',
]

# Pair every value with its label: data first, index second
pd.Series(my_data, index=labels)

apartment       Rawda1
apartment       Rawda2
house             Naim
dorm            Rieber
dorm           Hedrick
townhouse      Palazzo
apartment      Atrium1
townhouse    AtriumFoy
apartment      Atrium3
compound        Venice
compound          Soma
dtype: object

In [3]:
# NumPy arrays work just as well as lists, for both the values and the index
labels = np.array(['first', 'Second', 'Third', 'Fourth'])
my_data = np.array(['Rayan', 'Sara', 'Salma', 'Omar'])

# Keyword form of pd.Series(my_data, labels)
pd.Series(data=my_data, index=labels)

first     Rayan
Second     Sara
Third     Salma
Fourth     Omar
dtype: object

In [4]:
# A dict maps straight onto a Series: each key becomes an index label and
# each value becomes the data point stored under that label
labeled_data = dict(Snoop='Dogg', Wiz='Khalifa', Mac='Miller', Lil='Wayne', Young='Thug')
pd.Series(data=labeled_data)

Snoop       Dogg
Wiz      Khalifa
Mac       Miller
Lil        Wayne
Young       Thug
dtype: object

##### Pandas Series can hold a variety of data types

In [5]:
# One Series can mix ints, strings, built-in functions and lists;
# the resulting dtype is the generic `object`
mixed_values = [1, 'This is a string', sum, print, len, ['l1', 12, 'ok']]
pd.Series(mixed_values)

0                            1
1             This is a string
2      <built-in function sum>
3    <built-in function print>
4      <built-in function len>
5                 [l1, 12, ok]
dtype: object

##### Getting data from a Pandas Series

In [6]:
# Values in a Series are looked up by their index label

# Build the Series from a dict (the keys become the index labels)
crimesAgainstHumanity = {
    'East Asia': 'Uighurs',
    'South Asia': 'Rohingyas',
    'Levant': 'Palestinians',
    'Arabian Gulf': 'Yemenis',
}
refugees = pd.Series(data=crimesAgainstHumanity)

# Square brackets with a label return the value stored under that label
refugees['South Asia']

'Rohingyas'

##### Operations with Pandas Series

In [7]:
# Arithmetic between two numeric Series is aligned on the index labels

# First Series
learn = pd.Series({
    'podcasts': 1,
    'books': 3,
    'audiobooks': 2,
    'videos': 5,
    'courses': 10,
})
# Second Series: it has no 'podcasts' label, so most operations yield NaN there
learn2 = pd.Series({
    'books': 1,
    'audiobooks': 4,
    'videos': 15,
    'courses': 3,
})

# Apply each operator label by label and print the results one after another
for result in (
    learn + learn2,   # sum
    learn - learn2,   # difference
    learn * learn2,   # product
    learn / learn2,   # quotient
    learn % learn2,   # remainder
    learn ** learn2,  # power; 1 ** NaN evaluates to 1.0, so 'podcasts' is not NaN here
):
    print(result)
# ...

audiobooks     6.0
books          4.0
courses       13.0
podcasts       NaN
videos        20.0
dtype: float64
audiobooks    -2.0
books          2.0
courses        7.0
podcasts       NaN
videos       -10.0
dtype: float64
audiobooks     8.0
books          3.0
courses       30.0
podcasts       NaN
videos        75.0
dtype: float64
audiobooks    0.500000
books         3.000000
courses       3.333333
podcasts           NaN
videos        0.333333
dtype: float64
audiobooks    2.0
books         0.0
courses       1.0
podcasts      NaN
videos        5.0
dtype: float64
audiobooks    1.600000e+01
books         3.000000e+00
courses       1.000000e+03
podcasts      1.000000e+00
videos        3.051758e+10
dtype: float64


In [8]:
# Series of strings support + too: values are concatenated label by label

# First Series
ai = pd.Series({
    'podcast': 'Lex Fridman',
    'book': 'Homo Deus',
    'competition': 'XPRIZE',
    'deep learning': 'Geoffrey Hinton',
})
# Second Series, with the same labels
ai2 = pd.Series({
    'podcast': 'Talking Machines',
    'book': 'The Deep Learning Revolution',
    'competition': 'AI for GOOD',
    'deep learning': 'Yoshua Bengio',
})
# Join the matching entries, separated by a comma and a space
separator = ', '
ai + separator + ai2

podcast                    Lex Fridman, Talking Machines
book             Homo Deus, The Deep Learning Revolution
competition                          XPRIZE, AI for GOOD
deep learning             Geoffrey Hinton, Yoshua Bengio
dtype: object

### Pandas Dataframes

##### Creating a pandas df

In [9]:
# Build a 5x4 DataFrame of random demo values: one row per stat, one column per player.
# The numbers are NOT real statistics - they are absolute standard-normal draws, halved.
np.random.seed(42)  # fixed seed so that Restart & Run All reproduces the same table

stat_names = ['Batting Avg', 'Hit Percentage', 'Slugging Percentage', 'On Base Percentage', 'Home Runs']
player_names = ['Christian Yelich', 'Mookie Betts', 'Mike Trout', 'Cody Bellinger']

# np.random.randn is reached through the `np` alias imported at the top of the notebook,
# so this cell needs no import of its own
moneyball = pd.DataFrame(
    np.abs(np.random.randn(len(stat_names), len(player_names))) / 2,
    index=stat_names,      # row labels
    columns=player_names,  # column labels
)
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger
Batting Avg,0.246872,0.495953,0.241838,0.116918
Hit Percentage,0.489715,0.279867,0.061368,0.047368
Slugging Percentage,0.192616,0.677293,0.51145,0.195272
On Base Percentage,0.121835,0.055723,0.355265,0.185234
Home Runs,0.28283,0.024767,0.807916,0.329563


##### Getting data from a pandas df

In [10]:
# Indexing with a single column name yields a Series

# Mike Trout's value for every stat (the row labels become the Series index)
trout_stats = moneyball['Mike Trout']
trout_stats

Batting Avg            0.241838
Hit Percentage         0.061368
Slugging Percentage    0.511450
On Base Percentage     0.355265
Home Runs              0.807916
Name: Mike Trout, dtype: float64

In [11]:
# Indexing with a list of column names yields a DataFrame

# The columns for Yelich and Bellinger - the names go in a list inside the brackets
wanted_players = ['Christian Yelich', 'Cody Bellinger']
moneyball[wanted_players]

Unnamed: 0,Christian Yelich,Cody Bellinger
Batting Avg,0.246872,0.116918
Hit Percentage,0.489715,0.047368
Slugging Percentage,0.192616,0.195272
On Base Percentage,0.121835,0.185234
Home Runs,0.28283,0.329563


In [12]:
# .loc with a single row label yields a Series

# The `:` keeps every column of the 'Hit Percentage' row
moneyball.loc['Hit Percentage', :]

Christian Yelich    0.489715
Mookie Betts        0.279867
Mike Trout          0.061368
Cody Bellinger      0.047368
Name: Hit Percentage, dtype: float64

In [13]:
# .loc with a list of row labels yields a DataFrame

# Rows come back in the order listed; the `:` keeps every column
moneyball.loc[['Hit Percentage', 'Batting Avg'], :]

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger
Hit Percentage,0.489715,0.279867,0.061368,0.047368
Batting Avg,0.246872,0.495953,0.241838,0.116918


In [14]:
# .iloc selects rows by integer position (counting from 0) instead of by label
moneyball.iloc[[1, 3], :]

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger
Hit Percentage,0.489715,0.279867,0.061368,0.047368
On Base Percentage,0.121835,0.055723,0.355265,0.185234


##### Adding new data to a dataframe

In [15]:
# Add a new column by assigning to a column name that does not exist yet

# Draw one random demo value per existing row. Sizing the draw from the frame
# (rather than hard-coding 5) keeps the assignment valid if this cell is re-run
# after rows have been added to or dropped from moneyball.
moneyball['Jose Ramirez'] = np.abs(np.random.randn(len(moneyball))) / 2
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646
Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109
Home Runs,0.28283,0.024767,0.807916,0.329563,0.179312


In [16]:
# Add a new row by assigning to a .loc label that does not exist yet
on_base = moneyball.loc['On Base Percentage']
slugging = moneyball.loc['Slugging Percentage']
# The two rows are added column by column to form the new row
moneyball.loc['On base plus slug'] = on_base + slugging
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646
Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109
Home Runs,0.28283,0.024767,0.807916,0.329563,0.179312
On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673


In [17]:
# New columns can be computed from existing ones

# 'Dodgers' holds the row-wise average of the Betts and Bellinger columns
betts = moneyball['Mookie Betts']
bellinger = moneyball['Cody Bellinger']
moneyball['Dodgers'] = (betts + bellinger) / 2
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646,0.306436
Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717,0.163618
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565,0.436282
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109,0.120478
Home Runs,0.28283,0.024767,0.807916,0.329563,0.179312,0.177165
On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673,0.55676


##### Removing data from a dataframe

In [18]:
# drop() returns a copy without the given row (axis=0 means "look among the row labels")
moneyball.drop('Home Runs', axis=0)

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646,0.306436
Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717,0.163618
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565,0.436282
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109,0.120478
On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673,0.55676


In [19]:
# Note: drop() without inplace=True did not modify moneyball -
# the 'Home Runs' row is still in the actual dataframe
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646,0.306436
Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717,0.163618
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565,0.436282
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109,0.120478
Home Runs,0.28283,0.024767,0.807916,0.329563,0.179312,0.177165
On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673,0.55676


In [20]:
# Pass inplace=True to remove the row from moneyball itself; without it the
# original is left untouched, which guards against losing data by accident
moneyball.drop(labels='Home Runs', axis=0, inplace=True)
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646,0.306436
Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717,0.163618
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565,0.436282
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109,0.120478
On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673,0.55676


In [21]:
# Columns are dropped the same way as rows

# axis=1 tells drop() to look for the label among the columns
moneyball.drop(labels='Dodgers', axis=1)

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646
Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109
On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673


In [22]:
# The dropped column is still in the dataframe, because drop() was called without inplace=True

moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez,Dodgers
Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646,0.306436
Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717,0.163618
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565,0.436282
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109,0.120478
On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673,0.55676


In [23]:
# With inplace=True the 'Dodgers' column is removed from moneyball itself
moneyball.drop(labels='Dodgers', axis=1, inplace=True)
moneyball

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646
Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109
On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673


In [24]:
# Get the dataframe's shape as a (rows, cols) tuple
moneyball.shape

(5, 5)

##### Getting specific subsets of the dataframe

In [25]:
# A (row label, column label) pair picks out one cell: Cody Bellinger's hit percentage
stat_label, player_label = 'Hit Percentage', 'Cody Bellinger'
moneyball.loc[stat_label, player_label]

0.0473683366447716

In [26]:
# df.loc[rows, cols] accepts a list on each side and returns the sub-table
# where those rows and columns intersect, in the order given
stat_rows = ['Hit Percentage', 'Slugging Percentage', 'On Base Percentage']
player_cols = ['Mike Trout', 'Jose Ramirez', 'Mookie Betts']
moneyball.loc[stat_rows, player_cols]

Unnamed: 0,Mike Trout,Jose Ramirez,Mookie Betts
Hit Percentage,0.061368,0.575717,0.279867
Slugging Percentage,0.51145,0.671565,0.677293
On Base Percentage,0.355265,0.178109,0.055723


##### Conditional Selection

In [27]:
# Keep only the rows (stats) where Mike Trout's value is above 0.3
trout_above_cutoff = moneyball['Mike Trout'] > 0.3  # boolean Series, one entry per row
# Indexing with the boolean Series keeps the rows marked True
goodStats = moneyball[trout_above_cutoff]
goodStats

Unnamed: 0,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565
On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109
On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673


In [28]:
# A conditionally selected subset is a regular DataFrame, so it can be indexed again

# Yelich's and Betts' values on the rows that were kept in goodStats
yelich_and_betts = ['Christian Yelich', 'Mookie Betts']
goodStats[yelich_and_betts]

Unnamed: 0,Christian Yelich,Mookie Betts
Slugging Percentage,0.192616,0.677293
On Base Percentage,0.121835,0.055723
On base plus slug,0.314451,0.733015


In [29]:
# Conditional selection and column selection can be stacked back to back

# Mike Trout's own values on the rows where he is below 0.3
trout_below_cutoff = moneyball['Mike Trout'] < 0.3
moneyball[trout_below_cutoff]['Mike Trout']

Batting Avg       0.241838
Hit Percentage    0.061368
Name: Mike Trout, dtype: float64

In [30]:
# Show where Ramirez and Bellinger are above 0.3.
# The mask here is a boolean DataFrame, not a Series: no rows are dropped,
# and cells that fail the condition show up as NaN instead
above_cutoff = moneyball[['Cody Bellinger', 'Jose Ramirez']] > 0.3
moneyball[above_cutoff][['Jose Ramirez', 'Cody Bellinger']]

Unnamed: 0,Jose Ramirez,Cody Bellinger
Batting Avg,0.872646,
Hit Percentage,0.575717,
Slugging Percentage,0.671565,
On Base Percentage,,
On base plus slug,0.849673,0.380506


##### Conditional Selection with multiple conditions 

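Combine conditions with `&` (and) or `|` (or). Wrap each condition in parentheses, because `&` and `|` bind more tightly than comparison operators such as `<` and `>`.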

In [31]:
# Rows where Mike Trout is below 0.3 AND Cody Bellinger is above 0.3
trout_low = moneyball['Mike Trout'] < 0.3
bellinger_high = moneyball['Cody Bellinger'] > 0.3
# & keeps a row only when both conditions hold for it
moneyball[trout_low & bellinger_high][['Cody Bellinger', 'Mike Trout']]

Unnamed: 0,Cody Bellinger,Mike Trout


In [32]:
# Rows where Mookie Betts OR Cody Bellinger is above 0.3
betts_high = moneyball['Mookie Betts'] > 0.3
bellinger_high = moneyball['Cody Bellinger'] > 0.3
# | keeps a row when at least one of the conditions holds for it
moneyball[betts_high | bellinger_high][['Cody Bellinger', 'Mookie Betts']]

Unnamed: 0,Cody Bellinger,Mookie Betts
Batting Avg,0.116918,0.495953
Slugging Percentage,0.195272,0.677293
On base plus slug,0.380506,0.733015


##### Resetting the labeled indices to a column

In [33]:
# Move the row labels into an ordinary column (named 'index') and number the rows 0, 1, 2, ...
# This returns a new frame; moneyball only changes if inplace=True is passed
moneyball.reset_index(drop=False)

Unnamed: 0,index,Christian Yelich,Mookie Betts,Mike Trout,Cody Bellinger,Jose Ramirez
0,Batting Avg,0.246872,0.495953,0.241838,0.116918,0.872646
1,Hit Percentage,0.489715,0.279867,0.061368,0.047368,0.575717
2,Slugging Percentage,0.192616,0.677293,0.51145,0.195272,0.671565
3,On Base Percentage,0.121835,0.055723,0.355265,0.185234,0.178109
4,On base plus slug,0.314451,0.733015,0.866715,0.380506,0.849673


In [34]:
# Worked example: resetting the index and then setting a new one

# One-column DataFrame of songs, labelled by artist
songs = [
    'Alf Leila W Leila',
    'Ah W Noss',
    'Mahragan Bent El Geran',
    'Desert Rose',
    'From Jeddah to LA',
]
artists = ['Umm Kulthum', 'Nancy Ajram', 'Hassan Shakosh', 'Sting', 'Qusai']
df = pd.DataFrame(data=songs, index=artists, columns=['Song'])
df

Unnamed: 0,Song
Umm Kulthum,Alf Leila W Leila
Nancy Ajram,Ah W Noss
Hassan Shakosh,Mahragan Bent El Geran
Sting,Desert Rose
Qusai,From Jeddah to LA


In [35]:
# Replace the row labels with 0, 1, 2, 3, ... directly on df;
# the old labels are kept as a regular column named 'index'
df.reset_index(drop=False, inplace=True)

In [36]:
# Attach a genre to every song as a new column
song_genres = ['Classical Egyptian', 'Arabic Pop', 'Sha3by', 'Rock', 'Hip Hop']
df['Genre'] = song_genres
# Use that column as the row labels. set_index returns a new frame;
# df itself keeps its numeric index because inplace=True is not passed
df.set_index(keys='Genre')

Unnamed: 0_level_0,index,Song
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Classical Egyptian,Umm Kulthum,Alf Leila W Leila
Arabic Pop,Nancy Ajram,Ah W Noss
Sha3by,Hassan Shakosh,Mahragan Bent El Geran
Rock,Sting,Desert Rose
Hip Hop,Qusai,From Jeddah to LA


In [37]:
# For clarity, store the old index labels under a name that says what they are.
# pop() removes the 'index' column from df and hands it back, and the
# assignment re-attaches it as a new last column called 'Artist'
df['Artist'] = df.pop('index')
df

Unnamed: 0,Song,Genre,Artist
0,Alf Leila W Leila,Classical Egyptian,Umm Kulthum
1,Ah W Noss,Arabic Pop,Nancy Ajram
2,Mahragan Bent El Geran,Sha3by,Hassan Shakosh
3,Desert Rose,Rock,Sting
4,From Jeddah to LA,Hip Hop,Qusai
