# Introduction to Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__

'1.0.3'

## Series Datatype

**Series can be declared through list, scalar, dict and ndarray**

In [13]:
# Declare via list
a = pd.Series([9, 8, 7, 6])# Series auto-index
a, a.dtype #int 64 by default

(0    9
 1    8
 2    7
 3    6
 dtype: int64, dtype('int64'))

In [47]:
# Specify Index
b = pd.Series([9, 8, 7, 6], index = ['a','b','c','d'], name = "From list")
b

a    9
b    8
c    7
d    6
Name: From list, dtype: int64

In [46]:
#Declare via a scalar
#Index has to be specified 
s = pd.Series(25, index = ['a', 'b', 'c'], name = "From a single value")
s

a    25
b    25
c    25
Name: From a single value, dtype: int64

In [45]:
#Declare from dict
d = pd.Series({'a':9, 'b':8, 'c':7})
print(d)
e = pd.Series({'a':9, 'b':8, 'c':7}, 
              index=['c', 'a', 'b', 'd'],
              name = "From Dict")#customize order
print(e) #e is float

a    9
b    8
c    7
dtype: int64
c    7.0
a    9.0
b    8.0
d    NaN
Name: From Dict, dtype: float64


In [51]:
#NaN is NOT None
#NaN is float
#Use np.isnan() to check
print(np.nan == None, type(np.nan), np.isnan(np.nan))

False <class 'float'> True


In [44]:
#Declare from ndarray
m = pd.Series(np.arange(5),
             name = 'From ndarry')
print(m)

0    0
1    1
2    2
3    3
4    4
Name: From ndarry, dtype: int64


In [43]:
#we can modify the index and names later
m.index = np.arange(9,4,-1)
m.index = m.index.rename('Named index', inplace=False) #default is false
print(m.index)
m.name = 'Name it here'
print(m)


Int64Index([9, 8, 7, 6, 5], dtype='int64', name='Named index')
Named index
9    0
8    1
7    2
6    3
5    4
Name: Name it here, dtype: int64


## Query of Series

In [52]:
b = pd.Series([1, 2, 3, 4], index= ['a', 'b', 'c', 'd'])
b

a    1
b    2
c    3
d    4
dtype: int64

In [53]:
b.values
#Value is a ndarray

array([1, 2, 3, 4])

### Select vs Slice

In [92]:
#select
b[0]

1

In [90]:
#slice
b[[0]]

a    1
dtype: int64

### Slice using user-index

In [81]:
b[['a','b','c']]

a    1
b    2
c    3
dtype: int64

In [87]:
b.loc[['a','b','c']]

a    1
b    2
c    3
dtype: int64

### Slice using auto-index

In [57]:
b[0:3]

a    1
b    2
c    3
dtype: int64

In [80]:
b.iloc[0:3]

a    1
b    2
c    3
dtype: int64

In [88]:
b.iloc[[1,2,3]]

b    2
c    3
d    4
dtype: int64

In [58]:
b[b > b.median()] #Logic selection

c    3
d    4
dtype: int64

In [59]:
np.exp(b) #still return Series

a     2.718282
b     7.389056
c    20.085537
d    54.598150
dtype: float64

## Math Operations

Use `%%timeit -n x` to run the cell x times and time the averge

In [112]:
s = pd.Series(np.random.randint(0,1000,1000))
s.head()

0    752
1    110
2    639
3    184
4    406
dtype: int64

In [126]:
%%timeit -n 100
summary = 0
for item in s:
    summary += item
#Slowest

189 µs ± 35.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [125]:
%%timeit -n 100
np.sum(s)
#Between

126 µs ± 44.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [124]:
%%timeit -n 100
s.sum()
#Fasted!

91.2 µs ± 19.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Broadcasting vs Unpacking

In [128]:
#broadcasting map operations to every item
s+=2 
s.head()

0    756
1    114
2    643
3    188
4    410
dtype: int64

In [140]:
#avoid use unpacking. Very slow
%%timeit -n 10
for label, value in s.iteritems(): #zipped list
    s.loc[label]= value+2

63.8 ms ± 2.99 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### **Series shares some features as Dictionary**

In [61]:
print('c' in b) #search user-defined index -"key"
print(1 in b) # not default index

True
False


In [62]:
b.get('f', 100) #return b['f']; if Null, return 100

100

In [63]:
# Alignment 
a = pd.Series([1, 2, 3], ['c', 'd', 'e'])
b = pd.Series([9, 8, 7, 6], ['a', 'b', 'c', 'd'])

a + b #add only common index, retain others as NaN

a    NaN
b    NaN
c    8.0
d    8.0
e    NaN
dtype: float64

In [71]:
# "Name" 
b = pd.Series([9, 8, 7, 6], ['a', 'b', 'c', 'd'])
print('before:','\n', b)
# Give a name to Series and its Index
b.name = 'A Series Object'
b.index.name = 'A Index Object'
print('after:','\n', b)

before: 
 a    9
b    8
c    7
d    6
dtype: int64
after: 
 A Index Object
a    9
b    8
c    7
d    6
Name: A Series Object, dtype: int64


#### `.append()` to return a new series with appended data

## DataFrame Datatype

In [141]:
import pandas as pd
import numpy as np

**Dataframe can be declared via 2D ndarray, 1D ndarray, List, Dict, Tuple, Serires, or other Dataframe**

In [149]:
#Declare via 2D ndarray
d = pd.DataFrame(np.arange(10).reshape(2,5), index = ['R1','R2'])
d

Unnamed: 0,0,1,2,3,4
R1,0,1,2,3,4
R2,5,6,7,8,9


In [285]:
#Declare via 1D list in Dict (column-wise)
dt = {'C1': [1, 2, 3, 4],\
      'C2': [9, 8, 7, 6],\
      'C3': ['x', 'y', 'z','']}

#Dict Key -->column index
#List Index --> row index
#List element --> data
d = pd.DataFrame(dt, index = ['R1', 'R2', 'R3', 'R4'] ) #index is optional
d

Unnamed: 0,C1,C2,C3
R1,1,9,x
R2,2,8,y
R3,3,7,z
R4,4,6,


In [305]:
#Declare via 1D ndarray in Dict (column-wise)
dt = {'C1': pd.Series([1, 2, 3], index = ['R4','R3','R1']),\
      'C2': pd.Series([9, 8, 7, 6],index = ['R1','R2','R3','R4']),\
     'C3': pd.Series(['x', 'y', 'z'], index = ['R1','R2','R3'])}

#Dict Key -->column index
#Array Index --> row index
#Array element --> data
d = pd.DataFrame(dt)
d

Unnamed: 0,C1,C2,C3
R1,3.0,9,x
R2,,8,y
R3,2.0,7,z
R4,1.0,6,


In [154]:
d.index #Row index

Index(['R1', 'R2', 'R3', 'R4'], dtype='object')

In [155]:
d.columns #Column index

Index(['C1', 'C2'], dtype='object')

In [156]:
d.values #return tranposed ndarray 

array([[ 1.,  9.],
       [ 2.,  8.],
       [ 3.,  7.],
       [nan,  6.]])

In [3]:
import pandas as pd

In [6]:
#df from Series (row-wise)
#missing value friendly
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food', #missing Cost is OK.
                        }, name='Store 1') #can declare row index here or later
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50}, name='Store 2')
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00}, name='Store 3')
df = pd.DataFrame([purchase_1, purchase_2, purchase_3])
df

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0


In [359]:
#df from Dicts (row-wise)
#missing value friendly 
purchase_1 = {'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        }
purchase_2 = {'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50}
purchase_3 = {'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00}
df = pd.DataFrame([purchase_1, purchase_2, purchase_3])
df.index = ['Store 1', 'Store 2', 'Store 3']
df

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0


Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,22.5
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0


### Index Manipulation

In [307]:
d

Unnamed: 0,C1,C2,C3
R1,3.0,9,x
R2,,8,y
R3,2.0,7,z
R4,1.0,6,


**Index is Immutable. Index operations returns a new index**

In [86]:
d.index, d.columns
#row index, columns

(Index(['R1', 'R2', 'R3', 'R4'], dtype='object'),
 Index(['C1', 'C2', 'C3'], dtype='object'))

In [308]:
#re-oder the rows
d.reindex(index = ['R4', 'R3', 'R2', 'R1'])#returns a new df

Unnamed: 0,C1,C2,C3
R4,1.0,6,
R3,2.0,7,z
R2,,8,y
R1,3.0,9,x


In [364]:
#re-oder / add new rows/clumns
d.reindex(columns = ['C3','CNew' 'C2', 'C1'])#Row index

Unnamed: 0,C3,CNewC2,C1
R1,x,,3.0
R2,y,,
R3,z,,2.0
R4,,,1.0


In [312]:
#insert new column to a specific position
new_columns = d.columns.insert(1, 'CNew') #insert before old 2nd column
print(new_columns)#returns an index([])
new_df = d.reindex(columns = new_columns, fill_value = 100)
new_df

Index(['C1', 'New', 'C2', 'C3'], dtype='object')


Unnamed: 0,C1,New,C2,C3
R1,3.0,100,9,x
R2,,100,8,y
R3,2.0,100,7,z
R4,1.0,100,6,


In [314]:
#add new index to index
new_index = d.index.insert(1, 'RNew')
print(new_columns)#returns an index([])
new_df = d.reindex(index = new_index, fill_value = 100)
new_df

Index(['C1', 'New', 'C2', 'C3'], dtype='object')


Unnamed: 0,C1,C2,C3
R1,3.0,9,x
RNew,100.0,100,100
R2,,8,y
R3,2.0,7,z
R4,1.0,6,


In [367]:
#change names of index/columns
new_df = d.rename(index={'R1':'NewR1'})
new_df

Unnamed: 0,C1,C2,C3
NewR1,3.0,9,x
R2,,8,y
R3,2.0,7,z
R4,1.0,6,


In [316]:
#delete from index
new_index = d.index.delete(0)
print(new_columns)#returns an index([])
new_df = d.reindex(index = new_index)
new_df

Index(['C1', 'New', 'C2', 'C3'], dtype='object')


Unnamed: 0,C1,C2,C3
R2,,8,y
R3,2.0,7,z
R4,1.0,6,


### Select and Slice Dataframe

use `[ ] `for column, <br>use `.loc [ ] `or `.iloc [ ]`  for row.

#### Select columns vs row

In [179]:
#select one columns. 
df['Name'] #returns a Series with CIndex as name

Store 1    Chris
Store 2    Kevyn
Store 3    Vinod
Name: Name, dtype: object

In [12]:
#select multiple columns
df[['Item Purchased', 'Cost']]

Unnamed: 0,Item Purchased,Cost
Store 1,Dog Food,
Store 2,Kitty Litter,2.5
Store 3,Bird Seed,5.0


In [182]:
#select a row
df.loc['Store 1'] #returns a Series with RIndex as name

Name                 Chris
Item Purchased    Dog Food
Cost                  22.5
Name: Store 1, dtype: object

In [184]:
#select multiple rows
df.loc[['Store 2','Store 3']]

Unnamed: 0,Name,Item Purchased,Cost
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0


#### `.iloc[ ]` for auto row index####

In [191]:
#select
df.iloc[1]

Name                     Kevyn
Item Purchased    Kitty Litter
Cost                       2.5
Name: Store 2, dtype: object

In [190]:
#multi select
df.iloc[[1,2]]

Unnamed: 0,Name,Item Purchased,Cost
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0


In [189]:
#slice
df.iloc[1:]

Unnamed: 0,Name,Item Purchased,Cost
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0


#### Select vs Slice

In [177]:
#select 
df.loc['Store 1'] #reduce to series with possible

Name                 Chris
Item Purchased    Dog Food
Cost                  22.5
Name: Store 1, dtype: object

In [171]:
#slice
df.loc[['Store 1']] #keep dataframe

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,22.5
Store 1,Kevyn,Kitty Litter,2.5


### Cross-sectional selection

only use `.loc[R, C]` for cross-sectional selection

In [213]:
#returns a reference to df
#change the reference can change df
df.loc['Store 1', 'Name']

'Chris'

In [216]:
#multiple selection
df.loc[:, ['Item Purchased', 'Cost']]

Unnamed: 0,Item Purchased,Cost
Store 1,Dog Food,22.5
Store 2,Kitty Litter,2.5
Store 3,Bird Seed,5.0


### Chained selection

In [201]:
#chained selection returns a copy of data.
#chagne the copy will NOT change the original df
df.loc['Store 1']['Cost'] = 0
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc['Store 1']['Cost'] = 0


Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,22.5
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0


In [37]:
try:
    d['R1'] 
except KeyError:
    print('Key Error:Error!!! \
    Because d[] only search keys/column\
    Use .loc[] or .iloc[] for Row')

Key Error:Error!!!     Because d[] only search keys/column


In [44]:
#Use .loc[] for row user-index
print(type(d.loc['R1']))#Series
d.loc['R1']


<class 'pandas.core.series.Series'>


C1    1
C2    9
Name: R1, dtype: int64

In [41]:
#Use .iloc for implict row index
d.loc['R1'] == d.iloc[0]


C1    True
C2    True
Name: R1, dtype: bool

In [49]:
# Select one element of the table
d['C1']['R1'] #[Key/Column][Row/Index]

1

### Add, modify and remove rows/columns

In [332]:
#use a copy for demonstration
df_copy = df.copy()

In [334]:
#add row/column using Series. 
#NaN generated for missing data
df_copy.loc['Store 4'] = pd.Series({'Name':'Kevin', 'Item Purchased':'Rice'})
df_copy['Year'] = pd.Series({'Store 1':'2010','Store 2':'2011','Store 3':'2012', 'Store 4':'2020'})
df_copy

Unnamed: 0,Name,Item Purchased,Cost,Store 1,Store 2,Store 3,Store 4
Store 1,Chris,Dog Food,22.5,,,,
Store 2,Kevyn,Kitty Litter,2.5,,,,
Store 3,Vinod,Bird Seed,5.0,,,,
Store 4,Kevin,Rice,,,,,
Year,,,,2010.0,2011.0,2012.0,2020.0


add a row using `.append()`

In [338]:
#same as the .loc[ ] method
df_copy.append(pd.Series({'Name':'Kevin', 'Item Purchased':'Rice'}, name = 'Store 4'))

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,22.5
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0
Store 4,Kevin,Rice,
Store 4,Kevin,Rice,


In [277]:
#Dict re-write the index/columns, leaving NaN for unspecified.
df_copy.loc['Store 4'] = {'Name':'Kevin', 'Item Purchased':'Bread'}
df_copy

Unnamed: 0,Name,Item Purchased,Cost,Year
Store 1,Chris,Dog Food,22.5,2010.0
Store 2,Kevyn,Kitty Litter,2.5,2011.0
Store 3,Vinod,Bird Seed,5.0,2012.0
Store 4,Kevin,Bread,,


In [280]:
#drop data. returns a new df
df_copy = df_copy.drop(columns=['Year'], index=['Store 4']) #default inplace=False
df_copy

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,22.5
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0


## Missing values

In [370]:
df = pd.read_csv('log.csv')
df.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


In [None]:
#sort by index
df = df.set_index('time')#set 'time' as index
df = df.sort_index() #sort by 'time'
df

In [373]:
#Use .fillna() to fill all NaN and None
df = df.fillna(method='ffill') #fill value one row above
df.head()

#can also fill with same length Series

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,False,10.0
2,1469974544,cheryl,intro.html,9,False,10.0
3,1469974574,cheryl,intro.html,10,False,10.0
4,1469977514,bob,intro.html,1,False,10.0
