# Pandas

* Manipulate & analyse ***tabular*** data
* Data structures: ***DataFrame*** & ***Series***

---

In [1]:
!pip install pandas


[notice] A new release of pip available: 22.1.2 -> 22.3
[notice] To update, run: python.exe -m pip install --upgrade pip


##### Import Packages


In [2]:
import pandas as pd
import numpy as np

# Show the installed pandas version so readers can match it against the recorded outputs.
pd.__version__

'1.3.2'

# 1. Series
* 1D-list structure
* with ***index*** but no column
* a column in a DataFrame

In [4]:
# Build a Series from a plain list and give every item its own label.
# Leaving out `index=` would fall back to the default integer labels 0..3:
#   pd.Series(['red', 'blue', 'green', 'yellow'])
ser = pd.Series(
    data=['red', 'blue', 'green', 'yellow'],
    index=list('abcd'),
)

# type(ser)  # the class of the object
ser

a       red
b      blue
c     green
d    yellow
dtype: object

In [6]:
ser.ndim  # number of dimensions (a Series is 1-D)
# ser.shape  # (number of items,)
# ser.size   # number of items

# ser.index           # the index labels
# ser.describe()      # summary statistics
# ser.value_counts()  # how many times each distinct value occurs

# ser.iloc[3]  # select by position (plain ser[3] on a labelled index is deprecated in pandas 2.1+)
# ser[1:]      # positional slice

1

# 2. DataFrame 

* 2D-table structure
* most frequently used structure in pandas
* with ***columns*** & ***index***
* index could be number or string

In [7]:
# Build a DataFrame from a dictionary: each key becomes a column name and
# each value (a list or an existing Series) supplies that column's data.
df = pd.DataFrame(
    data={
        'name': ['Paul', 'John', 'Mary', 'Jane'],
        'phone': ['92435678', '90909090', '09090909', '12435678'],  # kept as strings
        'gender': ['male', 'male', 'female', 'female'],
        'age': [18, 22, 23, np.nan],  # np.nan marks the missing age
        'colA': ser,  # an existing Series used as a column
    },
    index=list('abcd'),
)
df

Unnamed: 0,name,phone,gender,age,colA
a,Paul,92435678,male,18.0,red
b,John,90909090,male,22.0,blue
c,Mary,9090909,female,23.0,green
d,Jane,12435678,female,,yellow


In [8]:
type(df)  # the class of the object
# df.ndim   # number of dimensions
# df.shape  # (rows, columns)
# df.size   # total number of cells: rows x columns

# df.columns  # column labels
# df.index    # row labels
# df.set_index('name')  # returns a new frame that uses the 'name' column as its index

# df.info()      # column dtypes and non-null counts
# df.describe()  # summary statistics
# df.count()     # number of non-null values per column

# df.shape[0]  # number of rows

pandas.core.frame.DataFrame

## <font color=red>Column</font>

In [9]:
# Selecting a single column returns a Series
# df.name     # dot notation
# df['name']  # bracket notation

# Selecting with a list of names ("fancy index") returns a DataFrame
# df[['name']]         # a single-column DATA FRAME, not a Series!
# df[['name','age']]   # select multiple columns
# df[['name','name']]  # names may be repeated; still returns a DataFrame

df['color'] = ser # add a column in place
# df.assign(color2=ser) # assign() returns a NEW frame with the extra column; df itself is unchanged
df = df.assign(color2=ser)

# WARNING: to add a column, always use ['colname'] instead of dot notation
# df.color = ser  # updating an EXISTING column through dot notation works
# df.color3 = ser # does NOT create a column: it only sets a Python attribute on the object

In [10]:
# filter() selects columns or rows by their LABELS (never by the cell values)

df.filter(items=['name']) # keep the listed columns; the result is a DataFrame
# df.filter(items=['name','age'], axis='columns') # same, with the axis spelled out

# df.filter(like='col')   # labels that contain the substring 'col'
# df.filter(regex='r$')   # labels that end with the letter 'r' (e.g. 'gender', 'color')

# df.filter(items=['a','b'], axis='index') # select rows by index label

Unnamed: 0,name
a,Paul
b,John
c,Mary
d,Jane


In [11]:
# df.values  # the underlying NumPy array

# First item of the 'color' column, selected by position.
# `.iloc[0]` is explicit positional access; `df['color'][0]` relied on the integer
# fallback of a string-labelled Series, which is deprecated in pandas 2.1+.
df['color'].iloc[0]
# df.age.mean()
# df.age.median()
# df.age.max()     # largest value
# df.age.idxmax()  # index label of the largest value
# df.age = df.age + 1  # vectorised update of the whole column
# df

# df.gender.value_counts()  # occurrences of each distinct value
# -(df.age)                 # negate every item

'red'

### Update data

In [12]:
# Assigning a scalar sets every row of the column to that value
# df.color2 = 'unknown'  # dot notation works because the column already exists
df['color2'] = '-'
# df['color2'] = df['color']             # copy another column
# df['color2'] = df['color'] + ' color'  # element-wise string concatenation
# df['colB'] = df['colA'] * 2  # computes every item of colB from colA
# df.colB = df.colA * 2        # same update, once colB exists
# df.colB[1]          # reading a single value this way works ...
# df.colB[1] = 'AAA'  # ... but assigning through chained indexing is not reliable

# Use the loc / iloc indexers to update a single value (covered later)
# df.loc['b', 'colB'] = 'AAA'  # label based
# df.iloc[1,5] = 'CCC'         # position based
# df

In [13]:
# Work on an independent copy so that the original `df` stays untouched.
tmpDf = df.copy()  # create a new copy of dataframe

# Assigning through a list of column names overwrites those columns of `tmpDf` only.
tmpDf[['phone', 'gender']] = 'X' # the original dataframe is NOT changed, because tmpDf is a copy
df  # still shows the original phone / gender values

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,18.0,red,red,-
b,John,90909090,male,22.0,blue,blue,-
c,Mary,9090909,female,23.0,green,green,-
d,Jane,12435678,female,,yellow,yellow,-


In [12]:
# use loc indexer (label based) to update data
# Syntax: df.loc[row(index), col] # MUST USE LABEL
# tmpDf.loc[:, ['colA','color2']] = 'red'

# use iloc indexer (position based) to update data
# Syntax: df.iloc[row(index), col] # MUST USE POSITION
# tmpDf.iloc[:, 4] = 'blue'

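# Why does this work? loc / iloc assign directly into the DataFrame itself,
# instead of into a temporary Series / DataFrame produced by chained indexing.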

##### Column.str functions
* https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#string-methods

In [14]:
# The .str accessor applies a string method to every item of the Series
df['name'].str.upper()
# df.gender.str.count('e')    # occurrences of the letter 'e'
# df.phone.str.count('^90')   # 1 if the value starts with '90', else 0 (regex)
# df.gender.str.len()         # length of each string
# df.name.str.split('a')      # returns a Series of lists
# df.name.str.split('a', expand=True)  # expands the pieces into a DataFrame
# df.name.str.contains('^P')  # regex match -> boolean Series
# df.name.str.startswith('P')
# df.name.str.endswith('e')
# df.phone.str.replace('90', 'Ninty')

a    PAUL
b    JOHN
c    MARY
d    JANE
Name: name, dtype: object

In [15]:
# Most .str methods mirror Python's native string methods
# '90909090'.replace('90','Ninty')
# '90909090'.count('90')

# str.findall takes a regular expression and returns a Series of lists
# df.phone.str.findall('^9')
# df.phone.str.findall('90')
# Raw string: '\d' in a normal string literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning from Python 3.12).
df.phone.str.findall(r'\d\d\d\d') # 4-digit groups

# The equivalent without the .str accessor uses the re module
# import re
# re.findall('^9', '90909090')

a    [9243, 5678]
b    [9090, 9090]
c    [0909, 0909]
d    [1243, 5678]
Name: phone, dtype: object

#### **Replace data** - replace()
* series
* data frame

In [15]:
# series

# df.name.replace('Paul', 'Mr. Chan', inplace=False) # replace a string (exact match)
# df.gender.replace({'male':'M', 'female':'F'})  # replace with values in dictionary
# df.name.replace(['Paul','John'], '****')
# df.name.replace('a', '@', regex=True) # regular expression - series example
# df.name.str.replace('a','@') # str.replace equivalent
# df.name.replace('^Pa', 'Mr. P@', regex=True) # regular expression
# df.name.replace(regex='^Pa', value='Mr. P@') # regular expression = to_replace
# df.name.replace(regex={'^Pa':'Mr. P@', 'ry':'rY'}) # regular expression = dict with regex & value

# df.phone.replace(92435678, 99998888) # Not working, df.phone.dtype is Object
# df.phone.astype(int).replace(92435678, 99998888) # convert it to int before replace

# df.age.replace(np.nan, df.age.mean()) # can also use fillna(0)

In [16]:
# data frame

# df.replace('a', '@', regex=True) # regular expression
# df.replace(['Paul','John','male'], '****') # list example (for all columns)

# key of dictionary is now column name!
# df.replace({'name':'Paul', 'gender':'male'}, value='xxxx') # replace name=X OR gender=Y with a value
# df.replace({'name':'Paul', 'gender':'male'}, value='xxxx', regex=True) # now X & Y are regular expressions

# df.replace({'a':'@', 'l':'!'}, regex=True) # if value is None / omitted, dict-key is to_replace, dict-value is value

#### **Update data**

1. **apply()** - series & df
1. applymap() - df only
1. map() - series only

##### 1. apply()
* for **series & data frame**
* accept **function**

![apply ser](apply-ser.png)


![apply df1](apply.png)
![apply df2](apply-h.png)

In [17]:
df  # display the frame that the apply() examples below operate on

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,18.0,red,red,red
b,John,90909090,male,22.0,blue,blue,blue
c,Mary,9090909,female,23.0,green,green,green
d,Jane,12435678,female,,yellow,yellow,yellow


In [18]:
# 1) pass function
# df.apply(max) # on data frame
# df.name.apply(len) # apply len function on each item of this series

# df.age.apply(np.square) # apply np.square function to each item of this series

# 2a) pass custom function (DF)
# def max_df(x): # x is series (each column)
#     print(type(x)) # it's a series
#     return x.max()

# df.apply(max_df) # apply on df's columns, return a series

In [19]:
# 2b) pass custom function (Series)
# def upper_ser(x):
# #     print(type(x)) # x is the val of each column data
#     return x.upper()

# df.name.apply(upper_ser) # apply on series's item

# 3) pass custom function (with argument)
# def upper_ser_arg(x, length):
# #     print(type(x)) # x is the val of each column data
#     return x.upper()[:length]

# df.name.apply(upper_ser_arg, args=(4,)) # apply on series's item (comma is required)

In [16]:
# 4) anonymous function
# df.apply(lambda x: x.max()) # on df: x is each column (a Series)
# df.name.apply(lambda x: x.upper()) # on series: x is each item

# 5) for df.apply, you can pass axis
# * 0 or 'index': apply function to each column. [DEFAULT]
# * 1 or 'columns': apply function to each row.
# df.apply(max, axis='index') # each column max

# df[['age']].apply(np.sum, axis = 'index') # compute sum of each column
df[['age']].apply(np.sum, axis = 'columns') # compute sum of each row; a row whose only value is NaN sums to 0.0

a    18.0
b    22.0
c    23.0
d     0.0
dtype: float64

In [17]:
# `result_type` is accepted by DataFrame.apply only, and only acts when axis=1 ('columns').
# It controls how the value returned for each row is assembled into the result:
#   'expand'    - list-like results are turned into columns (returns a DataFrame if possible)
#   'reduce'    - returns a Series if possible rather than expanding list-like results; the opposite of 'expand'
#   'broadcast' - results are broadcast to the original shape of the DataFrame; the original index and columns are retained

listLikeResult = [0, 1, 2, 3, 4, 5, 6]
# df.apply(lambda row: listLikeResult, axis=1, result_type='reduce')     # returns a Series
# df.apply(lambda row: listLikeResult, axis=1, result_type='expand')     # returns a DataFrame if possible
# df.apply(lambda row: listLikeResult, axis=1, result_type='broadcast')  # keeps the original shape, column names & index

# A separate name for the scalar case, so `listLikeResult` always stays a list.
scalarResult = 1
# df.apply(lambda row: scalarResult, axis=1, result_type='expand')  # no difference between 'expand' and 'reduce' for a scalar
df.apply(lambda row: scalarResult, axis=1, result_type='broadcast')  # broadcast keeps the original shape


Unnamed: 0,name,phone,gender,age,colA,color,color2
a,1,1,1,1,1,1,1
b,1,1,1,1,1,1,1
c,1,1,1,1,1,1,1
d,1,1,1,1,1,1,1


##### 2. applymap()
* **data frame only**
* apply a function to **every element** of a DataFrame

![applymap](applymap.png)

In [19]:
# 1) function
# NOTE: pandas 2.1+ deprecates DataFrame.applymap in favour of DataFrame.map;
# applymap is kept here because this notebook reports pandas 1.3.2.
df[['age']].applymap(np.square) # square every element in the data frame; NaN stays NaN

# 2) custom function
# def count_character(x):
#     # print(type(x))  # x is a single cell value
#     return len(x)

# df[['name', 'gender','phone']].applymap(count_character)


Unnamed: 0,age
a,324.0
b,484.0
c,529.0
d,


##### 3. map()
* **series only**
* accept **dictionary** or **function** for substitution

![map](map.png)

In [21]:
# 1) substitute values using a dictionary
sub_dict = {'Paul':'Mr. Chan', 'John':'Mr. Wong'}
df.name.map(sub_dict) # values whose key is not found in the dict become NaN
# df.name.replace(sub_dict) # unlike map(), replace() leaves unmatched values unchanged

# 2) substitute values with a format
# df.name.map('Hello, {}'.format)

# 3) substitute values with anonymous function
# df.name.map(lambda x: f'Hey, {x}') # x is each item

# 4a) substitute values with an existing function
# df.age.map(np.square)

# 4b) substitute values with a custom function
# def changeName(name):
#     name = 'person-' + name + '-001'
#     return name

# df.name.map(changeName)           # returns a new Series; df is unchanged
# df.name = df.name.map(changeName) # assign the result back to update the column
# df

a    Mr. Chan
b    Mr. Wong
c         NaN
d         NaN
Name: name, dtype: object

#### Drop a row / column

In [22]:
# Rows: name the index label(s) to remove; a new frame is returned and df is unchanged
df.drop(index='d')
# df.drop(index=['a','d'])  # several rows at once

# Columns: name the column label(s) to remove
# df.drop(columns='colA')
# df.drop(columns=['phone','colA'])

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,18.0,red,red,-
b,John,90909090,male,22.0,blue,blue,-
c,Mary,9090909,female,23.0,green,green,-


## <font color=red>Rows</font>

In [23]:
# df[0]   # WARNING: KeyError - plain [] looks up a COLUMN label, not a row position
# df['a'] # WARNING: KeyError - 'a' is a row label, not a column (use the loc indexer)

# slice operation (rows)
df[:] # all rows
# df[:2] # rows at position 0 and 1
# df[::-1] # rows in reverse order

# filter() on the index
# axis 0 / 'index': row labels
# axis 1 / 'columns': column labels
# df.filter(['a'], axis='index')
# df.filter(['a'], axis=0)
# df.filter(['a','b'], axis=0)


Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,18.0,red,red,-
b,John,90909090,male,22.0,blue,blue,-
c,Mary,9090909,female,23.0,green,green,-
d,Jane,12435678,female,,yellow,yellow,-


### <font color=blue>**Indexer**</font>

+ loc (label)
+ iloc (position)

#### **df.loc indexer (label)**

In [26]:
# 1. LOC indexer: 
# SYNTAX: df.loc[row, col]

# df.loc['a'] # get 1 row (series)
# df.loc[['a','b']] # get multiple rows
# df.loc[:] # slice
# df.loc['a':'c'] # slice (including c)
# df.loc[['b','b', 'a','a']] # fancy index

# df.loc[:, 'name']
# df.loc[:, ['name', 'age']] # all rows, with name & age column

# BE CAREFUL!!!
# if index is not numbers, you cannot use df.loc[0]
# df.loc[0] # KeyError: no such index value


#### **df.iloc indexer (position)**

In [27]:
# 1. ILOC indexer: 
# SYNTAX: df.iloc[row, col]

# df.iloc[0] # first row
# df.iloc[:] # slice: all rows
# df.iloc[0:1] # 0 to 1 (not include 1)

# df.iloc[:, :] # all rows, all columns
# df.iloc[:, 0] # all rows, first column
# df.iloc[:, [-1]] # all rows, last column
# df.iloc[:, 1]

# BE CAREFUL!!!
# Since it is based on POSITION, you cannot use col labels
# df.iloc[:, 'age'] # not working
# df.iloc[:, ['age']]  # not working


### <font color=Green>**Boolean Mask**</font>

In [28]:
# Boolean Series: True where age is greater than 20 (a NaN age compares as False)
mask = df.age > 20
# ~mask # negate it

# df[mask] # retrieve the results
df.loc[mask] # keep only the rows where the mask is True

# BE CAREFUL!!!
# df.iloc[mask] # raises: iloc does not accept a boolean Series
# df.iloc[mask.values] # a plain boolean array does work with iloc

Unnamed: 0,name,phone,gender,age,colA,color,color2
b,John,90909090,male,22.0,blue,blue,-
c,Mary,9090909,female,23.0,green,green,-


### <font color=Green>**Query()**</font>

In [29]:
# df.query('age == 22')
# df.query('age > 20')
# df.query('name=="Paul" & age>10')

# Using Python str functions
# df.query('name.str.contains("J")', engine='python')
# df.query('name.str.upper()=="PAUL"', engine='python')

# BE CAREFUL!!!
# df.query('name.str.upper().contains("P")', engine='python') # NOT WORKING
# df.query('name.str.upper().str.contains("P")', engine='python') # WORKING


### Missing Values 

In [30]:
# handling NA (missing) values
# which elements are null?

# df.loc[df.age.isna()]

# df[df.age.isnull()]   # isnull() is an alias of isna()
# df[df.age.isna()]
# df[~df.age.isnull()]  # rows where age is NOT missing
# df[df.age.notnull()]  # same, without the negation
# df[df.age.notna()]    # notna() is an alias of notnull()

# df.age.fillna(df.age.mean(), inplace=False) # returns a new Series; assign it back to keep the result
df.age.replace(np.nan, df.age.mean(), inplace=False) # same effect via replace(): NaN -> mean of the non-missing ages


a    18.0
b    22.0
c    23.0
d    21.0
Name: age, dtype: float64

## <font color=blue>Groupby</font>

##### * Divide the data into groups (e.g. by gender)

In [31]:
df  # display the frame that the groupby examples below operate on

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,18.0,red,red,-
b,John,90909090,male,22.0,blue,blue,-
c,Mary,9090909,female,23.0,green,green,-
d,Jane,12435678,female,,yellow,yellow,-


In [33]:
# Mean of the numeric columns for each gender.
# numeric_only=True is explicit because the frame also holds text columns (name, phone, ...):
# pandas 1.x silently dropped them, while pandas 2.x raises a TypeError instead.
df.groupby('gender').mean(numeric_only=True)
# df.groupby('gender').age.mean()     # a single column -> Series
# df.groupby('gender')['age'].mean()  # same, with bracket notation

Unnamed: 0_level_0,age
gender,Unnamed: 1_level_1
female,23.0
male,20.0


In [34]:
# Group by multiple columns: the result is indexed by a (gender, name) MultiIndex.
# numeric_only=True restricts the mean to numeric columns; without it pandas 2.x
# raises a TypeError on the remaining text columns.
df.groupby(['gender', 'name']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,age
gender,name,Unnamed: 2_level_1
female,Jane,
female,Mary,23.0
male,John,22.0
male,Paul,18.0


In [35]:
# unstack() moves the inner index level (name) into the columns, giving a gender x name table.
# numeric_only=True keeps the mean on numeric columns only (required on pandas 2.x).
df.groupby(['gender', 'name']).mean(numeric_only=True).unstack()

Unnamed: 0_level_0,age,age,age,age
name,Jane,John,Mary,Paul
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,,,23.0,
male,,22.0,,18.0


In [36]:
# pivot_table builds a gender x name table of 'age' in a single call
# df.pivot_table('age', index='gender', columns='name') # aggfunc defaults to 'mean'
# Entries that are entirely NaN are dropped by default (dropna=True), so Jane is absent from the output.
df.pivot_table('age', index='gender', columns='name', aggfunc='mean') # pass another aggfunc (e.g. 'max', 'count') to aggregate differently


name,John,Mary,Paul
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,,23.0,
male,22.0,,18.0


## <font color=blue>Sorting</font>

In [37]:
# df.sort_values(by='age', ascending=False) # single key, largest first
# Several keys: gender descending ('male' before 'female'), then age ascending within each gender.
# Rows with a missing age are placed last within their gender (see Jane in the output).
df.sort_values(by=['gender', 'age'], ascending=[False, True])

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,18.0,red,red,-
b,John,90909090,male,22.0,blue,blue,-
c,Mary,9090909,female,23.0,green,green,-
d,Jane,12435678,female,,yellow,yellow,-
