# Pandas

* Manipulate & analyse ***tabular*** data
* Data structures: ***DataFrame*** & ***Series***

---

##### Import Packages


In [1]:
import pandas as pd
import numpy as np

# Install pandas first if it is missing (uncomment to run inside the notebook):
#!pip install pandas
# Show the installed pandas version; the outputs in this notebook were produced with 1.1.0.
pd.__version__

'1.1.0'

# 1. Series
* 1D-list structure
* with ***index*** but no column
* a column in a DataFrame

In [2]:
# Build a Series from a plain list and give every element an explicit label.
# Leaving out `index=` would fall back to the default integer labels:
#   ser = pd.Series(['red', 'blue', 'green', 'yellow'])
ser = pd.Series(
    data=['red', 'blue', 'green', 'yellow'],
    index=list('abcd'),
)

# type(ser)
# The last expression of the cell is what gets displayed.
ser

a       red
b      blue
c     green
d    yellow
dtype: object

In [3]:
# ser.ndim
# ser.shape
# ser.size # row 

# ser.index
# ser.describe()
# ser.value_counts() # value_counts

# ser.iloc[3] # by position (plain ser[3] on a labelled index is deprecated in newer pandas)
# ser[1:] # slice

# 2. DataFrame 

* 2D-table structure
* most frequently used structure in pandas
* with ***columns*** & ***index***
* index could be number or string

In [4]:
# Create a DataFrame from a mapping of column name -> column values.
# The list columns are laid out in order against `index`; the Series column
# (`ser`) is aligned on its own labels, which are also 'a'..'d'.
df = pd.DataFrame(
    dict(
        name=['Paul', 'John', 'Mary', 'Jane'],
        phone=['92435678', '90909090', '09090909', '12435678'],  # kept as strings
        gender=['male', 'male', 'female', 'female'],
        age=[18, 22, 23, np.nan],  # NaN marks the one missing age
        colA=ser,
    ),
    index=list('abcd'),
)
df

Unnamed: 0,name,phone,gender,age,colA
a,Paul,92435678,male,18.0,red
b,John,90909090,male,22.0,blue
c,Mary,9090909,female,23.0,green
d,Jane,12435678,female,,yellow


In [5]:
# type(df)
# df.ndim
# df.shape
# df.size # row x col

# df.columns
# df.index
# df.set_index('name') # changing index to other column

# df.info()
# df.describe()
# df.count()

# Number of rows (the same value as the first entry of df.shape).
len(df.index)

4

## <font color=red>Column</font>

In [6]:
# df.name # dot notation
# df['name'] # index
# df[['name','age']]  # select multiple column (DF), fancy index

# filter()
# df.filter(['name']) # filter by columns (dataframe)
# df.filter(['name','age'], axis='columns') # filter by columns (dataframe)

# df['color'] = ser # add a column
# df.assign(color2=ser) # add a column using assign function

# df['color'][1] # series[pos]
# df.age.mean()
# df.age.median()
# df.age.max() # get max item
# df.age.idxmax() # get index value of max item
# df.age = df.age + 1
# df

# df.gender.value_counts()
# -(df.age)

# df.name.str.upper()

#### Use of Map function to update Column data

In [7]:
# df.name.map(lambda x: 'person-' + x)

# def changeName(name):
#     name = 'person-' + name + '-001'
#     return name

# df.name = df.name.map(changeName)

###### Drop a row / column

In [8]:
# df.drop('d')  # drop a row by its index value

# df.drop(['a','d']) # drop multiple rows (by index value)
# df.drop('colA', axis='columns') # drop a column with a column name
# df.drop(['phone','colA'], axis=1)


## <font color=red>Rows</font>

In [9]:
# df[0] # WARNING: Position is not working!!! (it's for col name)
# df['a'] # WARNING: Index value is not working!!! (please use loc indexer)

# slice operation (rows)
# df[:] # all rows
# df[:2] # row 0 to 1 
# df[::-1] # reverse order

# Filter()
# Axis 0: rows / index
# Axis 1: columns
# df.filter('a', axis='index')
# df.filter('a',axis=0)
# df.filter(['a','b'],axis=0)


### <font color=blue>**Indexer**</font>

+ loc (label)
+ iloc (position)

#### **df.loc indexer (label)**

In [10]:
# 1. LOC indexer: 
# SYNTAX: df.loc[row, col]

# df.loc['a'] # get 1 row (series)
# df.loc[['a','b']] # get multiple rows
# df.loc[:] # slice
# df.loc['a':'c'] # slice (including c)
# df.loc[['b','b', 'a','a']] # fancy index

# df.loc[:, 'name']
# df.loc[:, ['name', 'age']] # all rows, with name & age column

# BE CAREFUL!!!
# if index is not numbers, you cannot use df.loc[0]
# df.loc[0] # KeyError: no such index value


#### **df.iloc indexer (position)**

In [11]:
# 1. ILOC indexer: 
# SYNTAX: df.iloc[row, col]

# df.iloc[0] # first row
# df.iloc[:] # slice: all rows
# df.iloc[0:1] # positions 0 up to 1 (1 itself is not included)

# df.iloc[:, :] # all rows, all columns
# df.iloc[:, 0] # all rows, first column
# df.iloc[:, [-1]] # all rows, last column
# df.iloc[:, 1]

# BE CAREFUL!!!
# Since it is based on POSITION, you cannot use col labels
# df.iloc[:, 'age'] # not working
# df.iloc[:, ['age']]  # not working


### <font color=Green>**Boolean Mask**</font>

In [12]:
# Boolean mask: one True/False per row, True where age is greater than 20.
# A missing age (NaN) compares as False, so that row is left out.
mask = df['age'].gt(20)
# ~mask  # element-wise negation of the mask

# df[mask] and df.loc[mask] both keep only the rows where the mask is True.
df.loc[mask]

# BE CAREFUL!!!
# df.iloc[mask]  # iloc rejects a boolean Series; pass the raw array (mask.values) instead

Unnamed: 0,name,phone,gender,age,colA
b,John,90909090,male,22.0,blue
c,Mary,9090909,female,23.0,green


### <font color=Green>**Query()**</font>

In [13]:
# df.query('age == 22')
# df.query('age > 20')
# df.query('name=="Paul" & age>10')

# Using Python str functions
# df.query('name.str.contains("J")', engine='python')
# df.query('name.str.upper()=="PAUL"', engine='python')

# BE CAREFUL!!!
# df.query('name.str.upper().contains("P")', engine='python') # NOT WORKING
# df.query('name.str.upper().str.contains("P")', engine='python') # WORKING


### Missing Values 

In [14]:
# handling NA values
# which element is null

# df.loc[df.age.isna()]

# df[df.age.isnull()] 
# df[df.age.isna()] 
# df[~df.age.isnull()]  # is not NA
# df[df.age.notnull()]  
# df[df.age.notna()]  

# Fill the missing ages with the column mean and assign the result back to the frame.
# Assignment is used instead of `df.age.fillna(..., inplace=True)` because the in-place
# call acts on the intermediate `df.age` object: newer pandas versions warn about it and,
# under Copy-on-Write, the change is no longer propagated to `df`.
df['age'] = df['age'].fillna(df['age'].mean())


## <font color=blue>Groupby</font>

##### * divide the data into groups (e.g. by gender)

In [15]:
# Re-display the frame: Jane's previously missing age now holds the mean of the other three (21.0).
df

Unnamed: 0,name,phone,gender,age,colA
a,Paul,92435678,male,18.0,red
b,John,90909090,male,22.0,blue
c,Mary,9090909,female,23.0,green
d,Jane,12435678,female,21.0,yellow


In [20]:
# Mean age per gender.
# The last line is live so that re-running the cell reproduces the Series shown below;
# the commented lines are alternative spellings.
# df.groupby('gender').mean(numeric_only=True)  # every numeric column, returns a DataFrame
#                                                 (without numeric_only, pandas >= 2.0 raises
#                                                  TypeError on the string columns)
# df.groupby('gender').age.mean()                # attribute access, returns a Series
df.groupby('gender')['age'].mean()

gender
female    22.0
male      20.0
Name: age, dtype: float64

In [21]:
# Group by multiple columns and average the age within each (gender, name) pair.
# The numeric column is selected explicitly: averaging the whole frame would also
# include the string columns (phone, colA), which pandas >= 2.0 rejects with a TypeError.
# The double brackets keep the result a DataFrame with a single `age` column.
df.groupby(['gender', 'name'])[['age']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,age
gender,name,Unnamed: 2_level_1
female,Jane,21.0
female,Mary,23.0
male,John,22.0
male,Paul,18.0


In [22]:
# Same aggregation, then move the inner index level (name) into the columns.
# `[['age']]` keeps the string columns out of mean(), which would raise a TypeError
# on pandas >= 2.0; the result still has (age, name) column pairs.
df.groupby(['gender', 'name'])[['age']].mean().unstack()

Unnamed: 0_level_0,age,age,age,age
name,Jane,John,Mary,Paul
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,21.0,,23.0,
male,,22.0,,18.0


In [23]:
# pivot_table builds the gender x name table of ages in a single call.
# 'mean' is already the default aggregation, so the short form is equivalent:
#   df.pivot_table('age', index='gender', columns='name')
# Pass a different `aggfunc` to aggregate another way.
df.pivot_table(
    values='age',
    index='gender',
    columns='name',
    aggfunc='mean',
)


name,Jane,John,Mary,Paul
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,21.0,,23.0,
male,,22.0,,18.0


## <font color=blue>Sorting</font>

In [27]:
# Sort on several keys at once: gender descending (male before female),
# then age ascending within each gender.
# Single-key variant: df.sort_values(by='age', ascending=False)
df.sort_values(
    ['gender', 'age'],
    ascending=[False, True],
)

Unnamed: 0,name,phone,gender,age,colA
a,Paul,92435678,male,18.0,red
b,John,90909090,male,22.0,blue
d,Jane,12435678,female,21.0,yellow
c,Mary,9090909,female,23.0,green
