# Pandas `Important` Miscellaneous Operations

___
Example:

In [1]:
import pandas as pd
# Small demo frame reused throughout the notebook:
# two integer columns (c1, c2) and one string column (c3)
df = pd.DataFrame({
    'c1': [1, 2, 3, 4],
    'c2': [444, 555, 666, 444],
    'c3': ['abc', 'def', 'ghi', 'xyz'],
})
df.head()

Unnamed: 0,c1,c2,c3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


___
## Unique Values
* Finding unique values in a dataframe

In [3]:
# .unique() returns a numpy array holding the distinct values of column 'c2'
df['c2'].unique()

array([444, 555, 666], dtype=int64)

In [4]:
# Number of distinct values: take the length of the array returned by .unique()
len(df['c2'].unique())

3

In [5]:
# .nunique() returns the number of distinct values directly
# (same result as len(df['c2'].unique()))
df['c2'].nunique()

3

In [6]:
# .value_counts() tabulates how many times each distinct value occurs in 'c2'
df['c2'].value_counts()

444    2
555    1
666    1
Name: c2, dtype: int64

___
## Data Selection

In [2]:
# Select rows using boolean criteria from multiple columns.
# Each comparison yields a boolean Series; combine them element-wise with &.
c1_above_two = df['c1'] > 2
c2_equals_444 = df['c2'] == 444
newdf = df[c1_above_two & c2_equals_444]
newdf

Unnamed: 0,c1,c2,c3
3,4,444,xyz


___
## Applying Functions: .apply()

Example:

In [3]:
def t2(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

In [4]:
# .apply() calls the given function (here t2) on every value of column 'c1'
# and returns the results as a new Series
df['c1'].apply(t2)

0    2
1    4
2    6
3    8
Name: c1, dtype: int64

In [5]:
# .apply() also accepts an inline lambda; here every value of 'c2' is cubed
df['c2'].apply(lambda x: x**3)

0     87528384
1    170953875
2    295408296
3     87528384
Name: c2, dtype: int64

In [11]:
# Built-in functions can be passed as well: len() is called on each string in column 'c3'
df['c3'].apply(len)

0    3
1    3
2    3
3    3
Name: c3, dtype: int64

___
## Summing a Column: .sum()

In [12]:
# Add up all values in column 'c1'
df['c1'].sum()

10

___
## Column Removal (Permanent)

In [13]:
# `del` removes a column in place (permanently) from the frame it is applied to.
# It is demonstrated on a copy so that `df` keeps its 'c1' column: the sections
# below (column names, sorting, null checks) still display and use 'c1', and
# would not reproduce on a fresh Restart & Run All if `df` itself were mutated here.
df_no_c1 = df.copy()
del df_no_c1['c1']
df_no_c1

Unnamed: 0,c2,c3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


___
## Access column & index names

In [16]:
# Column labels of the dataframe (returned as an Index object)
df.columns

Index(['c1', 'c2', 'c3'], dtype='object')

In [17]:
# Row labels of the dataframe
df.index

RangeIndex(start=0, stop=4, step=1)

___
## Dataframe Sort & Order

In [18]:
# Show the frame in its original row order before sorting
df

Unnamed: 0,c1,c2,c3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [19]:
# sort_values() returns a new frame ordered by column 'c2';
# inplace=False by default, so df itself is left unchanged
df.sort_values(by='c2') 

Unnamed: 0,c1,c2,c3
0,1,444,abc
3,4,444,xyz
1,2,555,def
2,3,666,ghi


___
## Null Values (NaN) Check

In [20]:
# Boolean frame of the same shape as df: True wherever a value is missing (NaN)
df.isnull()

Unnamed: 0,c1,c2,c3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [21]:
# .dropna() returns a frame without the rows that contain NaN values
# (see Missing Data.ipynb); df has no missing values, so every row is kept here
df.dropna()

Unnamed: 0,c1,c2,c3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


A different example with NaN values:

In [22]:
import numpy as np
# Second demo frame: col1 and col2 each contain one missing value (np.nan)
df1 = pd.DataFrame({
    'col1': [1, 2, 3, np.nan],
    'col2': [np.nan, 555, 666, 444],
    'col3': ['abc', 'def', 'ghi', 'xyz'],
})
df1.head()

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,,444.0,xyz


In [26]:
# .fillna() returns a new frame in which every NaN is replaced by the given value
df1.fillna('FILLME')

Unnamed: 0,col1,col2,col3
0,1.0,FILLME,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,FILLME,444.0,xyz


___
## Pivot Table (For Excel Users)
* Can be used for heatmaps.

In [28]:
# Flat sample table to pivot: A, B and C are key columns, D holds the numeric values
data = {
    'A': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    'B': ['one', 'one', 'two', 'two', 'one', 'one'],
    'C': ['x', 'y', 'x', 'y', 'x', 'y'],
    'D': [1, 3, 2, 5, 4, 1],
}
df2 = pd.DataFrame(data)
df2

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


In [29]:
# The .pivot_table() method is called here with three arguments:
#   values  - the data points to aggregate, taken from column D
#   index   - columns A and B together form the (multi-level) row index
#   columns - the distinct values of column C become the output columns
# Cells are aggregated with the default aggfunc (mean); (A, B, C) combinations
# that do not occur in df2 show up as NaN
df2.pivot_table(values='D',index=['A', 'B'],columns=['C'])

Unnamed: 0_level_0,C,x,y
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,1.0
bar,two,,5.0
foo,one,1.0,3.0
foo,two,2.0,
