pd.DataFrame.pipe: Increase code readability when applying multiple functions to a DataFrame

In [1]:
import warnings

# Keep FutureWarning messages out of the cell outputs below.
warnings.simplefilter("ignore", category=FutureWarning)

In [2]:
!pip install textblob



In [3]:
from textblob import TextBlob
import pandas as pd

In [4]:
def remove_white_spaces(df: pd.DataFrame) -> pd.DataFrame:
  """Return a new frame whose ``text`` column has leading/trailing whitespace removed.

  The input frame is not modified, so the function can be used in a
  ``DataFrame.pipe`` chain and re-run without altering its argument.

  Parameters
  ----------
  df : pd.DataFrame
      Frame containing a string column named ``text``.

  Returns
  -------
  pd.DataFrame
      Copy of ``df`` with the stripped ``text`` column.

  Raises
  ------
  KeyError
      If ``df`` has no ``text`` column.
  """
  # Vectorised .str accessor instead of a per-row lambda; assign() builds a new
  # frame rather than writing into the caller's object.
  return df.assign(text=df['text'].str.strip())

def get_sentiment(df: pd.DataFrame) -> pd.DataFrame:
  """Return a new frame with a ``sentiment`` column computed from ``text``.

  Each value is the polarity component of TextBlob's sentiment result for the
  corresponding ``text`` entry. The input frame is not modified.

  Parameters
  ----------
  df : pd.DataFrame
      Frame containing a string column named ``text``.

  Returns
  -------
  pd.DataFrame
      Copy of ``df`` with an added (or replaced) ``sentiment`` column.

  Raises
  ------
  KeyError
      If ``df`` has no ``text`` column.
  """
  # .sentiment is a (polarity, subjectivity) tuple; name the field instead of indexing [0].
  polarity = df['text'].apply(lambda text: TextBlob(text).sentiment.polarity)
  # assign() builds a new frame rather than writing into the caller's object.
  return df.assign(sentiment=polarity)

# Two sample sentences carrying stray leading/trailing spaces.
df = pd.DataFrame(
    {'text': ["It is beautiful day today ", " This movie is terrible"]}
)

# Chain the steps with pipe: each function receives the frame returned by the previous one.
df = (
    df
    .pipe(remove_white_spaces)
    .pipe(get_sentiment)
)
df


Unnamed: 0,text,sentiment
0,It is beautiful day today,0.85
1,This movie is terrible,-1.0


In [5]:
s = pd.Series(['a', 'b', 'c'])
# Passing a dict to Series.map replaces each value with its dictionary lookup.
s.map(
    {
        'a': 1,
        'b': 2,
        'c': 3,
    }
)

0    1
1    2
2    3
dtype: int64

In [7]:
# Assign values to multiple new columns: start from a small two-column frame.
df1 = pd.DataFrame(
    {
        'col1': [1, 2],
        'col2': [3, 4],
    }
)
df1

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [8]:
# col3 is derived from the existing columns; col4 is added by a second assign
# call so that its lambda can read the freshly created col3.
df1 = (
    df1
    .assign(col3=lambda frame: frame['col1'] * 100 + frame['col2'])
    .assign(col4=lambda frame: frame['col1'] * frame['col3'])
)
df1

Unnamed: 0,col1,col2,col3,col4
0,1,3,103,103
1,2,4,204,408


Transform each element of an iterable into its own row

In [9]:
# Column 'a' holds a list in every row; column 'b' holds a scalar.
df2 = pd.DataFrame(
    {
        'a': [[1, 2], [4, 5]],
        'b': [11, 13],
    }
)
df2

Unnamed: 0,a,b
0,"[1, 2]",11
1,"[4, 5]",13


In [11]:
# explode gives every list element in 'a' its own row; 'b' and the index label are repeated.
df2.explode(column='a')

Unnamed: 0,a,b
0,1,11
0,2,11
1,4,13
1,5,13


Split a string into multiple rows

In [13]:
# Column 'a' holds comma-separated strings that will be split into separate rows.
df3 = pd.DataFrame(
    {
        'a': ["1,2", "4,5"],
        'b': [11, 13],
    }
)
df3

Unnamed: 0,a,b
0,"1,2",11
1,"4,5",13


In [14]:
# Turn each comma-separated string in 'a' into a list of its parts.
# NOTE(review): this overwrites 'a' in place, so re-running the cell would
# operate on lists rather than on the original strings.
df3['a'] = df3['a'].str.split(",")
df3

Unnamed: 0,a,b
0,"[1, 2]",11
1,"[4, 5]",13


In [15]:
# Each element of the split lists in 'a' becomes its own row; 'b' and the index label are repeated.
df3.explode(column='a')

Unnamed: 0,a,b
0,1,11
0,2,11
1,4,13
1,5,13


Bin a DataFrame column into discrete intervals

In [16]:
# Bin edges; pd.cut turns consecutive edges into the intervals (0, 5], (5, 10], ...
bins = [0, 5, 10, 15, 20]

df4 = (
    pd.DataFrame({'a': [2, 4, 8, 10, 12, 14, 16, 18, 20]})
    # Label each value of 'a' with the interval it falls into.
    .assign(binned=lambda frame: pd.cut(frame['a'], bins=bins))
)
df4

Unnamed: 0,a,binned
0,2,"(0, 5]"
1,4,"(0, 5]"
2,8,"(5, 10]"
3,10,"(5, 10]"
4,12,"(10, 15]"
5,14,"(10, 15]"
6,16,"(15, 20]"
7,18,"(15, 20]"
8,20,"(15, 20]"


If you want to trim outlier values, one method is to use df.clip

In [19]:
# Sample column containing one large value (45) and a couple of negative ones.
data = {
    'col0': [9, -3, 0, 45, -1, 5],
}
df5 = pd.DataFrame(data=data)
df5

Unnamed: 0,col0
0,9
1,-3
2,0
3,45
4,-1
5,5


In [21]:
# The 5th and 95th percentiles of col0 serve as the clipping bounds.
lower = df5['col0'].quantile(q=0.05)
upper = df5['col0'].quantile(q=0.95)

# Values below `lower` or above `upper` are replaced by the bound itself.
df5.clip(lower, upper)

Unnamed: 0,col0
0,9.0
1,-2.5
2,0.0
3,36.0
4,-1.0
5,5.0
