In [17]:
import pandas as pd
from io import StringIO
import sys
# Tiny CSV sample with deliberately empty fields: the second data row has no
# value for C, the third has none for B and D.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,,12.0,',
])
# StringIO lets read_csv consume the in-memory string as if it were a file;
# the empty fields come back as NaN.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,,12.0,


In [18]:
df.isna().sum()  # number of missing (NaN) entries in each column

A    0
B    1
C    1
D    1
dtype: int64

In [19]:
df.isna()  # boolean mask: True wherever a cell is missing

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,True,False,True


In [20]:
df.dropna(axis='index')  # drop every row that contains at least one NaN

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [21]:
df.dropna(axis='columns')  # drop every column that contains at least one NaN

Unnamed: 0,A
0,1.0
1,5.0
2,10.0


In [22]:
# Drop only the rows in which every value is NaN; no row of this sample is
# entirely empty, so all three rows are kept.
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,,12.0,


In [23]:
df.dropna(thresh=3)  # keep only the rows that have at least 3 non-NaN values

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0


In [25]:
df.dropna(subset=['C'])  # drop only the rows whose value in column C is NaN

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,,12.0,


In [31]:
from sklearn.impute import SimpleImputer
import numpy as np
# Mean imputation: fit() learns the mean of every column, transform() then
# writes that mean into each NaN cell of the same column.
imr = SimpleImputer(strategy='mean', missing_values=np.nan).fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. ,  4. , 12. ,  6. ]])

In [33]:
import pandas as pd
# Small example table with string-valued columns (color, size, classlabel)
# and one numeric column (price), used by the encoding cells.
df2 = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
        ['green', 'XL', 12.5, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2
3,green,XL,12.5,class2


In [47]:
# Hand-written mapping from size label to integer code, chosen so that the
# integers follow the size order (M < L < XL).
size_mapping = dict(XL=3, L=2, M=1)
encoded_size = df2['size'].map(size_mapping)
df2['size'] = encoded_size
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2
3,green,3,12.5,class2


In [44]:
# Invert the mapping to show that the integer codes can be translated back to
# the original size labels.
inv_size_mapping = {v: k for k, v in size_mapping.items()}
# The decoded column is only displayed (assign returns a new frame) and is NOT
# written back: overwriting df2['size'] here would undo the integer encoding,
# and the cells further down would then receive 'M'/'L'/'XL' instead of the
# numeric codes when the notebook is run top to bottom.
df2.assign(size=df2['size'].map(inv_size_mapping))

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2
3,green,XL,12.5,class2


In [37]:
import numpy as np
# Class labels are nominal (they have no order), so which integer each label
# receives is arbitrary; here the sorted unique labels are simply numbered
# from 0.
unique_labels = np.unique(df2['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
df2['classlabel'] = df2['classlabel'].map(class_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1
3,green,3,12.5,1


In [41]:
# Swap keys and values to obtain the integer -> label lookup, then restore the
# original string class labels in df2.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
decoded_labels = df2['classlabel'].map(inv_class_mapping)
df2['classlabel'] = decoded_labels
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2
3,green,3,12.5,class2


In [42]:
# Inspect the (label, integer) pairs stored in the mapping.
class_mapping.items()

dict_items([('class1', 0), ('class2', 1)])

In [40]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder numbers the sorted unique values 0..n-1.
# NOTE(review): despite the names `class_le` and `y`, the encoder is fit on the
# 'color' column, not on 'classlabel' - confirm this is intentional.
class_le = LabelEncoder()
colors = df2['color'].values
y = class_le.fit_transform(colors)
y

array([1, 2, 0, 1])

In [43]:
# Translate the integer codes back into the original string values.
class_le.inverse_transform(y)

array(['green', 'red', 'blue', 'green'], dtype=object)

In [57]:
# One-hot encoding keeps the learning algorithm from assuming an order among
# the values of a nominal feature: the single feature is transformed into a
# series of 0/1 indicator features, one per distinct value.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = df2.values
color_encoder = OneHotEncoder(categories='auto')
ohot = ColumnTransformer([
    ('color', color_encoder, [0]),   # column 0 -> one indicator column per color
    ('size', 'passthrough', [1]),    # column 1 is forwarded unchanged
    ('price', 'passthrough', [2]),   # column 2 is forwarded unchanged
])
# Column 3 (classlabel) is not listed in any transformer and therefore does
# not appear in the result.
ohot.fit_transform(X)

array([[0.0, 1.0, 0.0, 1, 10.1],
       [0.0, 0.0, 1.0, 2, 13.5],
       [1.0, 0.0, 0.0, 3, 15.3],
       [0.0, 1.0, 0.0, 3, 12.5]], dtype=object)

In [56]:
# One-hot encoding via pandas: get_dummies expands the string-valued columns
# into indicator columns and leaves numeric columns as they are.
# dtype=int pins the indicators to 0/1 integers; since pandas 2.0 the default
# dtype is bool, which would display True/False instead.
pd.get_dummies(df2[['price', 'color', 'size', 'classlabel']], dtype=int)

Unnamed: 0,price,size,color_blue,color_green,color_red,classlabel_class1,classlabel_class2
0,10.1,1,0,1,0,0,1
1,13.5,2,0,0,1,1,0
2,15.3,3,1,0,0,0,1
3,12.5,3,0,1,0,0,1
