In [2]:
import pandas as pd 
import numpy as np 
from io import StringIO
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
# Build a small in-memory CSV in which two cells are left empty so that
# pandas parses them as NaN (row 1 / column C and row 2 / column D).
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])

# read_csv expects a path or a file-like object, so wrap the string first.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [4]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [5]:
# Drop every row that contains at least one missing value.
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [6]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns', how='any')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [7]:
# Drop only the rows in which *every* value is missing. No such row exists
# in df, so the frame comes back unchanged.
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [8]:
# Keep only the rows that have at least four non-missing values.
df.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [9]:
# Drop the rows whose value in column 'C' is missing; NaNs in the other
# columns are ignored.
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [10]:
# The original data as a NumPy matrix, with its NaNs still in place.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [11]:
# Replace each NaN with the mean of the observed values in its column.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the fill values and applies them in a single call;
# imr is left fitted, just as after a separate fit() followed by transform().
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [12]:
# Toy dataset mixing a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a string class label.
categorical_rows = [
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
]
df_categorical = pd.DataFrame(
    categorical_rows,
    columns=['color', 'size', 'price', 'classlabel'],
)
df_categorical

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [13]:
# Encode the ordinal 'size' feature as integers that preserve its order
# (M < L < XL).
size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}

# Series.map turns every value that is not a key of the mapping into NaN, so
# mapping the already-encoded column a second time would wipe it out. Only
# map while the column still holds the raw labels, which makes this cell
# safe to re-run.
if not pd.api.types.is_numeric_dtype(df_categorical['size']):
    df_categorical['size'] = df_categorical['size'].map(size_mapping)
df_categorical

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [14]:
# Invert the size mapping (integer code -> original label) and use it to
# recover the size strings from the encoded column.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df_categorical['size'].map(inv_size_mapping)


0     M
1     L
2    XL
Name: size, dtype: object

In [15]:
# Give every distinct class label an integer id, numbered in the order in
# which np.unique returns the labels.
unique_labels = np.unique(df_categorical['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

{'class1': 0, 'class2': 1}

In [16]:
# Replace the string class labels with their integer ids from class_mapping.
# Series.map yields NaN for values that are not keys of the mapping, so the
# column is only mapped while it is still non-numeric; re-running this cell
# then leaves the already-encoded labels untouched.
if not pd.api.types.is_numeric_dtype(df_categorical['classlabel']):
    df_categorical['classlabel'] = df_categorical['classlabel'].map(class_mapping)
df_categorical

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [17]:
# Convert string class labels to integers with scikit-learn's LabelEncoder.
class_le = LabelEncoder()

# By this point 'classlabel' may already have been replaced in place by the
# integer ids from class_mapping. Fitting the encoder on those integers would
# encode nothing and would store integers instead of the class names in
# class_le.classes_. Restore the original strings first in that case.
class_labels = df_categorical['classlabel']
if pd.api.types.is_numeric_dtype(class_labels):
    inv_class_mapping = {idx: label for label, idx in class_mapping.items()}
    class_labels = class_labels.map(inv_class_mapping)

y = class_le.fit_transform(class_labels.values)
y

array([0, 1, 0], dtype=int64)

In [18]:
# Select the feature columns that are passed to the one-hot encoding step.
feature_columns = ['color', 'size', 'price']
X = df_categorical[feature_columns]
X

Unnamed: 0,color,size,price
0,green,1,10.1
1,red,2,13.5
2,blue,3,15.3


In [19]:
# One-hot encode the string-valued 'color' column; the numeric columns pass
# through unchanged. dtype=int pins the indicator columns to 0/1 integers:
# pandas >= 2.0 returns booleans by default, which would no longer match the
# 0/1 table shown as this cell's output.
pd.get_dummies(X, dtype=int)

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0
