## Data Preprocessing

### Dealing with missing values

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
# Small in-memory CSV with one empty field per data row, used to demonstrate
# missing-value handling. The rows are kept flush-left on purpose: indentation
# inside a triple-quoted string is part of the string, so indented rows would
# carry leading spaces into the first field of every record.
csv_data = '''A,B,C,D
2.0,3.0,,5.0
6.0,,8.0,9.0
10.0,11.0,12.0,'''

In [3]:
# Wrap the CSV text in an in-memory text buffer so read_csv can consume it like a file.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))

In [4]:
# Display the parsed frame to inspect where values are missing.
df

Unnamed: 0,A,B,C,D
0,2.0,3.0,,5.0
1,6.0,,8.0,9.0
2,10.0,11.0,12.0,


In [5]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

A    0
B    1
C    1
D    1
dtype: int64

In [6]:
# Work on an independent copy: plain assignment (`dfs = df`) only creates a
# second name for the same object, so any in-place change made through `dfs`
# would silently alter `df` as well.
dfs = df.copy()

In [7]:
# Drop every row that contains at least one missing value.
dfs.dropna(axis='index', how='any')

Unnamed: 0,A,B,C,D


In [8]:
# Drop every column that contains at least one missing value.
dfs.dropna(axis='columns', how='any')

Unnamed: 0,A
0,2.0
1,6.0
2,10.0


In [9]:
# Drop only the rows in which every value is missing.
dfs.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,2.0,3.0,,5.0
1,6.0,,8.0,9.0
2,10.0,11.0,12.0,


In [10]:
# Keep only the rows that have at least 4 non-missing values.
dfs.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D


In [11]:
# Drop the rows whose value in column 'D' is missing.
dfs.dropna(axis='index', subset=['D'])

Unnamed: 0,A,B,C,D
0,2.0,3.0,,5.0
1,6.0,,8.0,9.0


### Imputing missing values

In [12]:
# Mean imputation: every NaN is replaced by the mean of its column.
# (SimpleImputer and numpy are imported in the notebook's top import cell.)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit_transform learns the per-column means and fills the gaps in one call;
# `imputer` is left fitted afterwards, exactly as after a separate fit().
imputed_data = imputer.fit_transform(df.to_numpy())
imputed_data

array([[ 2.,  3., 10.,  5.],
       [ 6.,  7.,  8.,  9.],
       [10., 11., 12.,  7.]])

In [13]:
# Pandas-only equivalent of mean imputation: fill each NaN with its column's mean.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,2.0,3.0,10.0,5.0
1,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,7.0


### Handling categorical data

- Ordinal Feature: Categorical values that can be sorted or ordered
- Nominal Feature: Categorical values that don't imply any order

In [18]:
# Toy categorical dataset, one record per row; the columns receive the default
# integer labels 0..3 because no column names are passed here.
# NOTE(review): the second field ('5000', '6000', '4500') is stored as strings,
# not numbers -- confirm that is intended before using it numerically.
df = pd.DataFrame(
    data=[
        ['M30', '5000', 'B', 'class1'],
        ['M30s', '6000', 'B', 'class1'],
        ['Xr', '4500', 'S', 'class2'],
    ],
)

In [19]:
# Display the frame as constructed; the columns still carry integer labels.
df

Unnamed: 0,0,1,2,3
0,M30,5000,B,class1
1,M30s,6000,B,class1
2,Xr,4500,S,class2


In [20]:
# Replace the default integer column labels with descriptive feature names
# (assigned in place on the existing frame).
df.columns = ['Model','Battery','Size','Classlabel']

In [21]:
# Display the frame again to confirm the new column names.
df

Unnamed: 0,Model,Battery,Size,Classlabel
0,M30,5000,B,class1
1,M30s,6000,B,class1
2,Xr,4500,S,class2


In [23]:
# Integer codes for the 'Size' feature.
# NOTE(review): these integers define the order B < S -- confirm this matches
# the intended ranking of the size codes.
size_mapping = {'B': 1,
                'S': 2}

# Series.map turns every value that is not a key of the dict into NaN, so
# running it a second time on the already-encoded column would wipe it out.
# Encode only while raw letter codes are still present; this makes the cell
# safe to re-run while leaving the first-run result unchanged.
if df['Size'].isin(list(size_mapping)).any():
    df['Size'] = df['Size'].map(size_mapping)
df

Unnamed: 0,Model,Battery,Size,Classlabel
0,M30,5000,1,class1
1,M30s,6000,1,class1
2,Xr,4500,2,class2


### Encoding class labels