## Dealing with missing data

In [204]:
import pandas as pd
from io import StringIO

In [205]:
csv_data = '''A,B,C,D
           1.0,2.0,3.0,4.0
           5.0,6.0,,8.0
           10.0,11.0,12.0,'''

In [206]:
# Read in a string as it was a regular csv file with StringIO
df = pd.read_csv(StringIO(csv_data))

In [207]:
# Original dataframe
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [208]:
# Return the number of missing values per column
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

#### Eliminating samples

In [209]:
# Remove any features or samples with missing values
df.dropna()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [210]:
# Drop columns that have at least one NaN in any row by setting axis=1
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [211]:
# only drop row where all columns are NaN
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [212]:
# only drop rows that have not at least 4 non-NaN values
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [213]:
# only drop rows where NaN appear in specific columns, (e.g 'C')
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


#### Imputing missing values

In [214]:
# Mean Imputation; Replace missing values by the mean value of the entire feature column
from sklearn.preprocessing import Imputer

# Replace by mean, axis=0 (calculated for each feature column), axis=1 (calculate the row)
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(df)

imputed_data = imr.transform(df.values)

imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [ 10. ,  11. ,  12. ,   6. ]])

---

## Handling categorical data

In [215]:
import pandas as pd

df = pd.DataFrame([
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1']
    ])

df.columns = ['color', 'size', 'price', 'classlabel']

In [216]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


#### Mapping ordinal features

In [217]:
# Use a dictionary to map key-value pairs
size_mapping = { 'XL': 3, 'L': 2, 'M': 1}

# Create the inverse mapping if needed later
inv_size_mapping = {v: k for k, v in size_mapping.items()}

size_mapping, inv_size_mapping

({'L': 2, 'M': 1, 'XL': 3}, {1: 'M', 2: 'L', 3: 'XL'})

In [218]:
df['size'] = df['size'].map(size_mapping)

In [219]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


#### Encoding class labels

In [220]:
import numpy as np

# Convert the string classlabels toto integer values
class_mapping = {label:idx for idx, label in enumerate(np.unique(df['classlabel']))}

# Create the inverse mapping if needed later
inv_class_mapping = {v: k for k, v in class_mapping.items()}

class_mapping, inv_class_mapping

({'class1': 0, 'class2': 1}, {0: 'class1', 1: 'class2'})

In [221]:
df['classlabel'] = df['classlabel'].map(class_mapping)

In [222]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


Alternative method, is to use the convenient ```LabelEncoder``` class in scikit-learn

In [223]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)

In [224]:
y

array([0, 1, 0])

In [225]:
# and for the inverse transform of y
class_le.inverse_transform(y)

array([0, 1, 0])

In [230]:
# Similar approach to transform the nominal color column of our dataset, but this could create issues. (See OHEncoding)
X = df[['color', 'size', 'price']].values

color_le = LabelEncoder()

X[:, 0] = color_le.fit_transform(X[:, 0])

In [227]:
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

## One-hot encoding on nominal features

A common mistake in dealing with categorical data, is when we convert **nominal** values into integers.

```(e.g: blue -> 0, green -> 1, red -> 2)```

Some classifiers are not smart enough and will treat these as ordinal values, meaning that

```red > green > blue```

Which is not what we want..

**Solution is to use one-hot encoding**

Create a new dummy feature for each unique value in the nominal feature column. This way, the feature can act as nomial values instead of having any order to them after the transformation.

In [231]:
from sklearn.preprocessing import OneHotEncoder

# Pass in column position, here 0 is color
ohe = OneHotEncoder(categorical_features=[0])

ohe.fit_transform(X).toarray()

array([[  0. ,   1. ,   0. ,   1. ,  10.1],
       [  0. ,   0. ,   1. ,   2. ,  13.5],
       [  1. ,   0. ,   0. ,   3. ,  15.3]])

In [232]:
# Another way is to utilize get_dummies method implemeted in pandas
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0.0,1.0,0.0
1,13.5,2,0.0,0.0,1.0
2,15.3,3,1.0,0.0,0.0
