# Dealing with missing data

In [1]:
import pandas as pd
from io import StringIO

In [2]:

# Small CSV sample with two gaps: row 1 has no value for C, row 2 has none for D.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])

# read_csv parses the empty fields as NaN.
df = pd.read_csv(StringIO(csv_data))
print(df)


      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN


In [3]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
A,0
B,0
C,1
D,1


Convenient data handling with pandas' data frames

In [4]:
# The DataFrame's data as a plain NumPy array (the form passed to the imputer below).
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

## One of the easiest ways to deal with missing data is simply to remove the corresponding features (columns) or training examples (rows) from the dataset

In [5]:
# Drop every row that contains at least one NaN; df itself is not modified.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [6]:
# Drop every column that contains at least one NaN; df itself is not modified.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [7]:
# Drop a row only when every one of its values is NaN.
# No row in this frame is entirely NaN, so the full DataFrame comes back.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [8]:
# Keep only rows with at least 4 non-NaN values; with 4 columns that means
# only fully populated rows survive.
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [9]:
# Drop rows only when the NaN is in one of the listed columns (here: 'C');
# NaNs in other columns are tolerated.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


## Imputing missing values

In [10]:
from sklearn.impute import SimpleImputer
import numpy as np
# Mean imputation: every NaN is replaced by the mean of its feature column.
# fit() returns the estimator itself, so construction and fitting can be chained.
imr = SimpleImputer(missing_values=np.nan, strategy='mean').fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [11]:
# pandas equivalent of mean imputation: df.mean() is computed per column and
# fillna matches each mean to its column by label.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [12]:
# Per-column means; NaN entries are skipped (C = (3 + 12) / 2 = 7.5).
df.mean()

Unnamed: 0,0
A,5.333333
B,6.333333
C,7.5
D,6.0


# Handling categorical data

In [13]:
# Example frame with a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


**Mapping ordinal features**

In [14]:
# Ordinal encoding of the size labels: M < L < XL.
size_mapping = dict(M=0, L=1, XL=2)

# NOTE: this overwrites df['size'] in place. Running the cell a second time
# would push the integer codes through size_mapping again and yield NaN,
# because 0/1/2 are not keys of the mapping.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,0,10.1,class2
1,red,1,13.5,class1
2,blue,2,15.3,class2


In [15]:
# Reverse lookup: integer code -> original size label.
inv_size_mapping = {v: k for k, v in size_mapping.items()}

# Series.map returns a new Series instead of modifying df, so the decoded
# labels have to be shown explicitly; only the cell's last expression is
# rendered automatically.
size_labels = df['size'].map(inv_size_mapping)
print(size_labels)

# df itself is deliberately left with the integer codes.
df

Unnamed: 0,color,size,price,classlabel
0,green,0,10.1,class2
1,red,1,13.5,class1
2,blue,2,15.3,class2


**Encoding class labels**

In [27]:
# LabelEncoder is not imported by any earlier cell, so import it here to keep
# the notebook runnable on a fresh kernel (Restart & Run All).
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integers.
le = LabelEncoder()
le.fit(df['classlabel'])

# le.classes_ now holds the sorted unique labels; a label's position in that
# array is its integer code ('class1' -> 0, 'class2' -> 1).
df['classlabel'] = le.transform(df['classlabel'])
df

Unnamed: 0,color,size,price,classlabel
0,green,0,10.1,1
1,red,1,13.5,0
2,blue,2,15.3,1


In [28]:
# Turn the integer codes back into the original string labels using the
# fitted encoder, and store them in the same column.
decoded_labels = le.inverse_transform(df['classlabel'])
df['classlabel'] = decoded_labels
df

Unnamed: 0,color,size,price,classlabel
0,green,0,10.1,class2
1,red,1,13.5,class1
2,blue,2,15.3,class2


**Performing one-hot encoding on nominal features**

In [29]:
from sklearn.preprocessing import OneHotEncoder
# Encoder for the nominal 'color' feature (fitted in the next cell).
color_ohe = OneHotEncoder()

# Feature matrix as a NumPy array; mixing strings and numbers gives dtype=object.
X = df[['color', 'size', 'price']].values
X

array([['green', 0, 10.1],
       ['red', 1, 13.5],
       ['blue', 2, 15.3]], dtype=object)

In [30]:
 # Reshape the color column to (n_samples, 1) before encoding, then convert
 # the encoder's result to a dense array for display.
 color_column = X[:, 0].reshape(-1, 1)
 color_ohe.fit_transform(color_column).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [31]:
from sklearn.compose import ColumnTransformer
# One-hot encode column 0 (color) and pass columns 1 and 2 (size, price)
# through unchanged, all in a single transformer.
X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)
c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  0. , 10.1],
       [ 0. ,  0. ,  1. ,  1. , 13.5],
       [ 1. ,  0. ,  0. ,  2. , 15.3]])

In [32]:
# get_dummies expands only the string column ('color') into indicator
# columns; the numeric 'price' and 'size' columns are kept as they are.
pd.get_dummies(df.loc[:, ['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,0,False,True,False
1,13.5,1,False,False,True
2,15.3,2,True,False,False


# Partitioning a dataset into separate training and test datasets

In [34]:
# Wine dataset from the UCI repository. header=None tells pandas not to treat
# the first line as column names, so columns get integer labels for now.
wine_url = ('https://archive.ics.uci.edu/ml/'
            'machine-learning-databases/wine/wine.data')
df_wine = pd.read_csv(wine_url, header=None)

In [35]:
# Inspect the raw frame; columns are still labelled 0..13 at this point.
df_wine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [36]:
 # Replace the integer column labels with descriptive names:
 # the class label first, followed by the 13 feature names.
 df_wine.columns = [
     'Class label',
     'Alcohol',
     'Malic acid',
     'Ash',
     'Alcalinity of ash',
     'Magnesium',
     'Total phenols',
     'Flavanoids',
     'Nonflavanoid phenols',
     'Proanthocyanins',
     'Color intensity',
     'Hue',
     'OD280/OD315 of diluted wines',
     'Proline',
 ]

In [37]:
# Distinct class labels present in the dataset (sorted).
np.unique(df_wine['Class label'])

array([1, 2, 3])

In [38]:
 # Preview the first five rows now that the columns carry descriptive names.
 df_wine.head(n=5)

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [39]:
from sklearn.model_selection import train_test_split
# Column 0 is the class label; the remaining columns are the features.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

# Hold out 30% of the rows for testing. stratify=y splits in a stratified
# fashion using the class labels; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

# Bringing features onto the same scale