# Handling missing values

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import linalg as LAs

In [2]:
# Fix the seed of NumPy's global random generator so the draws below are repeatable.
np.random.seed(seed=7)

In [5]:
N = 100  # sample size shared by both columns
x = np.random.standard_normal(size=N)  # N draws from the standard normal distribution
y = np.random.random_sample(size=N)    # N draws from the uniform distribution on [0, 1)

In [8]:
# Assemble the two samples into a two-column frame and preview the first rows.
# The normal sample is bound to lowercase `x`; an uppercase `X` is never defined
# in this notebook and would raise NameError on a fresh kernel.
df = pd.DataFrame({'x' : x, 'y' : y})
df.head()

Unnamed: 0,x,y
0,0.27446,0.696696
1,-1.526525,0.427053
2,1.6507,0.13457
3,0.154336,0.331357
4,-0.38714,0.590346


In [9]:
# Knock out one value in each column to create missing data to work with.
df.loc[3, "x"] = None    # None written into the numeric column shows up as missing
df.loc[5, "y"] = np.nan

In [10]:
# Show rows 0-5, which include both of the entries that were set to missing.
df.head(n=6)

Unnamed: 0,x,y
0,0.27446,0.696696
1,-1.526525,0.427053
2,1.6507,0.13457
3,,0.331357
4,-0.38714,0.590346
5,2.029072,


In [14]:
# Boolean mask of missing entries (True = missing) for the first six rows.
df.head(6).isna()

Unnamed: 0,x,y
0,False,False
1,False,False
2,False,False
3,True,False
4,False,False
5,False,True


In [15]:
# Drop every row that has at least one NaN; df itself is modified.
df.dropna(axis="index", how="any", inplace=True)

In [18]:
# Preview after dropping: labels 3 and 5 are gone and the index is not renumbered.
df.head(n=5)

Unnamed: 0,x,y
0,0.27446,0.696696
1,-1.526525,0.427053
2,1.6507,0.13457
4,-0.38714,0.590346
6,-0.045386,0.992558


In [19]:
# NOTE(review): label 3 was dropped by the dropna call above, so this looks like it
# re-creates row 3 via setting-with-enlargement, leaving y missing there too — confirm.
df.loc[3, 'x'] = None

In [21]:
# Mean of y over the rows where y is present; serves as the fill value below.
mean_y = df['y'].dropna().mean()
mean_y

0.5069392886343348

In [25]:
# Impute missing y values with the mean of the observed ones.
# The result is assigned back to the column: calling an inplace method on
# df['y'] is chained assignment, which pandas warns about and which stops
# updating df once Copy-on-Write is enabled.
# fillna targets missing values directly, so nothing has to be passed as the
# value to replace (np.nan vs. None).
df['y'] = df['y'].fillna(mean_y)
df

Unnamed: 0,x,y
0,0.274460,0.696696
1,-1.526525,0.427053
2,1.650700,0.134570
4,-0.387140,0.590346
6,-0.045386,0.992558
...,...,...
96,-1.233524,0.542632
97,0.182901,0.457599
98,0.022245,0.895367
99,-0.429069,0.057254


In [26]:
# Constant imputation: every missing x becomes 1.
df.loc[df["x"].isnull(), "x"] = 1

In [27]:
# Display the frame after both columns have been imputed.
df

Unnamed: 0,x,y
0,0.274460,0.696696
1,-1.526525,0.427053
2,1.650700,0.134570
4,-0.387140,0.590346
6,-0.045386,0.992558
...,...,...
96,-1.233524,0.542632
97,0.182901,0.457599
98,0.022245,0.895367
99,-0.429069,0.057254


In [None]:
# Example - replacing class labels with numeric values

# data['target'].replace({'Yes' : 1, 'No' : 0}, inplace = True)

In [28]:
# Example - concatenating columns

# data = pd.concat([col_1, col_2, col_3], axis = 1)

In [None]:
# Example - removing columns

# data.drop(['col_1', 'col_2'], axis = 1, inplace = True)

In [29]:
# Example - one-hot encoding

# col_1_dummies = pd.get_dummies(data['col_1'], prefix = 'col_1', drop_first = True)

In [None]:
# Note: If the dataset contains both continuous and categorical features, only the continuous features should be scaled.