In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the raw data; the Age and Salary columns each contain one missing value
dataset = pd.read_csv("Data.csv")
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
# NaN entries are skipped by default, so this is the mean of the observed salaries only
dataset['Salary'].mean()

63777.77777777778

## 1. Taking care of missing data
1. Using Pandas

With `DataFrame.mean`, `axis=0` computes the mean of each column, while `axis=1` computes the mean of each row.

Ref links:  
- [Pandas mean calculation for missing values](https://www.geeksforgeeks.org/python-pandas-dataframe-mean/)
- [Pandas Doc](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html)
- [Using apply fn](https://stackoverflow.com/questions/18689823/pandas-dataframe-replace-nan-values-with-average-of-columns)

In [4]:
# Boolean mask: True where the salary entry is missing
dataset['Salary'].isna()

0    False
1    False
2    False
3    False
4     True
5    False
6    False
7    False
8    False
9    False
Name: Salary, dtype: bool

In [5]:
# Replace the missing salary with the mean of the observed salaries
dataset['Salary'] = dataset['Salary'].fillna(
    value=dataset['Salary'].mean()
)
dataset['Salary']

0    72000.000000
1    48000.000000
2    54000.000000
3    61000.000000
4    63777.777778
5    58000.000000
6    52000.000000
7    79000.000000
8    83000.000000
9    67000.000000
Name: Salary, dtype: float64

In [7]:
# Same treatment for Age: the missing entry becomes the mean of the observed ages
dataset['Age'] = dataset['Age'].fillna(
    value=dataset['Age'].mean()
)
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## 1. Taking care of missing data
2. Using sklearn

In [10]:
# Re-read the CSV so the sklearn walkthrough starts from the raw data again:
# the pandas section above overwrote the NaNs in `dataset` with column means.
dataset = pd.read_csv('Data.csv')

# Features are every column except the last; the target is the last column.
# Selecting the target with -1 (rather than a hard-coded position) keeps it
# complementary to the `:-1` feature slice.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [11]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

[sklearn SimpleImputer docs](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)

In [9]:
from sklearn.impute import SimpleImputer

In [12]:
# Mean imputation: entries equal to np.nan are the ones treated as missing
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer

SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)

In [13]:
# Fit on the numeric columns only (Age, Salary); column 0 holds country names.
# fit() hands back the same estimator, so there is no need to re-bind the name.
imputer.fit(X[:, 1:3])
imputer

SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)

In [14]:
# Overwrite the Age and Salary columns (positions 1 and 2) with their imputed values
X[:, [1, 2]] = imputer.transform(X[:, [1, 2]])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)