In [6]:
import numpy as np
import pandas as pd

In [7]:
data = pd.read_csv("data.csv")
data

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types
0,3,15.0,45,250000,A
1,1,5.0,30,200000,B
2,2,10.0,38,150000,AB
3,1,,36,180000,O


# **Pandas' Approach to replace the missing values of only one attribute**

# **Option 1: Get rid of the corresponding districts.**

In [8]:
data.dropna(subset=["Working Experience(years)"])  

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types
0,3,15.0,45,250000,A
1,1,5.0,30,200000,B
2,2,10.0,38,150000,AB


# **Option 2: Get rid of the whole attribute.**

In [9]:
data.drop("Working Experience(years)", axis=1) 

Unnamed: 0,Number of Kids,Age,Salary,Blood Types
0,3,45,250000,A
1,1,30,200000,B
2,2,38,150000,AB
3,1,36,180000,O


# **Option 3: Set the missing values to some value (zero the mean, the median,etc.) on the training set**

# **Set the missing values to zero**

In [10]:
data = pd.read_csv("data.csv")
data["Working Experience(years)"].fillna(0, inplace=True)
data

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types
0,3,15.0,45,250000,A
1,1,5.0,30,200000,B
2,2,10.0,38,150000,AB
3,1,0.0,36,180000,O


# **Set the missing values to Mean**

In [11]:
data["Working Experience(years)"].fillna(data["Working Experience(years)"]
                                         .mean(),inplace=True)
data

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types
0,3,15.0,45,250000,A
1,1,5.0,30,200000,B
2,2,10.0,38,150000,AB
3,1,0.0,36,180000,O


# **Set the missing values to Median**

In [12]:
data = pd.read_csv("data.csv")
data["Working Experience(years)"].fillna(data["Working Experience(years)"]
                                         .median(),inplace=True)
data

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types
0,3,15.0,45,250000,A
1,1,5.0,30,200000,B
2,2,10.0,38,150000,AB
3,1,10.0,36,180000,O


# **Sciki-Learn's Approach to replace missing values of all attributes: Imputer**

# Since the median can only be computed on numerical attributes, we need to create a copy of the data without the text attribute Blood Types

In [13]:
data = pd.read_csv("data.csv")
print(data)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
#Since the median can only be computed on numerical attributes drop Blood Types
no_of_kids = data.drop("Blood Types", axis=1) 
imputer.fit(no_of_kids)
X = imputer.transform(no_of_kids)
missingValue = pd.DataFrame(X, columns=no_of_kids.columns)
print(missingValue)

   Number of Kids  Working Experience(years)  Age  Salary Blood Types
0               3                       15.0   45  250000           A
1               1                        5.0   30  200000           B
2               2                       10.0   38  150000          AB
3               1                        NaN   36  180000           O
   Number of Kids  Working Experience(years)   Age    Salary
0             3.0                       15.0  45.0  250000.0
1             1.0                        5.0  30.0  200000.0
2             2.0                       10.0  38.0  150000.0
3             1.0                       10.0  36.0  180000.0


# **Finding Correlation between two variables**

In [14]:
# Compare to both attributes, Age attribute is more related to "Number of Kids"
data["Number of Kids"].corr(data["Age"]) 

0.9147673836616229

In [22]:
data["Number of Kids"].corr(data["Working Experience(years)"])

0.8528028654224418

# **Convertion from text categories to One-Hot Vectors**

# Step 1: Convert from text categories to integer categories

In [23]:
blood_types_encoded,blood_types_categories=data["Blood Types"].factorize()
print(blood_types_encoded)
print(blood_types_categories)

[0 1 2 3]
Index(['A', 'B', 'AB', 'O'], dtype='object')


# Step 2: Convert from integer categories to One-Hot Vectors

In [24]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
blood_type_1hot = encoder.fit_transform(blood_types_encoded.reshape(-1,1))
blood_type_1hot

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [25]:
data["Working Experience(years)"].fillna(data["Working Experience(years)"]
                                         .median(),inplace=True)
one_hot_Vector=pd.get_dummies(data,columns=["Blood Types"])
one_hot_Vector


Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types_A,Blood Types_AB,Blood Types_B,Blood Types_O
0,3,15.0,45,250000,1,0,0,0
1,1,5.0,30,200000,0,0,1,0
2,2,10.0,38,150000,0,1,0,0
3,1,10.0,36,180000,0,0,0,1


# **Feature Scaling**

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
print(scaler.fit(one_hot_Vector))
print(scaler.mean_)
print(scaler.transform(one_hot_Vector))

StandardScaler()
[1.750e+00 1.000e+01 3.725e+01 1.950e+05 2.500e-01 2.500e-01 2.500e-01
 2.500e-01]
[[ 1.50755672  1.41421356  1.44695609  1.5109662   1.73205081 -0.57735027
  -0.57735027 -0.57735027]
 [-0.90453403 -1.41421356 -1.35360408  0.13736056 -0.57735027 -0.57735027
   1.73205081 -0.57735027]
 [ 0.30151134  0.          0.14002801 -1.23624508 -0.57735027  1.73205081
  -0.57735027 -0.57735027]
 [-0.90453403  0.         -0.23338001 -0.41208169 -0.57735027 -0.57735027
  -0.57735027  1.73205081]]


# StandardScaler for Standardization.

In [27]:
#standardization does not bound values to a specific range
scaler = StandardScaler() 
Standard_Scaled = pd.DataFrame(scaler.fit_transform(one_hot_Vector),
columns=["Number of Kids","Working Experience(years)","Age","Salary",
         "Blood Types_A","Blood Types_AB","Blood Type_B","Blood Types_O"])
Standard_Scaled

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types_A,Blood Types_AB,Blood Type_B,Blood Types_O
0,1.507557,1.414214,1.446956,1.510966,1.732051,-0.57735,-0.57735,-0.57735
1,-0.904534,-1.414214,-1.353604,0.137361,-0.57735,-0.57735,1.732051,-0.57735
2,0.301511,0.0,0.140028,-1.236245,-0.57735,1.732051,-0.57735,-0.57735
3,-0.904534,0.0,-0.23338,-0.412082,-0.57735,-0.57735,-0.57735,1.732051


# Min-Max Scaling

In [28]:
 # Min-max scaling values are shifted and rescaled so that they end up ranging from 0 to 1.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
Minmax_scaled = pd.DataFrame(scaler.fit_transform(one_hot_Vector),
columns=["Number of Kids","Working Experience(years)","Age","Salary",
         "Blood Types_A","Blood Types_AB","Blood Type_B","Blood Types_O"])
Minmax_scaled

Unnamed: 0,Number of Kids,Working Experience(years),Age,Salary,Blood Types_A,Blood Types_AB,Blood Type_B,Blood Types_O
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.5,0.0,0.0,1.0,0.0
2,0.5,0.5,0.533333,0.0,0.0,1.0,0.0,0.0
3,0.0,0.5,0.4,0.3,0.0,0.0,0.0,1.0
