In [14]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
import pandas as pd
import seaborn as sns

# 9 different datasets for classification with binary classes and numerical variables
1. 3 small datasets containing at most 10 variables
2. 6 large datasets containing more than 10 variables

## Loading Data and Removing Collinear Features

### *Small Datasets*

#### Blood Transfusion service center

In [15]:
# Loading the dataset
raw_blood_data = loadarff("./Datasets/blood-transfusion-service-center.arff")
df_blood_data = pd.DataFrame(raw_blood_data[0])

# Separating other features from target
df_blood_no_class = df_blood_data.drop('Class', axis=1)

# Removing highly correlated features
corr = df_blood_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_blood_no_class = df_blood_no_class.loc[:, mask]

# Creating X and Y
X_blood = df_blood_no_class
y_blood = df_blood_data['Class'].astype(int)

# Concatenating X and Y again
df_blood_data = pd.concat([pd.DataFrame(X_blood), pd.DataFrame(y_blood)], axis=1)


print("Summary of Blood Transfusion Service Center Dataset:")
print(df_blood_data.describe())

# sns.pairplot(df_blood_data)

Correlation Matrix of Dataset:
          V1        V2        V3        V4
V1  1.000000 -0.182745 -0.182745  0.160618
V2 -0.182745  1.000000  1.000000  0.634940
V3 -0.182745  1.000000  1.000000  0.634940
V4  0.160618  0.634940  0.634940  1.000000
Summary of Blood Transfusion Service Center Dataset:
               V1          V4       Class
count  748.000000  748.000000  748.000000
mean     9.506684   34.282086    1.237968
std      8.095396   24.376714    0.426124
min      0.000000    2.000000    1.000000
25%      2.750000   16.000000    1.000000
50%      7.000000   28.000000    1.000000
75%     14.000000   50.000000    1.000000
max     74.000000   98.000000    2.000000


#### Iris

In [16]:
# Loading dataset
raw_iris_data = loadarff("./Datasets/iris.arff")
df_iris_data = pd.DataFrame(raw_iris_data[0])

# Separating other features from target
df_iris_no_class = df_iris_data.drop('binaryClass', axis=1)

# Removing highly correlated features
corr = df_iris_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_iris_no_class = df_iris_no_class.loc[:, mask]

# Creating X and Y
X_iris = df_iris_no_class
y_iris = df_iris_data['binaryClass']

# Concatenating X and Y again
df_iris_data = pd.concat([pd.DataFrame(X_iris), pd.DataFrame(y_iris)], axis=1)

print("Summary of Iris Dataset:")
print(df_iris_data.describe())
# sns.pairplot(df_iris_data)

Correlation Matrix of Dataset:
             sepallength  sepalwidth  petallength  petalwidth
sepallength     1.000000   -0.109369     0.871754    0.817954
sepalwidth     -0.109369    1.000000    -0.420516   -0.356544
petallength     0.871754   -0.420516     1.000000    0.962757
petalwidth      0.817954   -0.356544     0.962757    1.000000
Summary of Iris Dataset:
       sepallength  sepalwidth
count   150.000000  150.000000
mean      5.843333    3.054000
std       0.828066    0.433594
min       4.300000    2.000000
25%       5.100000    2.800000
50%       5.800000    3.000000
75%       6.400000    3.300000
max       7.900000    4.400000


#### Diabetes

In [17]:
# Load diabetes dataset
raw_diabetes_data = loadarff("./Datasets/diabetes.arff")
df_diabetes_data = pd.DataFrame(raw_diabetes_data[0])

# Separating other features from target
df_diabetes_no_class = df_diabetes_data.drop('class', axis=1)


# Removing highly correlated features
corr = df_diabetes_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_diabetes_no_class = df_diabetes_no_class.loc[:, mask]

# Creating X and Y
X_diabetes = df_diabetes_no_class
y_diabetes = df_diabetes_data['class']

# Concatenating X and Y again
df_diabetes_data = pd.concat([pd.DataFrame(X_diabetes), pd.DataFrame(y_diabetes)], axis=1)

# Summary of the dataset
print("Summary of Diabetes Dataset:")
print(df_diabetes_data.describe())

# Exploratory Data Analysis
# sns.pairplot(df_diabetes_data)

Correlation Matrix of Dataset:
          preg      plas      pres      skin      insu      mass      pedi  \
preg  1.000000  0.129459  0.141282 -0.081672 -0.073535  0.017683 -0.033523   
plas  0.129459  1.000000  0.152590  0.057328  0.331357  0.221071  0.137337   
pres  0.141282  0.152590  1.000000  0.207371  0.088933  0.281805  0.041265   
skin -0.081672  0.057328  0.207371  1.000000  0.436783  0.392573  0.183928   
insu -0.073535  0.331357  0.088933  0.436783  1.000000  0.197859  0.185071   
mass  0.017683  0.221071  0.281805  0.392573  0.197859  1.000000  0.140647   
pedi -0.033523  0.137337  0.041265  0.183928  0.185071  0.140647  1.000000   
age   0.544341  0.263514  0.239528 -0.113970 -0.042163  0.036242  0.033561   

           age  
preg  0.544341  
plas  0.263514  
pres  0.239528  
skin -0.113970  
insu -0.042163  
mass  0.036242  
pedi  0.033561  
age   1.000000  
Summary of Diabetes Dataset:
             preg        plas        pres        skin        insu        mass  \
cou

#### Pollen

In [18]:
# Load pollen dataset
raw_pollen_data = loadarff("./Datasets/pollen.arff")
df_pollen_data = pd.DataFrame(raw_pollen_data[0])

# Creating a dataset without the binaryClass variable
df_pollen_no_class = df_pollen_data.drop('binaryClass', axis=1)

# Removing highly correlated features
corr = df_pollen_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.97).any()
df_pollen_no_class = df_pollen_no_class.loc[:, mask]

# Creating X and Y
X_pollen = df_pollen_no_class
y_pollen = df_pollen_data['binaryClass']

# Concatenating X and Y again
df_pollen_data = pd.concat([pd.DataFrame(X_pollen), pd.DataFrame(y_pollen)], axis=1)

# Summary of the dataset
print("Summary of Pollen Dataset:")
print(df_pollen_data.describe())

# Exploratory Data Analysis
# sns.pairplot(df_pollen_data)

Correlation Matrix of Dataset:
            RIDGE       NUB     CRACK    WEIGHT   DENSITY
RIDGE    1.000000  0.133189 -0.125529 -0.896471 -0.565084
NUB      0.133189  1.000000  0.082869 -0.171585  0.327398
CRACK   -0.125529  0.082869  1.000000  0.268195 -0.152649
WEIGHT  -0.896471 -0.171585  0.268195  1.000000  0.242196
DENSITY -0.565084  0.327398 -0.152649  0.242196  1.000000
Summary of Pollen Dataset:
             RIDGE          NUB        CRACK       WEIGHT      DENSITY
count  3848.000000  3848.000000  3848.000000  3848.000000  3848.000000
mean     -0.003637     0.000160     0.003103     0.004237     0.000166
std       6.398237     5.186311     7.875199    10.043092     3.144395
min     -23.283900   -16.393500   -31.413000   -34.035200   -12.039100
25%      -3.983725    -3.757625    -5.453275    -7.018650    -2.132450
50%      -0.163850    -0.231700    -0.056150    -0.149350    -0.030450
75%       4.647150     3.750525     5.661125     6.799800     2.028625
max      21.406600    17.2

### Large Datasets

#### KC2 

In [19]:
# Load kc2 dataset
raw_kc2_data = loadarff("./Datasets/kc2.arff")
df_kc2_data = pd.DataFrame(raw_kc2_data[0])

# Creating a dataset without the 'problems' variable
df_kc2_no_class = df_kc2_data.drop('problems', axis=1)

# Removing highly correlated features
corr = df_kc2_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.97).any()
df_kc2_no_class = df_kc2_no_class.loc[:, mask]

# Creating X and Y
X_kc2 = df_kc2_no_class
y_kc2 = df_kc2_data['problems']

# Concatenating X and Y again
df_kc2_data = pd.concat([pd.DataFrame(X_kc2), pd.DataFrame(y_kc2)], axis=1)
# Summary of the dataset
print("Summary of KC2 Dataset:")
print(df_kc2_data.describe())

# Exploratory Data Analysis
# sns.pairplot(df_kc2_data)

Correlation Matrix of Dataset:
                       loc      v(g)     ev(g)     iv(g)         n         v  \
loc               1.000000  0.964048  0.902863  0.963517  0.989529  0.975357   
v(g)              0.964048  1.000000  0.927526  0.976598  0.963279  0.958553   
ev(g)             0.902863  0.927526  1.000000  0.952305  0.912839  0.934120   
iv(g)             0.963517  0.976598  0.952305  1.000000  0.966458  0.970448   
n                 0.989529  0.963279  0.912839  0.966458  1.000000  0.991431   
v                 0.975357  0.958553  0.934120  0.970448  0.991431  1.000000   
l                -0.336253 -0.288152 -0.188257 -0.269807 -0.311270 -0.248429   
d                 0.805374  0.747592  0.641356  0.734147  0.784555  0.710863   
i                 0.890050  0.834545  0.732302  0.812713  0.885379  0.840885   
e                 0.918543  0.919532  0.939698  0.949862  0.939851  0.969166   
b                 0.967499  0.951610  0.928172  0.963684  0.983898  0.993026   
t        

#### PC3

In [20]:
raw_pc3_data = loadarff("./Datasets/pc3.arff")
df_pc3_data = pd.DataFrame(raw_pc3_data[0])
df_pc3_no_class = df_pc3_data.drop('c', axis=1)

# Removing highly correlated features
corr = df_pc3_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_pc3_no_class = df_pc3_no_class.loc[:, mask]

# Creating X and Y
X_pc3 = df_pc3_no_class
y_pc3 = df_pc3_data['c']

# Concatenating X and Y again
df_pc3_data = pd.concat([pd.DataFrame(X_pc3), pd.DataFrame(y_pc3)], axis=1)

print("Summary of PC3 Dataset:")
print(df_pc3_data.describe())
# sns.pairplot(df_pc3_data)

Correlation Matrix of Dataset:
                                 LOC_BLANK  BRANCH_COUNT  CALL_PAIRS  \
LOC_BLANK                         1.000000      0.319549    0.457082   
BRANCH_COUNT                      0.319549      1.000000    0.354603   
CALL_PAIRS                        0.457082      0.354603    1.000000   
LOC_CODE_AND_COMMENT              0.558268      0.249042    0.278199   
LOC_COMMENTS                      0.674951      0.188476    0.346894   
CONDITION_COUNT                   0.294937      0.989614    0.320521   
CYCLOMATIC_COMPLEXITY             0.328669      0.996814    0.369096   
CYCLOMATIC_DENSITY               -0.270265      0.103854   -0.210782   
DECISION_COUNT                    0.281722      0.980903    0.310616   
DECISION_DENSITY                  0.260679      0.243500    0.236261   
DESIGN_COMPLEXITY                 0.314185      0.564452    0.646359   
DESIGN_DENSITY                   -0.173296     -0.295396    0.104816   
EDGE_COUNT                       

#### Madelon

In [21]:
raw_madelon_data = loadarff("./Datasets/madelon.arff")
df_madelon_data = pd.DataFrame(raw_madelon_data[0])
df_madelon_no_class = df_madelon_data.drop('Class', axis=1)

# Removing highly correlated features
corr = df_madelon_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_madelon_no_class = df_madelon_no_class.loc[:, mask]

# Creating X and Y
X_madelon = df_madelon_no_class
y_madelon = df_madelon_data['Class'].astype(int)

# Concatenating X and Y again
df_madelon_data = pd.concat([pd.DataFrame(X_madelon), pd.DataFrame(y_madelon)], axis=1)

print("Summary of Madelon Dataset:")
print(df_madelon_data.describe())
# sns.pairplot(df_madelon_data)

Correlation Matrix of Dataset:
            V1        V2        V3        V4        V5        V6        V7  \
V1    1.000000  0.020146 -0.032128 -0.004837 -0.008465  0.017893  0.032264   
V2    0.020146  1.000000 -0.013731  0.007702  0.024493 -0.013321 -0.019353   
V3   -0.032128 -0.013731  1.000000  0.008097  0.000298 -0.004214  0.010097   
V4   -0.004837  0.007702  0.008097  1.000000 -0.003623  0.018354  0.000226   
V5   -0.008465  0.024493  0.000298 -0.003623  1.000000 -0.009436  0.001218   
...        ...       ...       ...       ...       ...       ...       ...   
V496 -0.035144  0.019448  0.048080  0.020029  0.014977  0.009258  0.004995   
V497  0.012457  0.040476  0.002344 -0.010481  0.051843  0.015354  0.021644   
V498 -0.025676 -0.004938  0.028050  0.002225 -0.001291  0.026771 -0.021079   
V499  0.012710  0.003167  0.018470 -0.022801  0.033795 -0.013333  0.027172   
V500 -0.032624  0.018204  0.018266  0.006716  0.014600  0.015195  0.005726   

            V8        V9       V

#### Elevators

In [22]:
raw_elevators_data = loadarff("./Datasets/elevators.arff")
df_elevators_data = pd.DataFrame(raw_elevators_data[0])
df_elevators_no_class = df_elevators_data.drop('binaryClass', axis=1)

# Removing highly correlated features
corr = df_elevators_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_elevators_no_class = df_elevators_no_class.loc[:, mask]

# Creating X and Y
X_elevators = df_elevators_no_class
y_elevators = df_elevators_data['binaryClass']

# Concatenating X and Y again
df_elevators_data = pd.concat([pd.DataFrame(X_elevators), pd.DataFrame(y_elevators)], axis=1)

print("Summary of Elevators Dataset:")
print(df_elevators_data.describe())
# sns.pairplot(df_elevators_data)

Correlation Matrix of Dataset:
              climbRate       Sgz         p         q   curRoll   absRoll  \
climbRate      1.000000 -0.013900  0.140025 -0.078170 -0.146812 -0.023482   
Sgz           -0.013900  1.000000  0.093184  0.351302  0.017043  0.077119   
p              0.140025  0.093184  1.000000  0.132396  0.006603  0.022430   
q             -0.078170  0.351302  0.132396  1.000000  0.051430 -0.390996   
curRoll       -0.146812  0.017043  0.006603  0.051430  1.000000  0.194465   
absRoll       -0.023482  0.077119  0.022430 -0.390996  0.194465  1.000000   
diffClb        0.187824 -0.438364 -0.117594 -0.819379 -0.158177 -0.035130   
diffRollRate   0.198840 -0.036570  0.203355 -0.179879 -0.654030 -0.144497   
diffDiffClb   -0.246951 -0.022099 -0.126757 -0.350853  0.024950 -0.004565   
SaTime1        0.094790  0.062284  0.102467 -0.438460 -0.230882  0.683132   
SaTime2        0.104038  0.070309  0.170397 -0.424942 -0.227539  0.678935   
SaTime3        0.104072  0.070308  0.170487 -

#### Japanese Vowels

In [23]:
raw_japanese_vowels_data = loadarff("./Datasets/japanese_vowels.arff")
df_japanese_vowels_data = pd.DataFrame(raw_japanese_vowels_data[0])
df_japanese_vowels_no_class = df_japanese_vowels_data.drop('binaryClass', axis=1)

# Removing highly correlated features
corr = df_japanese_vowels_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_japanese_vowels_no_class = df_japanese_vowels_no_class.loc[:, mask]

# Creating X and Y
X_japanese_vowels = df_japanese_vowels_no_class
y_japanese_vowels = df_japanese_vowels_data['binaryClass']

# Concatenating X and Y again
df_japanese_vowels_data = pd.concat([pd.DataFrame(X_japanese_vowels), pd.DataFrame(y_japanese_vowels)], axis=1)

print("Summary of Japanese Vowels Dataset:")
print(df_japanese_vowels_data.describe())
# sns.pairplot(df_japanese_vowels_data)

Correlation Matrix of Dataset:
               utterance     frame  coefficient1  coefficient2  coefficient3  \
utterance       1.000000 -0.066662     -0.241294      0.202914     -0.092642   
frame          -0.066662  1.000000     -0.254961     -0.338998      0.353423   
coefficient1   -0.241294 -0.254961      1.000000     -0.072982     -0.005785   
coefficient2    0.202914 -0.338998     -0.072982      1.000000     -0.525940   
coefficient3   -0.092642  0.353423     -0.005785     -0.525940      1.000000   
coefficient4   -0.125501  0.661498      0.066370     -0.304005      0.235137   
coefficient5    0.149719  0.578038     -0.445158      0.056637     -0.058062   
coefficient6   -0.272809  0.442318      0.315648     -0.336065      0.249107   
coefficient7    0.099232 -0.296055     -0.050390      0.388680     -0.242847   
coefficient8    0.048431 -0.661881      0.192802      0.188448     -0.011811   
coefficient9   -0.069479 -0.276070      0.169102      0.479753     -0.407619   
coefficie

#### Vehicles

In [24]:
raw_vehicle_data = loadarff("./Datasets/vehicle.arff")
df_vehicle_data = pd.DataFrame(raw_vehicle_data[0])
df_vehicle_no_class = df_vehicle_data.drop('binaryClass', axis=1)

# Removing highly correlated features
corr = df_vehicle_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_vehicle_no_class = df_vehicle_no_class.loc[:, mask]

# Creating X and Y
X_vehicle = df_vehicle_no_class
y_vehicle = df_vehicle_data['binaryClass']

# Concatenating X and Y again
df_vehicle_data = pd.concat([pd.DataFrame(X_vehicle), pd.DataFrame(y_vehicle)], axis=1)

print("Summary of Vehicle Dataset:")
print(df_vehicle_data.describe())
# sns.pairplot(df_vehicle_data)

Correlation Matrix of Dataset:
                           COMPACTNESS  CIRCULARITY  DISTANCE CIRCULARITY  \
COMPACTNESS                   1.000000     0.692869              0.792444   
CIRCULARITY                   0.692869     1.000000              0.798492   
DISTANCE CIRCULARITY          0.792444     0.798492              1.000000   
RADIUS RATIO                  0.691659     0.622778              0.771644   
PR.AXIS ASPECT RATIO          0.093222     0.149692              0.161529   
MAX.LENGTH ASPECT RATIO       0.148249     0.247467              0.264309   
SCATTER RATIO                 0.813003     0.860367              0.907280   
ELONGATEDNESS                -0.788647    -0.828755             -0.912307   
PR.AXIS RECTANGULARITY        0.813437     0.857925              0.895326   
MAX.LENGTH RECTANGULARITY     0.676143     0.965776              0.774524   
SCALED VARIANCE_MAJOR         0.764415     0.808496              0.864432   
SCALED VARIANCE_MINOR         0.818632     0.

#### Steel plate's faults

In [25]:
raw_steel_data = loadarff("./Datasets/phpOJxGL9.arff")
df_steel_data = pd.DataFrame(raw_steel_data[0])
df_steel_data[df_steel_data['V2']==b'Female']=1

df_steel_data[df_steel_data['V2']==b'Male']=0
df_steel_no_class = df_steel_data.drop('Class', axis=1)

# Removing highly correlated features
corr = df_steel_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_steel_no_class = df_steel_no_class.loc[:, mask]

# Creating X and Y
X_steel = df_steel_no_class
y_steel = df_steel_data['Class']

# Concatenating X and Y again
df_steel_data = pd.concat([pd.DataFrame(X_steel), pd.DataFrame(y_steel)], axis=1)

print("Summary of steel Dataset:")
print(df_steel_data.describe())
# sns.pairplot(df_steel_data)

Correlation Matrix of Dataset:
      V1   V2   V3   V4   V5   V6   V7   V8   V9  V10
V1   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
V2   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
V3   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
V4   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
V5   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
V6   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
V7   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
V8   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
V9   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
V10  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
Summary of steel Dataset:
        Class
count     583
unique      2
top         0
freq      441


#### Biodegradable chemicals

In [26]:
raw_biodegradable_data = loadarff("./Datasets/phpGUrE90.arff")
df_biodegradable_data = pd.DataFrame(raw_biodegradable_data[0])
df_biodegradable_no_class = df_biodegradable_data.drop('Class', axis=1)

# Removing highly correlated features
corr = df_biodegradable_no_class.corr()
print("Correlation Matrix of Dataset:")
print(corr)
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_biodegradable_no_class = df_biodegradable_no_class.loc[:, mask]

# Creating X and Y
X_biodegradable = df_biodegradable_no_class
y_biodegradable = df_biodegradable_data['Class']

# Concatenating X and Y again
df_biodegradable_data = pd.concat([pd.DataFrame(X_biodegradable), pd.DataFrame(y_biodegradable)], axis=1)

print("Summary of biodegradable Dataset:")
print(df_biodegradable_data.describe())
# sns.pairplot(df_biodegradable_data)

Correlation Matrix of Dataset:
           V1        V2        V3        V4        V5        V6        V7  \
V1   1.000000  0.232683  0.261817  0.022219  0.185560  0.446789  0.423223   
V2   0.232683  1.000000  0.067649 -0.094366 -0.186095  0.351183 -0.260477   
V3   0.261817  0.067649  1.000000  0.022182  0.003350  0.068106  0.258438   
V4   0.022219 -0.094366  0.022182  1.000000  0.449388  0.006723  0.191410   
V5   0.185560 -0.186095  0.003350  0.449388  1.000000  0.001928  0.372453   
V6   0.446789  0.351183  0.068106  0.006723  0.001928  1.000000  0.079107   
V7   0.423223 -0.260477  0.258438  0.191410  0.372453  0.079107  1.000000   
V8   0.381464 -0.300082  0.097963 -0.038907  0.146439 -0.062045  0.599244   
V9   0.202214  0.150766 -0.122079  0.024662  0.007436  0.450833  0.135601   
V10  0.260533  0.192826 -0.106941  0.033042  0.086099  0.042820  0.115968   
V11  0.183770 -0.141810 -0.016309  0.424914  0.870518 -0.020331  0.333183   
V12 -0.105155 -0.192385  0.059623 -0.052004 -

## Filling Missing Values

In [27]:
# No missing values present in the datasets
print("Number of missing values for X_blood: ", np.isnan(X_blood).sum())
print("Number of missing values for X_iris: ", np.isnan(X_iris).sum())
print("Number of missing values for X_diabetes: ", np.isnan(X_diabetes).sum())
print("Number of missing values for X_kc2: ", np.isnan(X_kc2).sum())
print("Number of missing values for X_pollen: ", np.isnan(X_pollen).sum())
print("Number of missing values for X_pc3: ", np.isnan(X_pc3).sum())
print("Number of missing values for X_madelon: ", np.isnan(X_madelon).sum())
print("Number of missing values for X_biodegradable: ", np.isnan(X_biodegradable).sum())
print("Number of missing values for X_japanese_vowels: ", np.isnan(X_japanese_vowels).sum())
print("Number of missing values for X_vehicle: ", np.isnan(X_vehicle).sum())

Number of missing values for X_blood:  V1    0
V4    0
dtype: int64
Number of missing values for X_iris:  sepallength    0
sepalwidth     0
dtype: int64
Number of missing values for X_diabetes:  preg    0
plas    0
pres    0
skin    0
insu    0
mass    0
pedi    0
age     0
dtype: int64
Number of missing values for X_kc2:  ev(g)               0
l                   0
d                   0
i                   0
lOComment           0
lOBlank             0
lOCodeAndComment    0
uniq_Op             0
uniq_Opnd           0
dtype: int64
Number of missing values for X_pollen:  RIDGE      0
NUB        0
CRACK      0
WEIGHT     0
DENSITY    0
dtype: int64
Number of missing values for X_pc3:  LOC_BLANK               0
CALL_PAIRS              0
LOC_CODE_AND_COMMENT    0
LOC_COMMENTS            0
DECISION_DENSITY        0
DESIGN_COMPLEXITY       0
DESIGN_DENSITY          0
ESSENTIAL_COMPLEXITY    0
ESSENTIAL_DENSITY       0
PARAMETER_COUNT         0
HALSTEAD_CONTENT        0
HALSTEAD_DIFFICULTY    