In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

# Dropping the columns that have more than 50% null values

In [2]:
# Load the raw CBC export into a DataFrame.
# index_col=False --> do not use the first column of the file as the index.
cbc = pd.read_csv("ResearchCBC_data_final.csv", index_col=False)
# Bare last expression: renders the loaded frame as the cell output.
cbc

Unnamed: 0,Code,Nickname,Analyzer ID,Date,Time,Rack,Position,Sample No.,Sample Inf.,Order Type,...,[MN%(%)],[MN%/M],[PMN%(%)],[PMN%/M],[HF%(/100WBC)],[HF%/M],[TC#(10^3/uL)],[TC#/M],[HPC%(%)],[HPC%/M]
0,99,XN-1000-1-A,XN-20^11551,10/7/2017,10:28:14,1,2.0,NBD/1617/021437,M,Initial,...,,,,,,,,,,
1,99,XN-1000-1-A,XN-20^11551,9/1/2014,11:18:16,17,1.0,3D-5,M,Initial,...,,,,,,,,,,
2,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:49:23,1,1.0,ERR000000000001,M,Initial,...,,,,,,,,,,
3,99,XN-1000-1-A,XN-20^11551,6/29/2016,10:57:59,16,1.0,3D-143,M,Initial,...,,,,,,,,,,
4,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:56:06,2,1.0,3D-69,M,Initial,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,XN-1000-1-A,XN-20^11551,1/13/2014,10:47:03,11,1.0,3D-62,M,Initial,...,,,,,,,,,,
1413,1,XN-1000-1-A,XN-20^11551,1/10/2014,10:11:38,9,1.0,3D-52,M,Initial,...,,,,,,,,,,
1414,1,XN-1000-1-A,XN-20^11551,1/8/2014,10:06:16,7,1.0,3D-40,M,Initial,...,,,,,,,,,,
1415,1,XN-1000-1-A,XN-20^11551,1/6/2014,11:38:28,19,2.0,3D-28,A,Initial,...,,,,,,,,,,


### Saving Null values percentage of each column in csv

In [3]:
# Number of missing values in every column of cbc (column-wise sum of the NaN mask).
null_counts = cbc.isna().sum(axis=0)
print(null_counts)

Code                 0
Nickname             0
Analyzer ID          0
Date                 0
Time                 0
                  ... 
[HF%/M]           1417
[TC#(10^3/uL)]       0
[TC#/M]           1417
[HPC%/M]          1417
Length: 434, dtype: int64


In [4]:
# Percentage of null values in each column.
# The denominator is the actual row count rather than a hard-coded 1417, so the
# figure stays correct if the input file gains or loses rows.
print(cbc.isna().sum(axis=0) / len(cbc) * 100)

Code                0.0
Nickname            0.0
Analyzer ID         0.0
Date                0.0
Time                0.0
                  ...  
[HF%/M]           100.0
[TC#(10^3/uL)]      0.0
[TC#/M]           100.0
[HPC%/M]          100.0
Length: 434, dtype: float64


In [5]:
# Per-column null percentage before any column is dropped.
# Uses len(cbc) instead of the literal 1417 so it tracks the real row count.
percen_total_jiya = cbc.isna().sum(axis=0) / len(cbc) * 100

In [6]:
# Keep a reference to the column labels of the full (not yet filtered) frame.
columns_name = cbc.columns
# Bare last expression: shows the column Index as the cell output.
columns_name

Index(['Code', 'Nickname', 'Analyzer ID', 'Date', 'Time', 'Rack', 'Position',
       'Sample No.', 'Sample Inf.', 'Order Type',
       ...
      dtype='object', length=434)

In [7]:
# Persist the per-column null percentages (computed before dropping columns) to disk.
percen_total_jiya.to_csv("percen_total_jiya.csv")

### Dropping columns based on the percentage of null values

In [8]:
# Keep only the columns in which at least half of the rows are non-null.
# `thresh` is the minimum number of non-null values a column needs to survive.
# dropna documents `thresh` as an int, so the 50% cut-off is rounded up
# explicitly (len(cbc) * .5 is fractional for an odd row count).
thresh = int(np.ceil(len(cbc) * .5))
# Rebind instead of mutating in place so the cell is a plain transform of cbc.
cbc = cbc.dropna(thresh=thresh, axis=1)

### Saving Null values percentage of each column in csv after dropping columns

In [9]:
# Per-column null percentage after the sparse columns were dropped.
# Uses len(cbc) instead of the literal 1417 so it tracks the real row count.
percen_after_jiya = cbc.isna().sum(axis=0) / len(cbc) * 100

In [10]:
# Persist the per-column null percentages of the column-filtered frame to disk.
percen_after_jiya.to_csv("percen_after_jiya.csv")

In [11]:
# Show the remaining columns together with their null percentage.
percen_after_jiya

Code              0.0
Nickname          0.0
Analyzer ID       0.0
Date              0.0
Time              0.0
                 ... 
[HF%(/100WBC)]    0.0
[TC#(10^3/uL)]    0.0
Length: 181, dtype: float64

In [12]:
# Write the column-filtered frame to disk, including the row index.
# The original passed the *string* 'False' for `index`; a non-empty string is
# truthy, so the index was written anyway. The cell that reloads this file
# strips that extra leading column positionally (dataset.iloc[:, 1:]), so the
# index is still written here, but with an explicit boolean.
# NOTE(review): switching to index=False is only safe once that positional
# drop is removed; otherwise it would discard the 'Code' column instead.
cbc.to_csv('fifty_percen_dataset.csv', index=True)

# Reading the processed CSV (columns with more than 50% null values already dropped)

In [13]:
# Reload the column-filtered dataset written by the previous cell.
# NOTE(review): that file was written together with its row index, so the frame
# read back here appears to carry it as an extra leading unnamed column — confirm
# against the displayed output.
dataset = pd.read_csv('fifty_percen_dataset.csv')
# Bare last expression: renders the reloaded frame as the cell output.
dataset

Unnamed: 0.1,Unnamed: 0,Code,Nickname,Analyzer ID,Date,Time,Rack,Position,Sample No.,Sample Inf.,...,[MONO%(%)],[EO%(%)],[MN#(10^3/uL)],[PMN#(10^3/uL)],[HF#(10^3/uL)],[MN%(%)],[PMN%(%)],[HF%(/100WBC)],[TC#(10^3/uL)],[HPC%(%)]
0,0,99,XN-1000-1-A,XN-20^11551,10/7/2017,10:28:14,1,2.0,NBD/1617/021437,M,...,,,,,,,,,,
1,1,99,XN-1000-1-A,XN-20^11551,9/1/2014,11:18:16,17,1.0,3D-5,M,...,,,,,,,,,,
2,2,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:49:23,1,1.0,ERR000000000001,M,...,,,,,,,,,,
3,3,99,XN-1000-1-A,XN-20^11551,6/29/2016,10:57:59,16,1.0,3D-143,M,...,,,,,,,,,,
4,4,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:56:06,2,1.0,3D-69,M,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1412,1,XN-1000-1-A,XN-20^11551,1/13/2014,10:47:03,11,1.0,3D-62,M,...,,,,,,,,,,
1413,1413,1,XN-1000-1-A,XN-20^11551,1/10/2014,10:11:38,9,1.0,3D-52,M,...,,,,,,,,,,
1414,1414,1,XN-1000-1-A,XN-20^11551,1/8/2014,10:06:16,7,1.0,3D-40,M,...,,,,,,,,,,
1415,1415,1,XN-1000-1-A,XN-20^11551,1/6/2014,11:38:28,19,2.0,3D-28,A,...,,,,,,,,,,


In [14]:
# Discard the index column that was written to the CSV and read back as data.
# pandas labels such header-less columns 'Unnamed: <n>'. Selecting by that label
# instead of by position (iloc[:, 1:]) means a real feature column is never lost
# if the file is ever written without its index.
cbc = dataset.loc[:, ~dataset.columns.str.startswith('Unnamed')]
cbc

Unnamed: 0,Code,Nickname,Analyzer ID,Date,Time,Rack,Position,Sample No.,Sample Inf.,Order Type,...,[MONO%(%)],[EO%(%)],[MN#(10^3/uL)],[PMN#(10^3/uL)],[HF#(10^3/uL)],[MN%(%)],[PMN%(%)],[HF%(/100WBC)],[TC#(10^3/uL)],[HPC%(%)]
0,99,XN-1000-1-A,XN-20^11551,10/7/2017,10:28:14,1,2.0,NBD/1617/021437,M,Initial,...,,,,,,,,,,
1,99,XN-1000-1-A,XN-20^11551,9/1/2014,11:18:16,17,1.0,3D-5,M,Initial,...,,,,,,,,,,
2,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:49:23,1,1.0,ERR000000000001,M,Initial,...,,,,,,,,,,
3,99,XN-1000-1-A,XN-20^11551,6/29/2016,10:57:59,16,1.0,3D-143,M,Initial,...,,,,,,,,,,
4,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:56:06,2,1.0,3D-69,M,Initial,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,XN-1000-1-A,XN-20^11551,1/13/2014,10:47:03,11,1.0,3D-62,M,Initial,...,,,,,,,,,,
1413,1,XN-1000-1-A,XN-20^11551,1/10/2014,10:11:38,9,1.0,3D-52,M,Initial,...,,,,,,,,,,
1414,1,XN-1000-1-A,XN-20^11551,1/8/2014,10:06:16,7,1.0,3D-40,M,Initial,...,,,,,,,,,,
1415,1,XN-1000-1-A,XN-20^11551,1/6/2014,11:38:28,19,2.0,3D-28,A,Initial,...,,,,,,,,,,


In [15]:
# Replace the row index with a fresh default one; drop=True discards the old
# index instead of inserting it as a column.
cbc = cbc.reset_index(drop=True)
# Bare last expression: renders the frame as the cell output.
cbc

Unnamed: 0,Code,Nickname,Analyzer ID,Date,Time,Rack,Position,Sample No.,Sample Inf.,Order Type,...,[MONO%(%)],[EO%(%)],[MN#(10^3/uL)],[PMN#(10^3/uL)],[HF#(10^3/uL)],[MN%(%)],[PMN%(%)],[HF%(/100WBC)],[TC#(10^3/uL)],[HPC%(%)]
0,99,XN-1000-1-A,XN-20^11551,10/7/2017,10:28:14,1,2.0,NBD/1617/021437,M,Initial,...,,,,,,,,,,
1,99,XN-1000-1-A,XN-20^11551,9/1/2014,11:18:16,17,1.0,3D-5,M,Initial,...,,,,,,,,,,
2,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:49:23,1,1.0,ERR000000000001,M,Initial,...,,,,,,,,,,
3,99,XN-1000-1-A,XN-20^11551,6/29/2016,10:57:59,16,1.0,3D-143,M,Initial,...,,,,,,,,,,
4,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:56:06,2,1.0,3D-69,M,Initial,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,XN-1000-1-A,XN-20^11551,1/13/2014,10:47:03,11,1.0,3D-62,M,Initial,...,,,,,,,,,,
1413,1,XN-1000-1-A,XN-20^11551,1/10/2014,10:11:38,9,1.0,3D-52,M,Initial,...,,,,,,,,,,
1414,1,XN-1000-1-A,XN-20^11551,1/8/2014,10:06:16,7,1.0,3D-40,M,Initial,...,,,,,,,,,,
1415,1,XN-1000-1-A,XN-20^11551,1/6/2014,11:38:28,19,2.0,3D-28,A,Initial,...,,,,,,,,,,


In [16]:
# Discard every column whose label begins with 'Unnamed'.
is_unnamed = cbc.columns.str.contains('^Unnamed')
cbc = cbc.loc[:, ~is_unnamed]
cbc

Unnamed: 0,Code,Nickname,Analyzer ID,Date,Time,Rack,Position,Sample No.,Sample Inf.,Order Type,...,[MONO%(%)],[EO%(%)],[MN#(10^3/uL)],[PMN#(10^3/uL)],[HF#(10^3/uL)],[MN%(%)],[PMN%(%)],[HF%(/100WBC)],[TC#(10^3/uL)],[HPC%(%)]
0,99,XN-1000-1-A,XN-20^11551,10/7/2017,10:28:14,1,2.0,NBD/1617/021437,M,Initial,...,,,,,,,,,,
1,99,XN-1000-1-A,XN-20^11551,9/1/2014,11:18:16,17,1.0,3D-5,M,Initial,...,,,,,,,,,,
2,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:49:23,1,1.0,ERR000000000001,M,Initial,...,,,,,,,,,,
3,99,XN-1000-1-A,XN-20^11551,6/29/2016,10:57:59,16,1.0,3D-143,M,Initial,...,,,,,,,,,,
4,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:56:06,2,1.0,3D-69,M,Initial,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,XN-1000-1-A,XN-20^11551,1/13/2014,10:47:03,11,1.0,3D-62,M,Initial,...,,,,,,,,,,
1413,1,XN-1000-1-A,XN-20^11551,1/10/2014,10:11:38,9,1.0,3D-52,M,Initial,...,,,,,,,,,,
1414,1,XN-1000-1-A,XN-20^11551,1/8/2014,10:06:16,7,1.0,3D-40,M,Initial,...,,,,,,,,,,
1415,1,XN-1000-1-A,XN-20^11551,1/6/2014,11:38:28,19,2.0,3D-28,A,Initial,...,,,,,,,,,,


In [17]:
# Columns the original author noted as already dropped at this point:
#   'Error(Func.)', 'Error(Result)', 'Rule Result', 'Action Message (Check)',
#   'Birth', 'Sex', 'Patient Comment', 'Ward Name', 'Doctor Name'

# Columns removed here by name.
columns_to_discard = [
    'Nickname', 'Analyzer ID', 'Date', 'Time', 'Rack', 'Position',
    'Sample No.', 'Sample Inf.', 'Order Type', 'Reception Date',
    'Measurement Mode', 'Patient ID', 'Analysis Info.', 'Order Info.',
    'WBC Info.', 'PLT Info.', 'Validate', 'Validator', 'Action Message (Review)',
    'Action Message (Retest)', 'Sample Comment', 'Patient Name',
    'Output', 'Sequence No.', 'Discrete', 'Q-Flag(Blasts/Abn Lympho?)',
    'Q-Flag(Blasts?)', 'Q-Flag(Abn Lympho?)',
]
cbc = cbc.drop(columns=columns_to_discard)

In [18]:
# Show the frame after the columns listed in the previous cell were dropped.
cbc

Unnamed: 0,Code,Judgment,Positive(Morph.),Positive(Count),RBC Abnormal,PLT Abnormal,Q-Flag(Left Shift?),Q-Flag(Atypical Lympho?),Q-Flag(RBC Agglutination?),Q-Flag(Turbidity/HGB Interf?),...,[MONO%(%)],[EO%(%)],[MN#(10^3/uL)],[PMN#(10^3/uL)],[HF#(10^3/uL)],[MN%(%)],[PMN%(%)],[HF%(/100WBC)],[TC#(10^3/uL)],[HPC%(%)]
0,99,Negative,,,,,10,0,60,90,...,,,,,,,,,,
1,99,Negative,,,,,0,60,70,90,...,,,,,,,,,,
2,99,Negative,,,,,0,0,60,90,...,,,,,,,,,,
3,99,Negative,,,,,0,30,60,80,...,,,,,,,,,,
4,99,Negative,,,,,0,0,60,90,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,Positive,Morph.,Count,1.0,1.0,0,40,80,100,...,,,,,,,,,,
1413,1,Positive,Morph.,Count,1.0,1.0,10,0,70,90,...,,,,,,,,,,
1414,1,Positive,Morph.,Count,1.0,1.0,0,30,50,80,...,,,,,,,,,,
1415,1,Positive,Morph.,Count,1.0,1.0,0,10,40,80,...,,,,,,,,,,


In [19]:
# Columns described by the original author as 100% null yet not removed by the
# null-threshold step; they are dropped explicitly by name.
# NOTE(review): presumably they hold blank placeholders rather than real NaN,
# which is why dropna kept them — confirm against the raw file.
empty_columns = [
    'WBC-BF(10^3/uL)', 'RBC-BF(10^6/uL)', 'MN#(10^3/uL)', 'PMN#(10^3/uL)',
    'MN%(%)', 'PMN%(%)', 'TC-BF#(10^3/uL)',
    '[HF-BF#(10^3/uL)]', '[HF-BF%(/100WBC)]',
    '[NE-BF#(10^3/uL)]', '[NE-BF%(%)]',
    '[LY-BF#(10^3/uL)]', '[LY-BF%(%)]',
    '[MO-BF#(10^3/uL)]', '[MO-BF%(%)]',
    '[EO-BF#(10^3/uL)]', '[EO-BF%(%)]',
    '[RBC-BF2(10^6/uL)]', 'HPC#(10^3/uL)',
    '[EO%(%)]', '[MN#(10^3/uL)]', '[PMN#(10^3/uL)]',
    '[WBC(10^3/uL)]', '[RBC(10^6/uL)]', '[RBC-I(10^6/uL)]', '[RBC-O(10^6/uL)]',
    '[NEUT#(10^3/uL)]', '[LYMPH#(10^3/uL)]', '[MONO#(10^3/uL)]', '[EO#(10^3/uL)]',
    '[NEUT%(%)]', '[LYMPH%(%)]', '[MONO%(%)]',
    '[HF#(10^3/uL)]', '[MN%(%)]', '[PMN%(%)]', '[HF%(/100WBC)]',
    '[TC#(10^3/uL)]', '[HPC%(%)]',
]
cbc = cbc.drop(columns=empty_columns)

In [20]:
# Show the frame after the explicitly listed empty columns were dropped.
cbc

Unnamed: 0,Code,Judgment,Positive(Morph.),Positive(Count),RBC Abnormal,PLT Abnormal,Q-Flag(Left Shift?),Q-Flag(Atypical Lympho?),Q-Flag(RBC Agglutination?),Q-Flag(Turbidity/HGB Interf?),...,[RET-UPP],[RET-TNC],[PLT-F(10^3/uL)],[H-IPF(%)],[IPF#(10^3/uL)],[HGB-O(g/dL)],[PLT-F2(10^3/uL)],Q-Flag(pRBC?),[Delta-HGB(g/dL)],[MCHC-O(g/dL)]
0,99,Negative,,,,,10,0,60,90,...,0,70,345,0.2,2.4,13.2,149.4,,,
1,99,Negative,,,,,0,60,70,90,...,1,62,250,0.5,4.3,11.7,224,,,
2,99,Negative,,,,,0,0,60,90,...,0,54,311,0.8,8.1,10.6,293.8,,,
3,99,Negative,,,,,0,30,60,80,...,1,47,202,1.6,9.9,11.3,283.9,,,
4,99,Negative,,,,,0,0,60,90,...,0,51,311,0.7,8.4,12.2,108.9,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,Positive,Morph.,Count,1.0,1.0,0,40,80,100,...,18,40,12,0.6,0.3,7.3,221,0.0,0.8,26.1
1413,1,Positive,Morph.,Count,1.0,1.0,10,0,70,90,...,89,697,15,2.5,0.9,12.3,335.8,0.0,0.2,32.9
1414,1,Positive,Morph.,Count,1.0,1.0,0,30,50,80,...,0,82,245,2.8,18.1,5,51.4,0.0,0.8,30.7
1415,1,Positive,Morph.,Count,1.0,1.0,0,10,40,80,...,0,80,232,1.1,8.8,7.9,278.4,0.0,0.3,28.9


In [21]:
# Snapshot of the frame before the categorical columns are encoded;
# index=False keeps the row index out of the file.
cbc.to_csv('For_categorical.csv', index=False)

In [22]:
# Flag columns in which a missing entry is replaced by 0.
catag_data = ['RBC Abnormal','PLT Abnormal']

flags_filled = cbc[catag_data].fillna(0)
cbc[catag_data] = flags_filled
cbc

Unnamed: 0,Code,Judgment,Positive(Morph.),Positive(Count),RBC Abnormal,PLT Abnormal,Q-Flag(Left Shift?),Q-Flag(Atypical Lympho?),Q-Flag(RBC Agglutination?),Q-Flag(Turbidity/HGB Interf?),...,[RET-UPP],[RET-TNC],[PLT-F(10^3/uL)],[H-IPF(%)],[IPF#(10^3/uL)],[HGB-O(g/dL)],[PLT-F2(10^3/uL)],Q-Flag(pRBC?),[Delta-HGB(g/dL)],[MCHC-O(g/dL)]
0,99,Negative,,,0.0,0.0,10,0,60,90,...,0,70,345,0.2,2.4,13.2,149.4,,,
1,99,Negative,,,0.0,0.0,0,60,70,90,...,1,62,250,0.5,4.3,11.7,224,,,
2,99,Negative,,,0.0,0.0,0,0,60,90,...,0,54,311,0.8,8.1,10.6,293.8,,,
3,99,Negative,,,0.0,0.0,0,30,60,80,...,1,47,202,1.6,9.9,11.3,283.9,,,
4,99,Negative,,,0.0,0.0,0,0,60,90,...,0,51,311,0.7,8.4,12.2,108.9,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,Positive,Morph.,Count,1.0,1.0,0,40,80,100,...,18,40,12,0.6,0.3,7.3,221,0.0,0.8,26.1
1413,1,Positive,Morph.,Count,1.0,1.0,10,0,70,90,...,89,697,15,2.5,0.9,12.3,335.8,0.0,0.2,32.9
1414,1,Positive,Morph.,Count,1.0,1.0,0,30,50,80,...,0,82,245,2.8,18.1,5,51.4,0.0,0.8,30.7
1415,1,Positive,Morph.,Count,1.0,1.0,0,10,40,80,...,0,80,232,1.1,8.8,7.9,278.4,0.0,0.3,28.9


In [23]:
# Encode the Judgment labels as integers: 'Positive' -> 1, 'Negative' -> 0.
# Any value that is not a key of the mapping becomes NaN.
judgment_codes = {'Positive': 1, 'Negative': 0}
cbc['Judgment'] = cbc['Judgment'].map(judgment_codes)

In [24]:
# Encode Positive(Morph.): the marker 'Morph.' -> 1; everything else becomes NaN.
morph_codes = {'Morph.': 1}
cbc['Positive(Morph.)'] = cbc['Positive(Morph.)'].map(morph_codes)

In [25]:
# Encode Positive(Count): the marker 'Count' -> 1; everything else becomes NaN.
count_codes = {'Count': 1}
cbc['Positive(Count)'] = cbc['Positive(Count)'].map(count_codes)

In [26]:
# Replace every NaN left in the three encoded label columns with 0
# (as suggested by the domain expert, per the original author's note).
encoded_label_cols = ['Judgment', 'Positive(Count)', 'Positive(Morph.)']
cbc[encoded_label_cols] = cbc[encoded_label_cols].fillna(value=0)

cbc

Unnamed: 0,Code,Judgment,Positive(Morph.),Positive(Count),RBC Abnormal,PLT Abnormal,Q-Flag(Left Shift?),Q-Flag(Atypical Lympho?),Q-Flag(RBC Agglutination?),Q-Flag(Turbidity/HGB Interf?),...,[RET-UPP],[RET-TNC],[PLT-F(10^3/uL)],[H-IPF(%)],[IPF#(10^3/uL)],[HGB-O(g/dL)],[PLT-F2(10^3/uL)],Q-Flag(pRBC?),[Delta-HGB(g/dL)],[MCHC-O(g/dL)]
0,99,0,0.0,0.0,0.0,0.0,10,0,60,90,...,0,70,345,0.2,2.4,13.2,149.4,,,
1,99,0,0.0,0.0,0.0,0.0,0,60,70,90,...,1,62,250,0.5,4.3,11.7,224,,,
2,99,0,0.0,0.0,0.0,0.0,0,0,60,90,...,0,54,311,0.8,8.1,10.6,293.8,,,
3,99,0,0.0,0.0,0.0,0.0,0,30,60,80,...,1,47,202,1.6,9.9,11.3,283.9,,,
4,99,0,0.0,0.0,0.0,0.0,0,0,60,90,...,0,51,311,0.7,8.4,12.2,108.9,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,1,1.0,1.0,1.0,1.0,0,40,80,100,...,18,40,12,0.6,0.3,7.3,221,0.0,0.8,26.1
1413,1,1,1.0,1.0,1.0,1.0,10,0,70,90,...,89,697,15,2.5,0.9,12.3,335.8,0.0,0.2,32.9
1414,1,1,1.0,1.0,1.0,1.0,0,30,50,80,...,0,82,245,2.8,18.1,5,51.4,0.0,0.8,30.7
1415,1,1,1.0,1.0,1.0,1.0,0,10,40,80,...,0,80,232,1.1,8.8,7.9,278.4,0.0,0.3,28.9


In [27]:
# Labels of the columns whose name contains '/M'; per the original note these
# hold sign markers such as '+' and '-'.
has_marker_suffix = cbc.columns.str.contains('/M')
signed_col = cbc.columns[has_marker_suffix].tolist()

In [28]:
# Encode the sign markers in the '/M' columns: '+' -> 1, '-' -> -1, missing -> 0.
# np.nan replaces np.NaN: the capitalised alias was removed in NumPy 2.0 and
# raises AttributeError there.
cbc[signed_col] = cbc[signed_col].replace('+' , 1)
cbc[signed_col] = cbc[signed_col].replace('-', -1)
cbc[signed_col] = cbc[signed_col].replace(np.nan , 0)
# '*' markers are deliberately left untouched here.
cbc

Unnamed: 0,Code,Judgment,Positive(Morph.),Positive(Count),RBC Abnormal,PLT Abnormal,Q-Flag(Left Shift?),Q-Flag(Atypical Lympho?),Q-Flag(RBC Agglutination?),Q-Flag(Turbidity/HGB Interf?),...,[RET-UPP],[RET-TNC],[PLT-F(10^3/uL)],[H-IPF(%)],[IPF#(10^3/uL)],[HGB-O(g/dL)],[PLT-F2(10^3/uL)],Q-Flag(pRBC?),[Delta-HGB(g/dL)],[MCHC-O(g/dL)]
0,99,0,0.0,0.0,0.0,0.0,10,0,60,90,...,0,70,345,0.2,2.4,13.2,149.4,,,
1,99,0,0.0,0.0,0.0,0.0,0,60,70,90,...,1,62,250,0.5,4.3,11.7,224,,,
2,99,0,0.0,0.0,0.0,0.0,0,0,60,90,...,0,54,311,0.8,8.1,10.6,293.8,,,
3,99,0,0.0,0.0,0.0,0.0,0,30,60,80,...,1,47,202,1.6,9.9,11.3,283.9,,,
4,99,0,0.0,0.0,0.0,0.0,0,0,60,90,...,0,51,311,0.7,8.4,12.2,108.9,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,1,1.0,1.0,1.0,1.0,0,40,80,100,...,18,40,12,0.6,0.3,7.3,221,0.0,0.8,26.1
1413,1,1,1.0,1.0,1.0,1.0,10,0,70,90,...,89,697,15,2.5,0.9,12.3,335.8,0.0,0.2,32.9
1414,1,1,1.0,1.0,1.0,1.0,0,30,50,80,...,0,82,245,2.8,18.1,5,51.4,0.0,0.8,30.7
1415,1,1,1.0,1.0,1.0,1.0,0,10,40,80,...,0,80,232,1.1,8.8,7.9,278.4,0.0,0.3,28.9


In [29]:
# Placeholder strings that stand for "no usable value"; each one is turned into
# a real NaN so the later null handling can see it.
unknown_values = ['ERROR' , '----' , '++++' , '*', '@' , '    ']
# np.nan replaces np.NaN: the capitalised alias was removed in NumPy 2.0.
cbc = cbc.replace(unknown_values, np.nan)

In [30]:
# Snapshot of the frame before the remaining nulls are filled with column means;
# index=False keeps the row index out of the file.
cbc.to_csv('Before_mean.csv', index=False)

In [31]:
# Column means of the numeric columns only.
# numeric_only=True is stated explicitly: relying on pandas to silently skip
# non-numeric columns is deprecated and raises TypeError from pandas 2.0 on.
cbc.mean(numeric_only=True)

  cbc.mean()


Code                 13.779817
Judgment              0.845448
Positive(Morph.)      0.775582
Positive(Count)       0.671136
RBC Abnormal          0.583627
                       ...    
[RET-UPP]             8.818631
[RET-TNC]           162.601270
Q-Flag(pRBC?)         1.824427
Length: 92, dtype: float64

In [32]:
# Fill the remaining nulls of every numeric column with that column's mean.
# numeric_only=True makes the selection explicit; without it the mean over
# non-numeric columns is deprecated and raises TypeError from pandas 2.0 on.
# Non-numeric columns have no mean and therefore keep their nulls.
# NOTE(review): the means are computed on the full dataset before the
# train/test split, so test rows influence the imputed values — confirm
# this is acceptable.
cbc = cbc.fillna(cbc.mean(numeric_only=True))

  cbc = cbc.fillna(cbc.mean())


In [34]:
# NOTE: this bare `cbc` is not the last expression of the cell, so it renders nothing.
cbc
# Snapshot of the frame after mean imputation; index=False keeps the row index out of the file.
cbc.to_csv("After_mean.csv",index=False)

In [None]:
# Count the missing values that remain in every column.
remaining_nulls = cbc.isna().sum(axis=0)
print(remaining_nulls)

In [None]:
# Candidate columns the original author listed for mean imputation (kept for reference):
# mean_col = ['Q-Flag(pRBC?)','[Delta-HGB(g/dL)]','[MCHC-O(g/dL)]', '[FRC#/M]', '[FRC%/M]',
#            '[WBC-P(10^9/L)]','[TNC-P(10^9/L)]','PDW(fL)','MPV(fL)','P-LCR(%)','PCT(%)','RDW-SD(fL)',
#             'RDW-CV(%)','MCV/M']


# Fill remaining nulls in the numeric columns with the column mean.
# The original line, `cbc.fillna(.mean(), inplace=True)`, was a syntax error
# (no object before `.mean()`); in addition, fillna(..., inplace=True) returns
# None, so assigning its result would have replaced cbc with None.
cbc = cbc.fillna(cbc.mean(numeric_only=True))

cbc

In [None]:
# Persist the imputed frame; index=False keeps the row index out of the file.
cbc.to_csv('After_filling_mean.csv', index=False)

### Splitting into train and test set

In [None]:
# Column 0 of cbc is the response vector (y); all remaining columns form the
# feature matrix (X).
y = cbc.iloc[:, 0]
X = cbc.iloc[:, 1:]

In [None]:
# Show the feature matrix.
X

In [None]:
# Show the response vector.
y

In [None]:
# Split X and y into training and test sets: 20% of the rows go to the test
# set, and random_state=1 fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Show the training features.
X_train

In [None]:
# Show the training labels.
y_train

In [None]:
# for filling null values in column that have some numeric values in it i.e. some mean can be generated 
# X_train = X_train.fillna(value=X_train.mean())

In [None]:
# X_train[['PDW(fL)','MPV(fL)','P-LCR(%)','PCT(%)']] = X_train[['PDW(fL)','MPV(fL)','P-LCR(%)','PCT(%)']].fillna(value=X_train[['PDW(fL)','MPV(fL)','P-LCR(%)','PCT(%)']].mean())

# TRAINING DIFFERENT ML MODEL ON PROCESSED DATASET

# 1- Naive Bayes

In [None]:
# Metric helpers imported once here for use with all models.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a Gaussian Naive Bayes classifier on the training split.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)