In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

In [2]:
def processing_df(cbc):
    cbc = cbc.reset_index(drop=True)
    
    
    # removes all un-named columns from dataset
    cbc = cbc.loc[:, ~cbc.columns.str.contains('^Unnamed')]
    
    
    # removing patient personal info columns
    cbc = cbc.drop(['Nickname', 'Analyzer ID', 'Date', 'Time', 'Rack', 'Position',
                     'Sample No.', 'Sample Inf.', 'Order Type', 'Reception Date',
                     'Measurement Mode', 'Patient ID', 'Analysis Info.', 'Error(Func.)',
                     'Error(Result)', 'Order Info.', 'WBC Info.', 'PLT Info.',
                     'Rule Result', 'Validate', 'Validator', 
                     'Action Message (Check)', 'Action Message (Review)',
                     'Action Message (Retest)', 'Sample Comment', 'Patient Name',
                     'Birth', 'Sex', 'Patient Comment', 'Ward Name', 'Doctor Name',
                     'Output', 'Sequence No.', 'Discrete', 'Q-Flag(Blasts/Abn Lympho?)',
                     'Q-Flag(Blasts?)', 'Q-Flag(Abn Lympho?)'], axis=1)

    
    # grouping numeric categorical columns
    catag_data = ['WBC Abnormal', 'WBC Suspect', 'RBC Abnormal', 'RBC Suspect',
                     'PLT Abnormal', 'PLT Suspect','IP ABN(WBC)WBC Abn Scattergram', 
                     'IP ABN(WBC)Neutropenia',
                     'IP ABN(WBC)Neutrophilia', 'IP ABN(WBC)Lymphopenia',
                     'IP ABN(WBC)Lymphocytosis', 'IP ABN(WBC)Monocytosis',
                     'IP ABN(WBC)Eosinophilia', 'IP ABN(WBC)Basophilia',
                     'IP ABN(WBC)Leukocytopenia', 'IP ABN(WBC)Leukocytosis',
                     'IP ABN(WBC)NRBC Present', 'IP ABN(WBC)IG Present',
                     'IP ABN(RBC)RBC Abn Distribution', 'IP ABN(RBC)Dimorphic Population',
                     'IP ABN(RBC)Anisocytosis', 'IP ABN(RBC)Microcytosis',
                     'IP ABN(RBC)Macrocytosis', 'IP ABN(RBC)Hypochromia',
                     'IP ABN(RBC)Anemia', 'IP ABN(RBC)Erythrocytosis',
                     'IP ABN(RBC)RET Abn Scattergram', 'IP ABN(RBC)Reticulocytosis',
                     'IP ABN(PLT)PLT Abn Distribution', 'IP ABN(PLT)Thrombocytopenia',
                     'IP ABN(PLT)Thrombocytosis', 'IP ABN(PLT)PLT Abn Scattergram',
                     'IP SUS(WBC)Blasts/Abn Lympho?', 'IP SUS(WBC)Blasts?',
                     'IP SUS(WBC)Abn Lympho?', 'IP SUS(WBC)Left Shift?',
                     'IP SUS(WBC)Atypical Lympho?', 'IP SUS(RBC)RBC Agglutination?',
                     'IP SUS(RBC)Turbidity/HGB Interf?', 'IP SUS(RBC)Iron Deficiency?',
                     'IP SUS(RBC)HGB Defect?', 'IP SUS(RBC)Fragments?',
                     'IP SUS(PLT)PLT Clumps?']

    cbc[catag_data] = cbc[catag_data].fillna(value=0)
    
    
    # grouping descriptive categorical columns
    cbc['Judgment'] = cbc['Judgment'].map({'Positive': 1, 'Negative': 0})

    cbc['Positive(Diff.)'] = cbc['Positive(Diff.)'].map({'Diff.': 1})

    cbc['Positive(Morph.)'] = cbc['Positive(Morph.)'].map({'Morph.': 1})

    cbc['Positive(Count)'] = cbc['Positive(Count)'].map({'Count': 1})

    
    # replacing all NaN vallues by 0 as per suggested by the domain expert
    cbc[['Judgment', 'Positive(Diff.)', 'Positive(Count)', 'Positive(Morph.)']] = cbc[
            ['Judgment', 'Positive(Diff.)', 'Positive(Count)', 'Positive(Morph.)']].fillna(value=0)
 

    # grouping columns having signed values only
    signed_col = list(cbc.loc[:, cbc.columns.str.contains('/M')])
    cbc[signed_col] = cbc[signed_col].replace('+' , 1)
    cbc[signed_col] = cbc[signed_col].replace('-', -1)
    cbc[signed_col] = cbc[signed_col].replace(np.NaN , 0)
    
    #1
    '''unknown_values = ['*', '@' , '    ']
    cbc = cbc.replace(unknown_values, np.NaN)
    
    #2
    new = pd.DataFrame()
    new['NULL'] = cbc.isnull().sum()
    new['percen']=new['NULL']/len(cbc) *100
    new['column'] = cbc.columns
    
    #3
    filter_df = new[new['percen']>70]
    # fil saves the name of column that are supposed to be kept on the basis of percentage ratio
    fil = list(filter_df['column'])'''
    
    
    # grouping (continuous) numeric data columns
    numeric_data = ['Q-Flag(Left Shift?)', 'Q-Flag(Atypical Lympho?)', 'Q-Flag(RBC Agglutination?)',
                 'Q-Flag(Turbidity/HGB Interf?)', 'Q-Flag(Iron Deficiency?)', 'Q-Flag(HGB Defect?)',
                 'Q-Flag(Fragments?)', 'Q-Flag(PLT Clumps?)', 'WBC(10^9/L)', 'RBC(10^12/L)',
                 'HGB(g/dL)', 'HCT(%)', 'MCV(fL)', 'MCH(pg)', 'MCHC(g/dL)', 'PLT(10^3/uL)',
                 'RDW-SD(fL)', 'RDW-CV(%)', 'PDW(fL)', 'MPV(fL)', 'P-LCR(%)', 'PCT(%)',
                 'NRBC#(10^3/uL)', 'NRBC%(%)', 'NEUT#(10^3/uL)', 'LYMPH#(10^3/uL)', 'MONO#(10^3/uL)',
                 'EO#(10^3/uL)', 'BASO#(10^3/uL)', 'NEUT%(%)', 'LYMPH%(%)', 'MONO%(%)', 'EO%(%)',
                 'BASO%(%)', 'IG#(10^3/uL)', 'IG%(%)', 'RET%(%)', 'RET#(10^9/L)', 'IRF(%)',
                 'LFR(%)', 'MFR(%)', 'HFR(%)', 'RET-He(pg)', 'IPF(%)', '[PLT-I(10^3/uL)]',
                 '[MicroR(%)]', '[MacroR(%)]', '[TNC(10^9/L)]', '[WBC-N(10^9/L)]', '[TNC-N(10^9/L)]',
                 '[BA-N#(10^3/uL)]', '[BA-N%(%)]', '[WBC-D(10^9/L)]', '[TNC-D(10^9/L)]',
                 '[NEUT#&(10^3/uL)]', '[NEUT%&(%)]', '[LYMP#&(10^3/uL)]', '[LYMP%&(%)]',
                 '[HFLC#(10^3/uL)]', '[HFLC%(%)]', '[BA-D#(10^3/uL)]', '[BA-D%(%)]', '[NE-SSC(ch)]',
                 '[NE-SFL(ch)]', '[NE-FSC(ch)]', '[LY-X(ch)]', '[LY-Y(ch)]', '[LY-Z(ch)]',
                 '[MO-X(ch)]', '[MO-Y(ch)]', '[MO-Z(ch)]', '[NE-WX]', '[NE-WY]', '[NE-WZ]',
                 '[LY-WX]', '[LY-WY]', '[LY-WZ]', '[MO-WX]', '[MO-WY]', '[MO-WZ]', '[WBC-P(10^9/L)]',
                 '[TNC-P(10^9/L)]', '[RBC-O(10^12/L)]', '[PLT-O(10^3/uL)]', '[RBC-He(pg)]',
                 '[Delta-He(pg)]', '[RET-Y(ch)]', '[RET-RBC-Y(ch)]', '[IRF-Y(ch)]', '[FRC#(10^12/L)]',
                 '[FRC%(%)]', '[HYPO-He(%)]', '[HYPER-He(%)]', '[RPI]', '[RET-UPP]', '[RET-TNC]',
                 '[PLT-F(10^3/uL)]', '[H-IPF(%)]', '[IPF#(10^3/uL)]', 'WBC-BF(10^3/uL)',
                 'RBC-BF(10^6/uL)', 'MN#(10^3/uL)', 'PMN#(10^3/uL)', 'MN%(%)', 'PMN%(%)',
                 'TC-BF#(10^3/uL)', '[HF-BF#(10^3/uL)]', '[HF-BF%(/100WBC)]', '[NE-BF#(10^3/uL)]',
                 '[NE-BF%(%)]', '[LY-BF#(10^3/uL)]', '[LY-BF%(%)]', '[MO-BF#(10^3/uL)]', '[MO-BF%(%)]',
                 '[EO-BF#(10^3/uL)]', '[EO-BF%(%)]', '[RBC-BF2(10^6/uL)]', 'HPC#(10^3/uL)',
                 '[HGB-O(g/dL)]', '[PLT-F2(10^3/uL)]', 'IP SUS(RBC)pRBC?', 'Q-Flag(pRBC?)',
                 '[Delta-HGB(g/dL)]', '[MCHC-O(g/dL)]', '[WBC(10^3/uL)]', '[RBC(10^6/uL)]',
                 '[RBC-I(10^6/uL)]', '[RBC-O(10^6/uL)]', '[NEUT#(10^3/uL)]', '[LYMPH#(10^3/uL)]',
                 '[MONO#(10^3/uL)]', '[EO#(10^3/uL)]', '[NEUT%(%)]', '[LYMPH%(%)]', '[MONO%(%)]',
                 '[EO%(%)]', '[MN#(10^3/uL)]', '[PMN#(10^3/uL)]', '[HF#(10^3/uL)]', '[MN%(%)]',
                 '[PMN%(%)]', '[HF%(/100WBC)]', '[TC#(10^3/uL)]', '[HPC%(%)]']
    unknown_values = ['ERROR' , '----' , '++++' , '*', '@' , '    ']
    cbc[numeric_data] = cbc[numeric_data].replace(unknown_values, np.NaN)
    #changing all dtypes to float so that mean can be applied
    #cbc = cbc.astype(float)
    
    # for filling null values in column that have some numeric values in it i.e. some mean can be generated 
    cbc[numeric_data] = cbc[numeric_data].fillna(value=cbc[numeric_data].mean())
    
    # for filling null values in column that have no numeric values in it i.e. no mean can be generated so replacing it by 0 
    cbc[numeric_data] = cbc[numeric_data].fillna(value=0)
    
    # scaling all parameters in order to have equal weightage in the model training
    numeric_df = pd.DataFrame(cbc[numeric_data])
    scaler = StandardScaler()
    standard_cbc = scaler.fit_transform(numeric_df)
    standard_cbc = pd.DataFrame(standard_cbc, columns=numeric_data)

    cbc[numeric_data] = standard_cbc

    '''
    #4
    # dropping col having null values > 70
    cbc = pd.DataFrame(cbc.drop(fil, axis=1))
    
    
    # omitting un-needed column from dataset  
    col_names = pd.Series(cbc.columns)
    omit_col_contining = ['TNC', '/M']
    filter_col = col_names[col_names.str.contains('|'.join(omit_col_contining))]
    keep_col = list(filter_col)
    cbc = cbc.drop(keep_col, axis=1)'''
    
    cbc = cbc[['HGB(g/dL)', 'PLT(10^3/uL)', 'Judgment', 'Positive(Diff.)',
       'Positive(Morph.)', 'Positive(Count)', 'RBC Abnormal', 'RBC Suspect',
       'PLT Abnormal', 'PLT Suspect', 'IP ABN(RBC)RBC Abn Distribution',
       'IP ABN(RBC)Dimorphic Population', 'IP ABN(RBC)Anisocytosis',
       'IP ABN(RBC)Microcytosis', 'IP ABN(RBC)Macrocytosis',
       'IP ABN(RBC)Hypochromia', 'IP ABN(RBC)Anemia',
       'IP ABN(RBC)Erythrocytosis', 'IP ABN(RBC)RET Abn Scattergram',
       'IP ABN(RBC)Reticulocytosis', 'IP ABN(PLT)PLT Abn Distribution',
       'IP ABN(PLT)Thrombocytopenia', 'IP ABN(PLT)Thrombocytosis',
       'IP ABN(PLT)PLT Abn Scattergram', 'IP SUS(RBC)RBC Agglutination?',
       'IP SUS(RBC)Turbidity/HGB Interf?', 'IP SUS(RBC)Iron Deficiency?',
       'IP SUS(RBC)HGB Defect?', 'IP SUS(RBC)Fragments?',
       'IP SUS(PLT)PLT Clumps?', 'Q-Flag(RBC Agglutination?)',
       'Q-Flag(Turbidity/HGB Interf?)', 'Q-Flag(Iron Deficiency?)',
       'Q-Flag(HGB Defect?)', 'Q-Flag(Fragments?)', 'Q-Flag(PLT Clumps?)',
       '[RBC(10^6/uL)]', 'HCT(%)', 'MCV(fL)', 'MCH(pg)', 'MCHC(g/dL)',
       'RDW-SD(fL)', 'RDW-CV(%)', 'PDW(fL)', 'MPV(fL)', 'P-LCR(%)', 'PCT(%)',
       'NRBC#(10^3/uL)', 'RET%(%)', 'RET#(10^9/L)', 'IRF(%)', 'LFR(%)',
       'MFR(%)', 'HFR(%)', 'RET-He(pg)', '[PLT-I(10^3/uL)]', '[MicroR(%)]',
       '[MacroR(%)]', '[RBC-O(10^6/uL)]', '[PLT-O(10^3/uL)]', '[RBC-He(pg)]',
       '[Delta-He(pg)]', '[RET-Y(ch)]', '[RET-RBC-Y(ch)]', '[IRF-Y(ch)]',
       '[FRC#(10^12/L)]']]
    
    #5
    # checking if dtype of all columns is in float64
    #print(cbc.dtypes)
    return cbc

In [3]:
# index_col = False --> not to use the first col as index
cbc = pd.read_csv("ResearchCBC_data_final.csv", index_col=False)
cbc

Unnamed: 0,Code,Nickname,Analyzer ID,Date,Time,Rack,Position,Sample No.,Sample Inf.,Order Type,...,[MN%(%)],[MN%/M],[PMN%(%)],[PMN%/M],[HF%(/100WBC)],[HF%/M],[TC#(10^3/uL)],[TC#/M],[HPC%(%)],[HPC%/M]
0,99,XN-1000-1-A,XN-20^11551,10/7/2017,10:28:14,1,2.0,NBD/1617/021437,M,Initial,...,,,,,,,,,,
1,99,XN-1000-1-A,XN-20^11551,9/1/2014,11:18:16,17,1.0,3D-5,M,Initial,...,,,,,,,,,,
2,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:49:23,1,1.0,ERR000000000001,M,Initial,...,,,,,,,,,,
3,99,XN-1000-1-A,XN-20^11551,6/29/2016,10:57:59,16,1.0,3D-143,M,Initial,...,,,,,,,,,,
4,99,XN-1000-1-A,XN-20^11551,12/11/2014,19:56:06,2,1.0,3D-69,M,Initial,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,1,XN-1000-1-A,XN-20^11551,1/13/2014,10:47:03,11,1.0,3D-62,M,Initial,...,,,,,,,,,,
1413,1,XN-1000-1-A,XN-20^11551,1/10/2014,10:11:38,9,1.0,3D-52,M,Initial,...,,,,,,,,,,
1414,1,XN-1000-1-A,XN-20^11551,1/8/2014,10:06:16,7,1.0,3D-40,M,Initial,...,,,,,,,,,,
1415,1,XN-1000-1-A,XN-20^11551,1/6/2014,11:38:28,19,2.0,3D-28,A,Initial,...,,,,,,,,,,


In [4]:
unique_val = cbc['Code'].unique()
unique_val


# Since 'Code'i.e. is prediction column is already in numeric category form therefor no need for below mapping

#cbc['Code'] = cbc['Code'].map({'HM': 0, 'SEP': 1, 'MDA':2, 'AA':3, 
#                                           'ITP':4, 'NM':5, 'MF+IDA':6, 'IDA':7, 
#                                           'MPY':8, 'MA':9, 'ITP+IDA':10 , 'CML-CP+IDA':11 , 
#                                           'CGD':12 , 'HCV':13 , 'ETS':14 , 'Hypersplenism':15 , 
#                                           'PV':16 , 'PRCA':17 , 'ACD':18 , 'PCA':19 , 'CDA':20 , 
#                                           'GA' :21 , 'Extremly Increased Iron deposition':22})


array([99, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11,
       10,  9,  8,  7,  6,  5,  4,  3,  2,  1], dtype=int64)

In [5]:
# store the feature matrix (X) and response vector (y)
X = cbc.iloc[:, 1:]
y = cbc.iloc[:, 0]

In [6]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [7]:
# rearranging indexes of train_sets and test_set in order to do standardization on X sets
# there is a glitch that happens if indexes are not rearranged
# i.e. train set has 68 column which are selected randomly from the entire cbc dataset
# when strandardization is done on this train set it changes all records to NaN vallue if it has a index number > 68

# drop = True, drops the additional column of original indexes from dataframe that is made by default
# by default drop = False
X_train = X_train.reset_index(drop=True)

# reseting indexes of y_train, X_test and y_test as well
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


In [8]:
# note that an additional column of orginal indexes
X_train

Unnamed: 0,Nickname,Analyzer ID,Date,Time,Rack,Position,Sample No.,Sample Inf.,Order Type,Reception Date,...,[MN%(%)],[MN%/M],[PMN%(%)],[PMN%/M],[HF%(/100WBC)],[HF%/M],[TC#(10^3/uL)],[TC#/M],[HPC%(%)],[HPC%/M]
0,XN-1000-1-A,XN-20^11551,5/14/2014,9:17:24,5,1.0,3D-81,M,Initial,5/14/2014 9:15,...,,,,,,,,,,
1,XN-1000-1-A,XN-20^11551,6/23/2014,14:06:27,33,1.0,3D-138,M,Initial,6/23/2014 14:04,...,,,,,,,,,,
2,XN-1000-1-A,XN-20^11551,7/2/2014,12:27:34,16,1.0,3D-06,M,Initial,7/2/2014 12:25,...,,,,,,,,,,
3,XN-1000-1-A,XN-20^11551,7/14/2014,12:32:22,16,1.0,3D-72R,M,Initial,7/14/2014 12:30,...,,,,,,,,,,
4,XN-1000-1-A,XN-20^11551,10/17/2014,11:46:42,20,2.0,3D-56,A,Initial,10/17/2014 11:43,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128,XN-1000-1-A,XN-20^11551,8/20/2014,11:08:31,16,2.0,3D-123,A,Initial,8/20/2014 11:05,...,,,,,,,,,,
1129,XN-1000-1-A,XN-20^11551,5/26/2014,10:41:10,9,1.0,3D-171,M,Initial,5/26/2014 10:39,...,,,,,,,,,,
1130,XN-1000-1-A,XN-20^11551,8/21/2014,15:20:23,36,1.0,3D-134,M,Initial,8/21/2014 15:18,...,,,,,,,,,,
1131,XN-1000-1-A,XN-20^11551,2/12/2014,11:37:30,17,2.0,3D-54,A,Initial,2/12/2014 11:34,...,,,,,,,,,,


In [9]:
X_train = processing_df(X_train)
X_train

  cbc[numeric_data] = cbc[numeric_data].fillna(value=cbc[numeric_data].mean())


Unnamed: 0,HGB(g/dL),PLT(10^3/uL),Judgment,Positive(Diff.),Positive(Morph.),Positive(Count),RBC Abnormal,RBC Suspect,PLT Abnormal,PLT Suspect,...,[MicroR(%)],[MacroR(%)],[RBC-O(10^6/uL)],[PLT-O(10^3/uL)],[RBC-He(pg)],[Delta-He(pg)],[RET-Y(ch)],[RET-RBC-Y(ch)],[IRF-Y(ch)],[FRC#(10^12/L)]
0,-1.430595,0.345034,1,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.511532,-0.642334,0.0,0.371304,-2.422124,-0.793992,-2.660615,-2.879554,-2.130032,-0.412357
1,0.190020,0.049073,1,0.0,1.0,0.0,1.0,1.0,1.0,0.0,...,1.255696,-0.461298,0.0,0.087205,-1.230566,-0.658071,-1.190666,-1.153083,-0.836109,1.082535
2,-0.121637,-0.940864,1,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,-0.755877,3.259998,0.0,-0.933623,1.412527,1.550644,1.429046,1.217083,1.019821,-0.394919
3,-0.526791,-1.048023,1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,-0.710729,-0.380838,0.0,-1.054004,0.762586,1.618604,1.130690,0.735377,0.869516,-0.702458
4,-0.495625,0.294006,1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.750860,-0.441183,0.0,0.217217,0.805915,-0.997873,0.210152,0.769480,0.238892,-0.757942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128,1.311984,-0.619390,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.806041,-0.280262,0.0,-0.673601,1.174215,0.463277,0.966958,1.050831,1.000216,-0.757942
1129,-1.711086,0.355239,1,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,2.053302,-0.763025,0.0,0.424271,-2.292136,-0.760011,-2.449583,-2.670672,-2.221522,-0.640633
1130,-0.651453,-1.002098,1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,-0.775942,0.846184,0.0,-1.001036,1.087556,-0.080407,0.737733,0.978361,0.761690,-0.708799
1131,-0.028140,0.345034,1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.412942,-0.320493,0.0,0.327967,-0.840602,-0.488170,-0.750409,-0.709743,-0.617188,1.681760


In [10]:
## checking again for null values in "cbc" dataframe
np.array(X_train.isna().sum(axis=0))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

# finding correlation between columns 
corr_matrix = X_train.corr()
# finding correlation wrt 'Conclusion' column
corr_matrix[y_train].sort_values(ascending=False)

In [11]:
X_test = processing_df(X_test)
X_test

  cbc[numeric_data] = cbc[numeric_data].fillna(value=cbc[numeric_data].mean())


Unnamed: 0,HGB(g/dL),PLT(10^3/uL),Judgment,Positive(Diff.),Positive(Morph.),Positive(Count),RBC Abnormal,RBC Suspect,PLT Abnormal,PLT Suspect,...,[MicroR(%)],[MacroR(%)],[RBC-O(10^6/uL)],[PLT-O(10^3/uL)],[RBC-He(pg)],[Delta-He(pg)],[RET-Y(ch)],[RET-RBC-Y(ch)],[IRF-Y(ch)],[FRC#(10^12/L)]
0,0.999961,0.243340,1,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,-0.243155,-0.087996,0.0,0.317078,0.490426,0.923544,0.709074,0.528246,0.867147,-0.413621
1,1.090394,-0.047777,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.786982,-0.147169,0.0,-0.059626,0.727005,0.159073,0.590958,0.712945,0.836661,-0.639427
2,0.276499,-0.750253,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,-0.630203,-0.265515,0.0,-0.729322,0.404397,-1.260660,-0.142793,0.452687,0.013536,-0.221821
3,-0.416819,0.977459,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.164766,-0.285239,0.0,0.998734,-0.756994,0.159073,-0.389763,-0.626123,-0.248644,1.054121
4,-0.567540,-1.212242,1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,-0.365639,-0.226066,0.0,-1.201697,0.597962,-1.187853,0.998996,0.608002,-4.339881,-0.639427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,0.517653,-0.642667,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,-0.777183,0.128972,0.0,-0.657569,1.092628,-0.714609,0.805715,0.981598,0.708620,-0.639427
280,1.391837,-0.611023,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,-0.752686,-0.107720,0.0,-0.609734,0.791527,-0.168558,0.515794,0.754922,0.504363,-0.639427
281,0.035345,-0.522423,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.390135,-0.226066,0.0,-0.561898,0.210831,0.741527,0.483580,0.297372,0.516557,-0.247667
282,-0.537396,-1.035041,1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,-0.581210,0.207870,0.0,-0.998397,0.339875,2.270469,1.049106,0.402315,0.763495,-0.472113


## MODEL TRAINING

X_train.to_csv("C:\\Users\\User\\Desktop\\copied folder\\ModelTraning (different synthetic data)\\temporary\\train_cbc.csv", index=False)
X_test.to_csv("C:\\Users\\User\\Desktop\\copied folder\\ModelTraning (different synthetic data)\\temporary\\test_cbc.csv", index=False)


In [12]:
# commonly used library for all models
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

### APPLYING NAIVE BAYES

In [13]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [14]:
# making predictions on the testing set
y_pred = gnb.predict(X_test)

In [15]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("Gaussian Naive Bayes model")
print("Accuracy(in %):", accuracy_score(y_test, y_pred)*100)
print("F1 score(in %):", f1_score(y_test, y_pred, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, y_pred, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, y_pred, average="macro")*100)

Gaussian Naive Bayes model
Accuracy(in %): 10.211267605633804
F1 score(in %): 4.535324711024533
Precision score(in %): 4.3776782704993025
Recall score(in %): 9.94918125352908


  _warn_prf(average, modifier, msg_start, len(result))


### APPLYING LOGISTIC REGRESSION

In [None]:
#Training the model
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)

In [None]:
# making predictions on the testing set
y_pred = LR.predict(X_test)
#Finding the confusion matrix
from sklearn.metrics import mean_squared_error, confusion_matrix
MSE = mean_squared_error(y_test, y_pred)
print(MSE)

In [None]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("Logestic model")
print("Accuracy(in %):", accuracy_score(y_test, y_pred)*100)
print("F1 score(in %):", f1_score(y_test, y_pred, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, y_pred, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, y_pred, average="macro")*100)

### APPLYING SUPPORT VECTOR MACHINE CLASSIFIER

In [None]:
# PART 1
from sklearn.svm import SVC
model = SVC(kernel='rbf')
model.fit(X_train, y_train)

In [None]:
# making predictions on the testing set
predictions = model.predict(X_test)

In [None]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("Support vector machine classifier model (1):")
print("Accuracy(in %):", accuracy_score(y_test, predictions)*100)
print("F1 score(in %):", f1_score(y_test, predictions, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, predictions, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, predictions, average="macro")*100)

In [None]:
#PART 2

# Alter SVM with some parameters
model_poly = SVC(kernel='poly', degree=7, gamma=2)
model_poly.fit(X_train, y_train)

In [None]:
# making predictions on the testing set
y_pred_poly = model_poly.predict(X_test)

In [None]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("Support vector machine classifier model (2):")
print("Accuracy(in %):", accuracy_score(y_test, y_pred_poly)*100)
print("F1 score(in %):", f1_score(y_test, y_pred_poly, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, y_pred_poly, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, y_pred_poly, average="macro")*100)

In [None]:
# PART 3

# Alter SVM with some parameters
model_linear = SVC(kernel='linear')
model_linear.fit(X_train, y_train)
#print("Accuracy = ", (round((accuracy_score(y_test, predictions_linear))*100)))

In [None]:
# making predictions on the testing set
predictions_linear = model_linear.predict(X_test)

In [None]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("Support vector machine classifier model (3):")
print("Accuracy(in %):", accuracy_score(y_test, predictions_linear)*100)
print("F1 score(in %):", f1_score(y_test, predictions_linear, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, predictions_linear, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, y_pred_poly, average="macro")*100)

In [None]:
# PART 4

# Alter SVM with some parameters
model_sigmoid = SVC(kernel='sigmoid')
model_sigmoid.fit(X_train, y_train)

In [None]:
# making predictions on the testing set
predictions_sig = model_linear.predict(X_test)

In [None]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("Support vector machine classifier model (4):")
print("Accuracy(in %):", accuracy_score(y_test, predictions_sig)*100)
print("F1 score(in %):", f1_score(y_test, predictions_sig, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, predictions_sig, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, predictions_sig, average="macro")*100)

### APPLYING EXTRA TREE REGRESSOR FOR IMPORTANT FEATURE SELECTION

In [None]:
# Feature selection by ExtraTreesRegressor(model based)
from sklearn.ensemble import ExtraTreesRegressor
reg= ExtraTreesRegressor()
reg.fit(X_train, y_train)

reg.feature_importances_

In [None]:
import matplotlib.pyplot as plt
feat_importances = pd.Series(reg.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

In [None]:
# Creating heat map for correlation study
import seaborn as sns
corr = cbc.corr()
sns.heatmap(corr, xticklabels=corr.columns.values,yticklabels=corr.columns.values)
plt.show()

### APPLYING DECISION TREE CLASSIFIER

In [None]:
from sklearn.tree import DecisionTreeClassifier
decision_model=DecisionTreeClassifier()
# fit independent varaibles to the dependent variables
decision_model.fit(X_train,y_train)

In [None]:
# making predictions on the testing set
y_pred = decision_model.predict(X_test)

In [None]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("Decision Tree Classifier model (4):")
print("Accuracy(in %):", accuracy_score(y_test, y_pred)*100)
print("F1 score(in %):", f1_score(y_test, y_pred, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, y_pred, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, y_pred, average="macro")*100)

### APPLYING K-NEAREST NEIGHBOR(KNN)

In [None]:
# KNN Classifier
#The KNN algorithm doesn't work well with high dimensional data because with large number of dimensions, 
#it becomes difficult for the algorithm to calculate the distance in each dimension.
N=len(X_train)
from math import sqrt
k = int(sqrt(N/2))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=k)
knn_model = model.fit(X_train, y_train)

In [None]:
# making predictions on the testing set
y_pred = knn_model.predict(X_test)

In [None]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("KNN model")
print("Accuracy(in %):", accuracy_score(y_test, y_pred)*100)
print("F1 score(in %):", f1_score(y_test, y_pred, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, y_pred, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, y_pred, average="macro")*100)

# RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier
rn_model = RandomForestClassifier(n_estimators = 100, random_state = 1)
rn_model.fit(X_train, y_train)

In [None]:
# making predictions on the testing set
y_pred = rn_model.predict(X_test)

In [None]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("Random Forest Classifier model")
print("Accuracy(in %):", accuracy_score(y_test, y_pred)*100)
print("F1 score(in %):", f1_score(y_test, y_pred, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, y_pred, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, y_pred, average="macro")*100)

In [None]:
# for creating model file to be used in flask app
#pickle.dump(rn_model, open('rn_model.pkl', 'wb'))

# STOCHASTIC GRADIENT DESCENT

In [None]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="log", max_iter=20, penalty="elasticnet", random_state=1)
clf.fit(X_train, y_train)

In [None]:
# making predictions on the testing set
y_pred = clf.predict(X_test)

In [None]:
#comparing actual response values (y_test) with predicted response values (y_pred)
print("Stochastic Gradient Classifier model")
print("Accuracy(in %):", accuracy_score(y_test, y_pred)*100)
print("F1 score(in %):", f1_score(y_test, y_pred, average="macro")*100)
print("Precision score(in %):", precision_score(y_test, y_pred, average="macro")*100)
print("Recall score(in %):", recall_score(y_test, y_pred, average="macro")*100)

# Artificial Neural Network

In [None]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(keras.Input(shape=(66,)))
classifier.add(Dense(200, activation="relu"))

# Adding the second hidden layer
classifier.add(Dense(200, activation="relu"))

# Adding the third hidden layer
classifier.add(Dense(200, activation="relu"))

# Adding the forth hidden layer
classifier.add(Dense(64, activation="relu"))

# Adding the fifth hidden layer
classifier.add(Dense(32, activation="relu"))

# Adding the sixth hidden layer
classifier.add(Dense(20, activation="relu"))


# Adding the output layer
classifier.add(Dense(1, activation="sigmoid"))

In [None]:
# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)