In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
#loading the datset
train=pd.read_csv('train_genetic_disorders.csv')
test=pd.read_csv('test_genetic_disorders.csv')
data_train=train.copy()
data_test=test.copy()

In [3]:
#check all null rows
data_train[data_train.isnull().all(1)].shape
print("Null rows_train:",data_train[data_train.isnull().all(1)].shape[0])

Null rows_train: 1072


In [4]:
#check all null rows
data_test[data_test.isnull().all(1)].shape
print("Null rows_test:",data_test[data_test.isnull().all(1)].shape[0])

Null rows_test: 173


In [5]:
#subset where no all rows being null
data_fea_train=data_train[data_train.isnull().all(1)!=True]

In [6]:
#shape of train after removing null rows
data_fea_train.shape

(21011, 45)

In [7]:
#subset where no all rows being null
data_fea_test=data_test[data_test.isnull().all(1)!=True]

In [8]:
# Dropping the features
data_fea_train=data_fea_train.drop(columns=['Patient Id','Patient First Name','Family Name','Father\'s name','Institute Name','Location of Institute','Test 1','Test 2','Test 3','Test 4',
'Test 5','Parental consent'])
data_fea_test=data_fea_test.drop(columns=['Patient Id','Patient First Name','Family Name','Father\'s name','Institute Name','Location of Institute','Test 1','Test 2','Test 3','Test 4',
'Test 5','Parental consent'])

In [9]:
print(data_fea_train.shape,data_fea_test.shape)

(21011, 33) (9290, 31)


In [10]:
# renaming the columns
data_fea_train=data_fea_train.rename(columns={"Genes in mother's side":'defective_mother',
                    'Inherited from father':'defective_father',
                    'Maternal gene':'maternal_gene','Paternal gene':'paternal_gene',
                    'Respiratory Rate (breaths/min)':'respiratory_rate','Heart Rate (rates/min':'heart_rate',
                    'Parental consent':'parental_consent','Follow-up':'follow_up','Birth asphyxia':'birth_asphyxia',
                    'Autopsy shows birth defect (if applicable)':'birth_defect_autopsy','Place of birth':'birth_place',
                    'Folic acid details (peri-conceptional)':'folic_acid_periconceptional',
                    'H/O serious maternal illness':'maternal_illness','H/O radiation exposure (x-ray)':'radiation_exposure',
                    'H/O substance abuse':'substance_abuse','Assisted conception IVF/ART':'assisted_conception',
                    'History of anomalies in previous pregnancies':'previous_pregnancy_anomalies',
                    'Birth defects':'birth_defects','Blood test result':'blood_test_result','Genetic Disorder':'genetic_disorder',
                    'Disorder Subclass':'disorder_subclass','Patient Age':'patient_age','Blood cell count (mcL)':'blood_cell_count',
                    "Mother's age":'mother_age',"Father's age":'father_age','No. of previous abortion':'num_previous_abortion',
                    'White Blood cell count (thousand per microliter)':'WBC_count'})

In [11]:
data_fea_test=data_fea_test.rename(columns={"Genes in mother's side":'defective_mother',
                    'Inherited from father':'defective_father',
                    'Maternal gene':'maternal_gene','Paternal gene':'paternal_gene',
                    'Respiratory Rate (breaths/min)':'respiratory_rate','Heart Rate (rates/min':'heart_rate',
                    'Parental consent':'parental_consent','Follow-up':'follow_up','Birth asphyxia':'birth_asphyxia',
                    'Autopsy shows birth defect (if applicable)':'birth_defect_autopsy','Place of birth':'birth_place',
                    'Folic acid details (peri-conceptional)':'folic_acid_periconceptional',
                    'H/O serious maternal illness':'maternal_illness','H/O radiation exposure (x-ray)':'radiation_exposure',
                    'H/O substance abuse':'substance_abuse','Assisted conception IVF/ART':'assisted_conception',
                    'History of anomalies in previous pregnancies':'previous_pregnancy_anomalies',
                    'Birth defects':'birth_defects','Blood test result':'blood_test_result','Genetic Disorder':'genetic_disorder',
                    'Disorder Subclass':'disorder_subclass','Patient Age':'patient_age','Blood cell count (mcL)':'blood_cell_count',
                    "Mother's age":'mother_age',"Father's age":'father_age','No. of previous abortion':'num_previous_abortion',
                    'White Blood cell count (thousand per microliter)':'WBC_count'})

In [12]:
# missing target variables
data_fea_train.iloc[:,-2].isnull().sum(),data_fea_train.iloc[:,-1].isnull().sum()

(2049, 2068)

In [13]:
# removing rows of missing target variables
data_fea_train=data_fea_train[(data_fea_train['genetic_disorder'].isnull()!=True)&(data_fea_train['disorder_subclass'].isnull()!=True)]

In [14]:
data_fea_train.shape

(17160, 33)

In [15]:
#Subsetting
X=data_fea_train.iloc[:,:-2]
y1=data_fea_train.iloc[:,-2]
y2=data_fea_train.iloc[:,-1]

In [16]:
# shape of features,target variables
X.shape,y1.shape,y2.shape

((17160, 31), (17160,), (17160,))

In [17]:
# test data
X_test=data_fea_test

In [18]:
#converting dissimilar datatype to one
for i in X_test.columns:
    if X_test[i].dtype!=X[i].dtype:
        X_test[i]=X_test[i].astype(X[i].dtype.name)

In [19]:
# Data Cleaning
X_test=X_test.replace('-99',np.nan)
# Cleaning_data
# replace '-' with other values
X['radiation_exposure']=X['radiation_exposure'].replace('-','others')
X['substance_abuse']=X['substance_abuse'].replace('-','others')
# Cleaning_data
X_test['radiation_exposure']=X_test['radiation_exposure'].replace('-','others')
X_test['substance_abuse']=X_test['substance_abuse'].replace('-','others')

In [20]:
# Data Cleaning
X_test['WBC_count']=X_test['WBC_count'].mask(X_test['WBC_count']<0,np.nan)
X_test['num_previous_abortion']=X_test['num_previous_abortion'].mask(X_test['num_previous_abortion']<0,np.nan)

In [21]:
#Splitting the data
X_train1,X_val1,y_train1,y_val1= train_test_split(X,y1,stratify=y1,test_size=0.20)
X_train2,X_val2,y_train2,y_val2= train_test_split(X,y2,stratify=y2,test_size=0.20)

In [22]:
# shape of train,validation set
print(X_train1.shape,X_val1.shape,y_train1.shape,y_val1.shape)
print(X_train2.shape,X_val2.shape,y_train2.shape,y_val2.shape)

(13728, 31) (3432, 31) (13728,) (3432,)
(13728, 31) (3432, 31) (13728,) (3432,)


In [23]:
# Missing value imputation
from sklearn.impute import SimpleImputer
imp_mode=SimpleImputer(strategy='most_frequent')
imp_mode_num=SimpleImputer(strategy='most_frequent')
imp_median=SimpleImputer(strategy='median')

In [24]:
pd.options.mode.chained_assignment = None  

In [25]:
# missing value imputation
for i in X.columns:
    if (X[i].dtype.name!='object')&(X[i].nunique()<=3):
        imp_mode_num.fit(np.array(X_train1[i]).reshape(-1,1))
        X_train1[i]=imp_mode_num.transform(np.array(X_train1[i]).reshape(-1,1))
        X_val1[i]=imp_mode_num.transform(np.array(X_val1[i]).reshape(-1,1))
        X_test[i]=imp_mode_num.transform(np.array(X_test[i]).reshape(-1,1))
    elif (X[i].dtype.name!='object')&(X[i].nunique()>3):
        imp_median.fit(np.array(X_train1[i]).reshape(-1,1))
        X_train1[i]=imp_median.transform(np.array(X_train1[i]).reshape(-1,1))
        X_val1[i]=imp_median.transform(np.array(X_val1[i]).reshape(-1,1))
        X_test[i]=imp_median.transform(np.array(X_test[i]).reshape(-1,1))
    else:
        imp_mode.fit(np.array(X_train1[i]).reshape(-1, 1))
        X_train1[i] = imp_mode.transform(np.array(X_train1[i]).reshape(-1, 1))[:, 0]
        X_val1[i] = imp_mode.transform(np.array(X_val1[i]).reshape(-1, 1))[:, 0]
        X_test[i] = imp_mode.transform(np.array(X_test[i]).reshape(-1, 1))[:, 0]


In [26]:
# checking null values
X_train1.isnull().sum()

patient_age                     0
defective_mother                0
defective_father                0
maternal_gene                   0
paternal_gene                   0
blood_cell_count                0
mother_age                      0
father_age                      0
Status                          0
respiratory_rate                0
heart_rate                      0
follow_up                       0
Gender                          0
birth_asphyxia                  0
birth_defect_autopsy            0
birth_place                     0
folic_acid_periconceptional     0
maternal_illness                0
radiation_exposure              0
substance_abuse                 0
assisted_conception             0
previous_pregnancy_anomalies    0
num_previous_abortion           0
birth_defects                   0
WBC_count                       0
blood_test_result               0
Symptom 1                       0
Symptom 2                       0
Symptom 3                       0
Symptom 4     

In [27]:
X_val1.isnull().sum()

patient_age                     0
defective_mother                0
defective_father                0
maternal_gene                   0
paternal_gene                   0
blood_cell_count                0
mother_age                      0
father_age                      0
Status                          0
respiratory_rate                0
heart_rate                      0
follow_up                       0
Gender                          0
birth_asphyxia                  0
birth_defect_autopsy            0
birth_place                     0
folic_acid_periconceptional     0
maternal_illness                0
radiation_exposure              0
substance_abuse                 0
assisted_conception             0
previous_pregnancy_anomalies    0
num_previous_abortion           0
birth_defects                   0
WBC_count                       0
blood_test_result               0
Symptom 1                       0
Symptom 2                       0
Symptom 3                       0
Symptom 4     

In [28]:
X_test.isnull().sum()


patient_age                     0
defective_mother                0
defective_father                0
maternal_gene                   0
paternal_gene                   0
blood_cell_count                0
mother_age                      0
father_age                      0
Status                          0
respiratory_rate                0
heart_rate                      0
follow_up                       0
Gender                          0
birth_asphyxia                  0
birth_defect_autopsy            0
birth_place                     0
folic_acid_periconceptional     0
maternal_illness                0
radiation_exposure              0
substance_abuse                 0
assisted_conception             0
previous_pregnancy_anomalies    0
num_previous_abortion           0
birth_defects                   0
WBC_count                       0
blood_test_result               0
Symptom 1                       0
Symptom 2                       0
Symptom 3                       0
Symptom 4     

In [29]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler
ord_enc=OrdinalEncoder()
ohe_enc = OneHotEncoder()  # Set sparse=False to get a dense array
min_max=MinMaxScaler()

In [30]:
# reset index
X_train1.reset_index(inplace=True)
X_val1.reset_index(inplace=True)


In [31]:
# Encoding the features
for i in X.columns:
    if X[i].dtype.name == 'object':
        if i in X and X[i].nunique() <= 2:
            ord_enc.fit(np.array(X_train1[i]).reshape(-1, 1))
            X_train1.loc[:, i] = ord_enc.transform(np.array(X_train1[i]).reshape(-1, 1))
            X_val1.loc[:, i] = ord_enc.transform(np.array(X_val1[i]).reshape(-1, 1))
            X_test.loc[:, i] = ord_enc.transform(np.array(X_test[i]).reshape(-1, 1))
        else:
            # Perform one-hot encoding using pd.get_dummies()
            encoded_features_tr1 = pd.get_dummies(X_train1[i], prefix=i, drop_first=True)
            encoded_features_va1 = pd.get_dummies(X_val1[i], prefix=i, drop_first=True)
            encoded_features_test = pd.get_dummies(X_test[i], prefix=i, drop_first=True)
            
            # Drop the original categorical column before concatenating
            X_train1.drop(columns=[i], inplace=True)
            X_val1.drop(columns=[i], inplace=True)
            X_test.drop(columns=[i], inplace=True)
            
            # Concatenate the encoded features with the main DataFrames
            X_train1 = pd.concat([X_train1, encoded_features_tr1], axis=1)
            X_val1 = pd.concat([X_val1, encoded_features_va1], axis=1)
            X_test = pd.concat([X_test, encoded_features_test], axis=1)

In [32]:
# shape of the train,test,val
X_train1.shape,X_val1.shape,X_test.shape

((13728, 42), (3432, 42), (9290, 41))

In [33]:
X_train1.drop(columns='index',inplace=True)
X_val1.drop(columns='index',inplace=True)

In [34]:
from sklearn.preprocessing import MinMaxScaler
min_max=MinMaxScaler()
X2=min_max.fit_transform(X_train1)

In [35]:
# normalised minmax
X2=pd.DataFrame(X2,columns=X_train1.columns)

In [36]:
#normalised val1
X2_val=min_max.transform(X_val1)
X2_val=pd.DataFrame(X2_val,columns=X_val1.columns)

In [37]:
#normalised test
X2_test=min_max.transform(X_test)
X2_test=pd.DataFrame(X2_test,columns=X_test.columns)

In [38]:
# enoding the target variables1
lab_enc1=LabelEncoder()
y1_en=lab_enc1.fit_transform(y_train1)
y1_en_val=lab_enc1.transform(y_val1)
# printing encoded targets
np.unique(y1_en),np.unique(y1_en_val)
# enoding the target variables2
lab_enc2=LabelEncoder()
y2_en=lab_enc2.fit_transform(y_train2)
y2_en_val=lab_enc2.transform(y_val2)
# printing encoded targets
np.unique(y2_en),np.unique(y2_en_val)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([0, 1, 2, 3, 4, 5, 6, 7, 8]))

In [39]:
from imblearn.over_sampling import BorderlineSMOTE
sm = BorderlineSMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X2, pd.DataFrame(y1_en))
print(f'''shape of X before SMOTE: {X2.shape} 
shape of X after SMOTE: {X_sm.shape}''')
print('balanced class (%):')
y_sm.value_counts(normalize=True) * 100

shape of X before SMOTE: (13728, 41) 
shape of X after SMOTE: (21039, 41)
balanced class (%):


0    33.333333
1    33.333333
2    33.333333
Name: proportion, dtype: float64

In [40]:
X_sm.head(2)

Unnamed: 0,patient_age,defective_mother,defective_father,maternal_gene,paternal_gene,blood_cell_count,mother_age,father_age,Status,respiratory_rate,...,birth_defect_autopsy_Yes,radiation_exposure_Not applicable,radiation_exposure_Yes,radiation_exposure_others,substance_abuse_Not applicable,substance_abuse_Yes,substance_abuse_others,blood_test_result_inconclusive,blood_test_result_normal,blood_test_result_slightly abnormal
0,0.714286,1.0,0.0,0.0,0.0,0.671194,0.939394,0.590909,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.142857,1.0,1.0,1.0,1.0,0.467503,0.242424,0.795455,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [41]:
y1_enco=np.array(y_sm).ravel()

In [42]:
sel1=SelectKBest(chi2, k=25).fit(X_sm,y1_enco)

In [43]:
cols=sel1.get_support(indices=True)
print(X_sm.iloc[:,cols].shape)
result_kbest_20=X_sm.iloc[:,cols]

(21039, 25)


In [44]:
sele_fea= X2.columns[(sel1.get_support())]
print(sele_fea)

Index(['defective_mother', 'defective_father', 'maternal_gene',
       'paternal_gene', 'Status', 'respiratory_rate', 'follow_up',
       'folic_acid_periconceptional', 'previous_pregnancy_anomalies',
       'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5',
       'Gender_Male', 'birth_asphyxia_No record',
       'birth_asphyxia_Not available', 'birth_defect_autopsy_Yes',
       'radiation_exposure_Not applicable', 'radiation_exposure_others',
       'substance_abuse_Not applicable', 'substance_abuse_Yes',
       'blood_test_result_inconclusive', 'blood_test_result_normal',
       'blood_test_result_slightly abnormal'],
      dtype='object')


In [45]:
print(X2_val.iloc[:,cols].shape)
result_kbest_val=X2_val.iloc[:,cols]

(3432, 25)


In [46]:
print(X2_test.iloc[:,cols].shape)
result_kbest_test20=X2_test.iloc[:,cols]

(9290, 25)


In [54]:
rfc=RandomForestClassifier(random_state=42)
params1={'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60],
 'max_features': ['sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
random_rfc=RandomizedSearchCV(rfc,param_distributions=params1,verbose=10,n_jobs=-1,random_state=42)
random_rfc.fit(result_kbest_20,y1_enco)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [55]:
print(random_rfc.best_estimator_)

RandomForestClassifier(max_depth=50, min_samples_split=5, n_estimators=1200,
                       random_state=42)


In [56]:
print(random_rfc.best_score_)

0.7061679746141005


In [57]:
rfc = RandomForestClassifier(n_estimators=1800,max_depth=20,max_features='sqrt',bootstrap=False, min_samples_leaf=2, min_samples_split=10,random_state=42)
rfc.fit(result_kbest_20,y1_enco)
cal_clf = CalibratedClassifierCV(rfc, method="sigmoid")
cal_clf.fit(result_kbest_20,y1_enco)
predict_y =cal_clf .predict(result_kbest_20)
print ('The train f1_macro is:',f1_score(y1_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_val)
print('The cross validation f1_macro is:',f1_score(y1_en_val, predict_y,average='macro'))

The train f1_macro is: 0.9185097024913738
The cross validation f1_macro is: 0.5140010087370345


In [58]:
smd=BorderlineSMOTE(random_state=42)
X_smd, y_smd = smd.fit_resample(X2, pd.DataFrame(y2_en))
print(f'''shape of X before SMOTE: {X2.shape} 
shape of X after SMOTE: {X_smd.shape}''')
print('balanced class (%):')
y_smd.value_counts(normalize=True) * 100

shape of X before SMOTE: (13728, 41) 
shape of X after SMOTE: (31923, 41)
balanced class (%):


0    11.111111
1    11.111111
2    11.111111
3    11.111111
4    11.111111
5    11.111111
6    11.111111
7    11.111111
8    11.111111
Name: proportion, dtype: float64

In [59]:
X_smd.head(2)

Unnamed: 0,patient_age,defective_mother,defective_father,maternal_gene,paternal_gene,blood_cell_count,mother_age,father_age,Status,respiratory_rate,...,birth_defect_autopsy_Yes,radiation_exposure_Not applicable,radiation_exposure_Yes,radiation_exposure_others,substance_abuse_Not applicable,substance_abuse_Yes,substance_abuse_others,blood_test_result_inconclusive,blood_test_result_normal,blood_test_result_slightly abnormal
0,0.714286,1.0,0.0,0.0,0.0,0.671194,0.939394,0.590909,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.142857,1.0,1.0,1.0,1.0,0.467503,0.242424,0.795455,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [60]:
y2_enco=np.array(y_smd).ravel()

In [61]:
# feature selection 
sel2=SelectKBest(chi2, k=25).fit(X_smd,y2_enco)

In [62]:
cols=sel2.get_support(indices=True)
print(X_smd.iloc[:,cols].shape)
result_kbest_20d=X_smd.iloc[:,cols]

(31923, 25)


In [63]:
sele_fead= X2.columns[(sel2.get_support())]
print(sele_fead)

Index(['defective_mother', 'maternal_gene', 'paternal_gene', 'Status',
       'respiratory_rate', 'follow_up', 'maternal_illness',
       'assisted_conception', 'previous_pregnancy_anomalies', 'birth_defects',
       'WBC_count', 'Symptom 1', 'Symptom 4', 'Symptom 5', 'Gender_Female',
       'Gender_Male', 'birth_asphyxia_No record',
       'birth_asphyxia_Not available', 'birth_asphyxia_Yes',
       'birth_defect_autopsy_Not applicable', 'birth_defect_autopsy_Yes',
       'substance_abuse_Not applicable', 'substance_abuse_others',
       'blood_test_result_inconclusive',
       'blood_test_result_slightly abnormal'],
      dtype='object')


In [64]:
print(X2_val.iloc[:,cols].shape)
result_kbest_vald=X2_val.iloc[:,cols]

(3432, 25)


In [65]:
print(X2_test.iloc[:,cols].shape)
result_kbest_test20d=X2_test.iloc[:,cols]

(9290, 25)


In [66]:
rfc1=r_cfl=RandomForestClassifier(random_state=42)
params1={'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60],
 'max_features': ['sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10,15],
 'n_estimators': [200, 400, 500, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
random_rfc1=RandomizedSearchCV(rfc1,param_distributions=params1,n_jobs=-1,random_state=42)
random_rfc1.fit(result_kbest_20d,y2_enco)

  warn(


In [67]:
print(random_rfc1.best_estimator_)

RandomForestClassifier(bootstrap=False, max_depth=30, max_features='auto',
                       min_samples_leaf=2, min_samples_split=5,
                       n_estimators=500, random_state=42)


In [68]:
print(random_rfc1.best_score_)

0.6579642479338518


In [69]:
rfc1= RandomForestClassifier(n_estimators=500,max_depth=30,min_samples_leaf=2,min_samples_split=5,bootstrap=False,random_state=42)
rfc1.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(rfc1, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)
predict_y =cal_clf .predict(result_kbest_20d)
print ('The train f1_macro is:',f1_score(y2_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_vald)
print('The cross validation f1_macro is:',f1_score(y2_en_val, predict_y,average='macro'))

The train f1_macro is: 0.99645691450542
The cross validation f1_macro is: 0.10168967221045688
