In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings(action='ignore') # to not show convergence warning

In [2]:
data = pd.read_csv("Datasets\Health\data-ori.csv")

In [3]:
data.head()

# All columns are continious except the except the Sex column so will need to change it to 0s and 1s in preprocessing.

# No need to change Source (target column) as sklearn caters to strings in columns.

# All continious numeric columns have different ranges so we will use standard scaler to standardize the columns.
# Other scalers can be used here as well to produce better results
# Standard scaler makes it so all columns have a mean of 0 and variance of 1

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,F,out
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,F,out
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,F,out
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,M,out


In [4]:
data.info() # clean data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4412 entries, 0 to 4411
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HAEMATOCRIT   4412 non-null   float64
 1   HAEMOGLOBINS  4412 non-null   float64
 2   ERYTHROCYTE   4412 non-null   float64
 3   LEUCOCYTE     4412 non-null   float64
 4   THROMBOCYTE   4412 non-null   int64  
 5   MCH           4412 non-null   float64
 6   MCHC          4412 non-null   float64
 7   MCV           4412 non-null   float64
 8   AGE           4412 non-null   int64  
 9   SEX           4412 non-null   object 
 10  SOURCE        4412 non-null   object 
dtypes: float64(7), int64(2), object(2)
memory usage: 379.3+ KB


# Preprocessing

In [5]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Binary encoding: Replacing sex column with 1s and 0s
    df['SEX'] = df['SEX'].replace({'F': 0, 'M': 1})
    
    # Define X and y
    y = df['SOURCE']
    X = df.drop('SOURCE', axis=1)
    
    # Train-test split. Shuffle means it'll shuffle dataset before making the split. random_state ensures that shuffle is done the same way, 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=1)
    
    # Using standard scaler to standardize continious column so they all have a mean of 0 and variance of 1
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns) # Since scaler returns numpt array; we add the pandas to get pandas dataframe
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test, X

In [6]:
X_train, X_test, y_train, y_test, X = preprocess_inputs(data)

In [7]:
X_train.head()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX
709,0.391263,0.257091,0.007699,-0.12209,0.566558,0.371856,-0.432786,0.648011,-1.042297,-1.039605
2917,-1.174921,-0.898732,-1.241813,2.859805,-0.470312,0.822437,1.201741,0.413877,0.580372,0.961904
1185,-0.770744,-1.04321,-0.145302,0.131274,1.646994,-1.61821,-1.495229,-1.268961,-0.764125,-1.039605
718,1.182776,1.220276,0.989459,-0.784733,-0.043366,0.184114,0.384478,0.062676,-1.042297,0.961904
332,-2.286407,-2.054554,-2.555076,-0.784733,1.525009,1.911341,0.874836,1.804047,-1.505917,-1.039605


In [8]:
y_train.value_counts() # little imbalance

out    2110
in     1419
Name: SOURCE, dtype: int64

# Training & Results

In [9]:
models = {
    "Logistic Regression": LogisticRegression(),
    "      Decision Tree": DecisionTreeClassifier(),
    "     Neural Network": MLPClassifier(),
    "      Random Forest": RandomForestClassifier(),
    "  Gradient Boosting": GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

Logistic Regression trained.
      Decision Tree trained.
     Neural Network trained.
      Random Forest trained.
  Gradient Boosting trained.


In [10]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(name + " Accuracy: {:.2f}%".format(acc*100))

Logistic Regression Accuracy: 70.67%
      Decision Tree Accuracy: 66.36%
     Neural Network Accuracy: 73.61%
      Random Forest Accuracy: 72.48%
  Gradient Boosting Accuracy: 74.18%


In [11]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, pos_label='in') # getting f1 score for the minority class
    print(name + " F1 Score: {:.5f}".format(f1))

Logistic Regression F1 Score: 0.59969
      Decision Tree F1 Score: 0.59592
     Neural Network F1 Score: 0.66183
      Random Forest F1 Score: 0.65136
  Gradient Boosting F1 Score: 0.65766


# Balancing Training Data

In [12]:
from imblearn.over_sampling import SMOTE

In [13]:
#SMOTE the training data
sm = SMOTE(random_state=1)
X_bal, y_bal = sm.fit_resample(X_train, y_train)

In [14]:
y_bal.value_counts()

out    2110
in     2110
Name: SOURCE, dtype: int64

# Training & Results After Balancing

In [15]:
models = {
    "Logistic Regression": LogisticRegression(),
    "      Decision Tree": DecisionTreeClassifier(),
    "     Neural Network": MLPClassifier(),
    "      Random Forest": RandomForestClassifier(),
    "  Gradient Boosting": GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_bal, y_bal)
    print(name + " trained.")

Logistic Regression trained.
      Decision Tree trained.
     Neural Network trained.
      Random Forest trained.
  Gradient Boosting trained.


In [16]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(name + " Accuracy: {:.2f}%".format(acc*100))

Logistic Regression Accuracy: 70.67%
      Decision Tree Accuracy: 62.97%
     Neural Network Accuracy: 73.61%
      Random Forest Accuracy: 73.61%
  Gradient Boosting Accuracy: 72.71%


In [17]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, pos_label='in') # getting f1 score for the minority class
    print(name + " F1 Score: {:.5f}".format(f1))

Logistic Regression F1 Score: 0.67174
      Decision Tree F1 Score: 0.56917
     Neural Network F1 Score: 0.68556
      Random Forest F1 Score: 0.68385
  Gradient Boosting F1 Score: 0.67564


In [35]:
pred = MLPClassifier()

In [36]:
pred.fit(X_bal, y_bal)

MLPClassifier()

# Prediction

In [37]:
from collections import OrderedDict

In [38]:
df2 = OrderedDict([
    ('HAEMATOCRIT', 40),
    ('HAEMOGLOBINS',12),
    ( 'ERYTHROCYTE',3),
    ('LEUCOCYTE',8),
    ( 'THROMBOCYTE',360),
    ( 'MCH', 28),
    ('MCHC', 40),
    ( 'MCV', 65),
    ('AGE', 2),
    ('SEX', 'F'),
])

df2 = pd.Series(df2).values.reshape(1,-1)

In [39]:
pred.predict(df2)

ValueError: could not convert string to float: 'F'