# Machine Learning API Creation

### Business Understanding

### Analytical Questions

### Exploratory Data Analysis

In [2]:
# Import the needed packages
import pandas as pd
import numpy as np

# Libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Library for testing the hypothesis
import scipy.stats as stats

# Library for pandas profiling
from pandas_profiling import ProfileReport

# Library for splitting the train data
from sklearn.model_selection import train_test_split

# Library for feature scaling
from sklearn.preprocessing import StandardScaler

# Library for feature encoding
from sklearn.preprocessing import OneHotEncoder

# Libraries for balancing the dataset
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Libraries for modelling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Libraries for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Library for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Library for working with operating system
import os

# Library to serialize a Python object into a flat byte stream and transform a byte stream back into a Python object
import pickle

# Library to handle warnings
import warnings
warnings.filterwarnings('ignore')

  from pandas_profiling import ProfileReport


In [12]:
# Read the train and test patient files from the local `data` folder.

train = pd.read_csv(filepath_or_buffer='data/Paitients_Files_Train.csv')
test = pd.read_csv(filepath_or_buffer='data/Paitients_Files_Test.csv')

In [13]:
# Preview the train dataset (first five rows).

train.head(n=5)

Unnamed: 0,ID,PRG,PL,PR,SK,TS,M11,BD2,Age,Insurance,Sepssis
0,ICU200010,6,148,72,35,0,33.6,0.627,50,0,Positive
1,ICU200011,1,85,66,29,0,26.6,0.351,31,0,Negative
2,ICU200012,8,183,64,0,0,23.3,0.672,32,1,Positive
3,ICU200013,1,89,66,23,94,28.1,0.167,21,1,Negative
4,ICU200014,0,137,40,35,168,43.1,2.288,33,1,Positive


In [14]:
# Preview the test dataset (first five rows).

test.head(n=5)

Unnamed: 0,ID,PRG,PL,PR,SK,TS,M11,BD2,Age,Insurance
0,ICU200609,1,109,38,18,120,23.1,0.407,26,1
1,ICU200610,1,108,88,19,0,27.1,0.4,24,1
2,ICU200611,6,96,0,0,0,23.7,0.19,28,1
3,ICU200612,1,124,74,36,0,27.8,0.1,30,1
4,ICU200613,7,150,78,29,126,35.2,0.692,54,0


The train dataset has a 'Sepssis' column which is absent in the test dataset. This 'Sepssis' column will serve as the target column when training the model.

In [15]:
# Report (rows, columns) for the train dataset, then for the test dataset.

tuple(frame.shape for frame in (train, test))

((599, 11), (169, 10))

The train dataset has 599 rows and 11 columns, while the test dataset has 169 rows and 10 columns.

In [16]:
# Summarise the train dataset: the dtype of each column, the non-null count
# per column (to spot missing values) and the approximate memory usage.

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         599 non-null    object 
 1   PRG        599 non-null    int64  
 2   PL         599 non-null    int64  
 3   PR         599 non-null    int64  
 4   SK         599 non-null    int64  
 5   TS         599 non-null    int64  
 6   M11        599 non-null    float64
 7   BD2        599 non-null    float64
 8   Age        599 non-null    int64  
 9   Insurance  599 non-null    int64  
 10  Sepssis    599 non-null    object 
dtypes: float64(2), int64(7), object(2)
memory usage: 51.6+ KB


In [17]:
# Summarise the test dataset: the dtype of each column, the non-null count
# per column (to spot missing values) and the approximate memory usage.

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         169 non-null    object 
 1   PRG        169 non-null    int64  
 2   PL         169 non-null    int64  
 3   PR         169 non-null    int64  
 4   SK         169 non-null    int64  
 5   TS         169 non-null    int64  
 6   M11        169 non-null    float64
 7   BD2        169 non-null    float64
 8   Age        169 non-null    int64  
 9   Insurance  169 non-null    int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 13.3+ KB


Neither the train nor the test dataset contains any missing values, and the data types of the columns the two datasets share are consistent with each other.

In [19]:
# Total number of missing cells in the train dataset, then in the test dataset
# (per-column NaN counts summed across all columns).

tuple(frame.isna().sum().sum() for frame in (train, test))

(0, 0)

In [20]:
# Count fully duplicated rows in the train dataset, then in the test dataset.
# NOTE(review): every column, including `ID`, takes part in the comparison, so
# rows that differ only by `ID` are not counted as duplicates here.

tuple(frame.duplicated().sum() for frame in (train, test))

(0, 0)

### Visualizations

### Answering Analytical Questions

### Modelling

### Hyper-parameter Tuning

### Evaluation

### Deployment