# Data preprocessing

## Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Read the training table and discard four columns that are not used as features.
df = pd.read_csv('train.csv').drop(
    columns=['hospital_number', 'lesion_2', 'lesion_3', 'cp_data']
)
# Target: the last column. Features: everything except the first and last columns.
y = df.iloc[:, -1].values
X = df.iloc[:, 1:-1].values

In [3]:
# Preview the raw feature matrix before imputation and encoding.
print(X)

[['yes' 'adult' 38.1 ... 3.4 'yes' 2209]
 ['yes' 'adult' 37.5 ... 2.0 'yes' 2208]
 ['yes' 'adult' 38.3 ... 3.4 'yes' 5124]
 ...
 ['yes' 'young' 37.5 ... 7.0 'yes' 400]
 ['yes' 'adult' 38.1 ... 2.0 'yes' 2209]
 ['yes' 'adult' 38.1 ... 3.6 'yes' 2124]]


## Taking care of missing data

In [4]:
from sklearn.impute import SimpleImputer
# Replace every cell equal to the string "None" with the most frequent value of its column.
# NOTE(review): only the literal string "None" is treated as missing here; genuine NaN
# entries are not covered by this setting -- confirm how read_csv parsed the file.
# NOTE(review): the imputer is fitted on the full X, before the train/test split.
imputer = SimpleImputer(strategy='most_frequent', missing_values="None")
X = imputer.fit_transform(X)

## Encoding categorical data

### Encoding the independent variables

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the listed column positions, dropping the first level of each
# encoded column; every other column passes through unchanged.
# NOTE(review): the indices are presumably the categorical columns of X as built
# in the loading cell -- verify them whenever the dropped/selected columns change.
ct = ColumnTransformer(
    transformers=[
        (
            'encoder',
            OneHotEncoder(sparse_output=False, drop='first'),
            [0, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 19, 21],
        ),
    ],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

### Encoding the dependent variable

In [6]:
from sklearn.preprocessing import LabelEncoder
# Map each class label in y to an integer code (codes follow the sorted order of
# the labels; the original labels stay available as le.classes_).
le = LabelEncoder()
# Assign the result back: fit_transform returns a new array and does not modify y,
# so without the assignment the target would remain un-encoded.
y = le.fit_transform(y)
# Display the encoded target as the cell output.
y

array([0, 1, 2, ..., 2, 2, 2])

## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The shuffle is random, so a fixed
# random_state is passed to make the split -- and every metric computed from
# it -- identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# Inspect the feature matrix after imputation and one-hot encoding (the full X, not a train/test split).
print(X)

[[1.0 0.0 1.0 ... 8.5 3.4 2209]
 [1.0 0.0 1.0 ... 64.0 2.0 2208]
 [1.0 0.0 1.0 ... 6.4 3.4 5124]
 ...
 [1.0 1.0 0.0 ... 5.9 7.0 400]
 [1.0 0.0 0.0 ... 74.0 2.0 2209]
 [1.0 0.0 0.0 ... 6.0 3.6 2124]]


In [9]:
# (rows, columns) of the encoded feature matrix.
X.shape

(1235, 50)

## Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so the test data does not influence the scaler.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Training the Random Forest Classification model on the Training set

In [11]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees using the entropy split criterion. Tree construction
# is randomized, so random_state is fixed to make the fitted model (and its
# predictions) reproducible across runs.
classifier = RandomForestClassifier(
    criterion='entropy', n_estimators=100, random_state=42
)
classifier.fit(X_train, y_train)

# Predicting the Test set results

In [12]:
# Predict a class label for every row of the held-out test features.
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of true labels against predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Keep the accuracy as the final expression so the notebook displays it.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[64  3 14]
 [ 4 29 12]
 [25 12 84]]


0.7165991902834008