In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [None]:
## Print the current working directory; the relative paths used below resolve against it.
!pwd       ##run shell command pwd

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
!mv iris.data iris.csv

In [None]:
## List the working directory to confirm that iris.csv is present.
!ls

In [None]:
## Column labels are supplied through names=, so read_csv treats every line of the file as data.
## NOTE(review): 'seppalWidthCm' looks like a typo for 'sepalWidthCm'; it is kept
## unchanged because the feature-selection cell selects the column by this exact name.
columns = [
    'sepalLengthCm',
    'seppalWidthCm',
    'petalLengthCm',
    'petalWidthCm',
    'species',
]
df = pd.read_csv("./iris.csv", names=columns)

In [None]:
## Display the loaded iris DataFrame.
df

In [None]:
## Feature matrix: the four measurement columns (every column except 'species').
feature_columns = ['sepalLengthCm', 'seppalWidthCm', 'petalLengthCm', 'petalWidthCm']
iris_X = df[feature_columns]
iris_X.head()

In [None]:
## Target: the 'species' column, selected with a list so the result stays a
## one-column DataFrame (a later cell indexes it as iris_y['species']).
target_columns = ['species']
iris_y = df[target_columns]
iris_y.head()

In [None]:
## Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    iris_X,
    iris_y,
    test_size=0.2,
    random_state=36,
)


In [None]:
## Report the number of rows in each part of the split.
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(label, len(part))

In [None]:
## Depth-limited decision tree (max_depth=2) with a fixed seed, trained on the iris training split.
DTree_clf = DecisionTreeClassifier(
    max_depth=2,
    random_state=36,
)
DTree_clf.fit(X_train, y_train)

In [None]:
## Predict the species for the held-out test rows.
y_predict = DTree_clf.predict(X_test)

In [None]:
## First five true labels of the test split (positional row slice).
y_test.iloc[:5]

In [None]:
## First five predicted labels, for a side-by-side look against the true labels.
y_predict[:5]

In [None]:
## Fraction of test rows whose predicted species matches the true species.
accuracy_score(y_test, y_predict)

In [None]:
!pip install graphviz

In [None]:
from sklearn.tree import export_graphviz
from sklearn import tree
from IPython.display import SVG
from graphviz import Source
from IPython.display import display

In [None]:
## Draw the fitted iris tree with matplotlib.
## Feature and class labels make the nodes readable (otherwise they show X[i]),
## class labels come from classes_ so they follow the order the estimator uses,
## and the trailing semicolon suppresses the list-of-annotations repr.
tree.plot_tree(
    DTree_clf,
    feature_names=list(iris_X.columns),
    class_names=[str(label) for label in DTree_clf.classes_],
    filled=True,
);

In [None]:
## Export the fitted iris tree to Graphviz DOT and wrap it in a Source object for rendering.
names = iris_X.columns
print(names)
## export_graphviz expects class names in the estimator's own (sorted) class order,
## so take them from classes_ instead of the order labels happen to appear in the data.
class_labels = [str(label) for label in DTree_clf.classes_]
graph = Source(
    tree.export_graphviz(
        DTree_clf,
        ## plain lists are accepted by every scikit-learn release; some reject Index/ndarray here
        feature_names=list(names),
        class_names=class_labels,
        max_depth=2,
        filled=True,
    )
)

In [None]:
## Render the Graphviz source to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))


In [None]:
!wget https://github.com/haiyanwa/ML_workshop/blob/main/Heart.csv
!ls

In [None]:
## Load the Heart data set.
## Read it from the workshop repository's raw URL rather than an absolute
## home-directory path, which only exists on the original author's machine.
HEART_CSV_URL = "https://raw.githubusercontent.com/haiyanwa/ML_workshop/main/Heart.csv"
df_heart = pd.read_csv(HEART_CSV_URL)
df_heart.head()

Thal: Thallium stress test

AHD: Yes indicates the presence of heart disease based on an angiographic test

In [None]:
## (rows, columns) of the Heart data as loaded.
df_heart.shape

In [None]:
## Per-column flag: True where the column contains at least one missing value.
df_heart.isnull().any()

In [None]:
## Count the missing values in the Thal and Ca columns, one count per line.
for column_name in ("Thal", "Ca"):
    print(df_heart[column_name].isnull().sum())

In [None]:
##drop rows with a NaN value in either column checked above
## Thal is included as well: otherwise its missing values survive and are later
## encoded silently as an all-zero set of dummy columns.
## Reassigning (instead of inplace=True) keeps the step a plain expression.
df_heart = df_heart.dropna(axis=0, subset=['Ca', 'Thal'])

In [None]:
## Shape after the drop, to see how many rows remain.
df_heart.shape
## Kept for reference (disabled): lists the index labels of rows whose Thal is missing.
#df_heart[df_heart.Thal.isnull()].index.tolist()

In [None]:
## Distinct values remaining in the Ca column.
df_heart['Ca'].unique()

In [None]:
## Target vector: the AHD column, taken as a Series.
target_column = 'AHD'
heart_y = df_heart[target_column]
heart_y

In [None]:
## Feature frame: everything except the row-id column ('Unnamed: 0') and the target ('AHD').
non_feature_columns = ['Unnamed: 0', 'AHD']
heart_X = df_heart.drop(non_feature_columns, axis=1)
heart_X

## Convert categorical values to numerical values


|index|value|
| :-  | :-: |
|0 | Yes|
|1 |No|
|2 |No|
|3 |No|
|4 |Yes|
|5 |No|
|6 |Yes|
|7 |Yes|



Method 1 <br>
Ordinal encoding: replace each category with an integer code (here Yes → 1, No → 0) <br>

|index|value|
| :-  | :-: |
|0 | 1|
|1 |0|
|2 |0|
|3 |0|
|4 |1|
|5 |0|
|6 |1|
|7 |1|

Method 2<br>
One-hot encoding<br>
Replace the single column with one 0/1 indicator column per category (here 2 columns: Yes and No)<br>  

|index|Yes|No|
| :-  | :-: | :-: |
|0 | 1|0|
|1 |0|1|
|2 |0|1|
|3 |0|1|
|4 |1|0|
|5 |0|1|
|6 |1|0|
|7 |1|0|

In [None]:
##For categorical data, we use get_dummies to convert them to a one-hot encoding
## The column list must be passed as columns=: given positionally it binds to
## get_dummies' second parameter, `prefix`, and only worked by coincidence.
X_trans = pd.get_dummies(heart_X, columns=['ChestPain', 'Thal'])
X_trans

In [None]:
## 80/20 train/test split of the encoded Heart features, with a fixed seed.
heart_X_train, heart_X_test, heart_y_train, heart_y_test = train_test_split(
    X_trans,
    heart_y,
    test_size=0.2,
    random_state=2,
)
## Shapes of the training features and the training target.
for training_part in (heart_X_train, heart_y_train):
    print(training_part.shape)

In [None]:
## Preview the first rows of the training features.
heart_X_train.head()


In [None]:
## Preview the first training labels.
heart_y_train.head()

In [None]:
## Unconstrained decision tree for the Heart data: entropy split criterion, fixed seed.
DTree_clf_heart = DecisionTreeClassifier(
    random_state=2,
    criterion='entropy',
)

## Fit on the training split.
DTree_clf_heart.fit(heart_X_train, heart_y_train)

In [None]:
##Predict AHD for the held-out test rows with the fitted tree
heart_y_predict = DTree_clf_heart.predict(heart_X_test)

In [None]:
## Test-set accuracy of the most recent Heart predictions.
accuracy_score(heart_y_test, heart_y_predict)

In [None]:
## Depth the unconstrained tree grew to.
DTree_clf_heart.get_depth()

In [None]:
##Use GridSearchCV to tune the parameters
from sklearn.model_selection import GridSearchCV
#max_depth: The maximum depth of the tree.
#min_samples_split: The minimum number of samples required to split an internal node
#min_samples_leaf: The minimum number of samples required to be at a leaf node. A split point at any depth
#will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right
#branches.
param_grid = {
    "max_depth": [3, 5, 7, 9],
    "min_samples_split": [10, 20, 30],
    "min_samples_leaf": [5, 10, 20],
}
## random_state makes the search repeatable: without it, tie-breaking between
## equally good splits is random and best_score_/best_params_ can change per run.
## NOTE: this rebinds DTree_clf, which earlier held the fitted iris tree.
DTree_clf = DecisionTreeClassifier(criterion='entropy', random_state=2)
## 3-fold cross-validation over every parameter combination, scored by ROC AUC, on all cores.
grid_cv = GridSearchCV(DTree_clf, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")
_ = grid_cv.fit(heart_X_train, heart_y_train)

In [None]:
## Mean cross-validated score (ROC AUC, per the scoring argument) of the best parameter combination.
grid_cv.best_score_

In [None]:
## Parameter combination that achieved the best cross-validated score.
grid_cv.best_params_

In [None]:
##Random Forest
from sklearn.ensemble import RandomForestClassifier

#bootstrap: Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.
RForest_clf = RandomForestClassifier(
    random_state=2,
    bootstrap=True,
)

## Fit on the training split, then score the predictions for the test split.
RForest_clf.fit(heart_X_train, heart_y_train)
heart_y_predict = RForest_clf.predict(heart_X_test)
accuracy_score(heart_y_test, heart_y_predict)

In [None]:
## Show every parameter of the forest, including the ones left at their defaults.
RForest_clf.get_params()

In [None]:
##Tuning
from sklearn.model_selection import GridSearchCV
## Search space for the random-forest grid search; every combination of these lists is tried.
param_grid = dict(
    bootstrap=[True],
    max_depth=[5, 7, 9, 11],
    max_features=["sqrt", "log2"],
    min_samples_leaf=[5, 10],
    min_samples_split=[10, 20, 30],
    n_estimators=[100, 200, 300, 500, 1000],
)


In [None]:
## Grid-search the random forest: seeded estimator, 3-fold CV, ROC AUC scoring, all cores.
RForest_clf = RandomForestClassifier(random_state=2)
grid_cv = GridSearchCV(
    RForest_clf,
    param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
)
_ = grid_cv.fit(heart_X_train, heart_y_train)

In [None]:
## Mean cross-validated score (ROC AUC, per the scoring argument) of the best forest configuration.
grid_cv.best_score_

In [None]:
## Parameter combination of the best-scoring forest.
grid_cv.best_params_