### Agenda of this tutorial
- Data Mining
    1. Create your training data set
    2. Train your data set
        - 2.1 Decision Tree
        - 2.2 Naive Bayes
        - 2.3 SVM
    3. Titanic Data Set
        - 3.1 Decision Tree
        - 3.2 Parameter Tuning of Decision Tree
    4. Evaluation
        - 4.1 Confusion Matrix
        - 4.2 Classification Report
        - 4.3 Cross validation
    5. References and summary

## 1. Create training data set

In [None]:
import numpy as np
import pandas as pd

# Load the raw weather table from the local datasets folder.
X = pd.read_csv('datasets/weather.csv')
# Preview the first ten rows; as the last expression it becomes the cell output.
X.head(10)

In [None]:
# Hand-encoded weather sample: one row per observed day.
# Column order: outlook, temperature, humidity, windy.
# NOTE(review): outlook and windy are stored as small integer codes; the
# mapping from the categorical values is not shown here -- confirm it
# against datasets/weather.csv.
weather_rows = [
    (3, 85, 85, 2),
    (3, 80, 90, 1),
    (1, 83, 86, 2),
    (2, 70, 96, 2),
    (2, 68, 80, 2),
    (2, 65, 70, 1),
    (1, 64, 65, 1),
    (3, 72, 95, 2),
    (3, 69, 70, 2),
    (2, 75, 80, 2),
    (3, 75, 70, 1),
    (1, 72, 90, 1),
    (1, 81, 75, 2),
    (2, 71, 91, 1),
]
X_weather = np.array(weather_rows)

# Target label for each row above, in the same order (0 and 1 are later
# rendered as 'no' and 'yes' when the decision tree is drawn).
y_weather = [0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0]

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Scatter temperature (column 1) against humidity (column 2), coloring each
# point by its class label so the two classes can be compared visually.
plt.scatter(
    x=X_weather[:, 1],
    y=X_weather[:, 2],
    c=y_weather,
    cmap=plt.cm.Paired,
)

## 2. Train your data set 

### 2.1 Train your data set with Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default growth settings, fitted on the weather sample.
# The seed is fixed because scikit-learn permutes the candidate features at
# every split: on a data set this small several splits can tie, and without a
# seed the fitted tree (and therefore the prediction and the rendered diagram
# below) may differ from run to run. 100 matches the seed used for the tuned
# tree in section 3.2.
clf = DecisionTreeClassifier(random_state=100)
clf = clf.fit(X_weather, y_weather)

In [None]:
# Classify one unseen day given as [outlook, temperature, humidity, windy].
# predict() takes a 2-D collection of samples, hence the nested list.
sample_day = [[2, 71, 91, 2]]
clf.predict(sample_day)

In [None]:
import os
from io import StringIO

import pydotplus #conda install -c conda-forge pydotplus=2.0.2
from IPython.display import Image
from sklearn.tree import export_graphviz

# Human-readable labels for the diagram: one name per input column, and one
# name per target class listed in ascending label order (0 -> 'no', 1 -> 'yes').
feature_names = ['outlook','temperature','humidity','windy']
class_names = ['no','yes']

# Write the Graphviz DOT description of the fitted tree into an in-memory
# text buffer instead of a file on disk.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Parse the DOT text, render it to PNG bytes, and show the picture inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

### 2.2 Train your data set with Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the same weather
# sample that was used for the decision tree.
nb_clf = GaussianNB()
nb_clf.fit(X=X_weather, y=y_weather)

In [None]:
# Classify one day given as [outlook, temperature, humidity, windy]; these
# values are identical to the last row of the training sample.
sample_day = [[2, 71, 91, 1]]
nb_clf.predict(sample_day)

### 2.3 Train your data set with SVM

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings, fitted on
# the weather sample exactly as provided (no feature scaling is applied).
svm_clf = SVC()
svm_clf.fit(X=X_weather, y=y_weather)

In [None]:
# Classify one unseen day given as [outlook, temperature, humidity, windy].
sample_day = [[2, 71, 91, 2]]
svm_clf.predict(sample_day)

## 3. Train Titanic Data set

### 3.1 Decision Tree

In [None]:
import numpy as np
import pandas as pd

# Load the preprocessed Titanic training table; this rebinds X, which held
# the weather table earlier in the notebook.
X = pd.read_csv('datasets/titanic_train_preprocess.csv')
# Preview the first ten rows; as the last expression it becomes the cell output.
X.head(10)

In [None]:
# Input columns used to predict survival; this order is also the column
# order of every sample passed to the classifier later on.
titanic_feature_columns = ['Pclass','Sex','Age','Fare','SibSp','Parch']
X_titanic = X[titanic_feature_columns]

# Target column to predict.
y_titanic = X['Survived']

In [None]:
# Replace the feature DataFrame with a plain NumPy array copy of its values.
# np.array also accepts an ndarray, so re-running this cell is harmless.
X_titanic = np.array(X_titanic, copy=True)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default growth settings, fitted on the Titanic
# features. The seed is fixed because scikit-learn permutes the candidate
# features at every split, so equally good splits would otherwise be resolved
# differently from run to run and the rendered tree would not be
# reproducible. 100 matches the seed used for the tuned tree in section 3.2.
clf = DecisionTreeClassifier(random_state=100)
clf = clf.fit(X_titanic, y_titanic)

In [None]:
# Classify one hypothetical passenger given in training-column order:
# [Pclass, Sex, Age, Fare, SibSp, Parch].
sample_passenger = [[3, 0, 25, 91, 1, 1]]
clf.predict(sample_passenger)

In [None]:
import os
from io import StringIO

import pydotplus #conda install -c conda-forge pydotplus=2.0.2
from IPython.display import Image
from sklearn.tree import export_graphviz

# Labels for the diagram: one name per input column (same order as the
# training features) and one name per target class in ascending label order.
feature_names = ['Pclass','Sex','Age','Fare','SibSp','Parch']
class_names = ['no survived','Survived']

# Appearance options shared by every node of the exported diagram.
render_options = dict(
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Write the Graphviz DOT description of the fitted tree into an in-memory
# text buffer instead of a file on disk.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data, **render_options)

# Parse the DOT text, render it to PNG bytes, and show the picture inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

### 3.2 Parameter Tuning of Decision Tree

In [None]:
# Constrained tree: split on entropy, stop at depth 3, and require at least
# five training samples in every leaf. The fixed seed keeps the result
# reproducible between runs.
clf = DecisionTreeClassifier(
    criterion="entropy",
    random_state=100,
    max_depth=3,
    min_samples_leaf=5,
)
clf = clf.fit(X_titanic, y_titanic)

import os
from io import StringIO

import pydotplus #conda install -c conda-forge pydotplus=2.0.2
from IPython.display import Image
from sklearn.tree import export_graphviz

# Labels for the diagram: one name per input column (same order as the
# training features) and one name per target class in ascending label order.
feature_names = ['Pclass','Sex','Age','Fare','SibSp','Parch']
class_names = ['no survived','Survived']

# Write the Graphviz DOT description of the tuned tree into an in-memory
# text buffer instead of a file on disk.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Parse the DOT text, render it to PNG bytes, and show the picture inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

## 4. Evaluation method

### 4.1 Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Three-class toy example: the true labels and what a model predicted.
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]

# Entry (i, j) counts the samples whose true label is i and whose predicted
# label is j, so the diagonal holds the correct predictions.
confusion_matrix(y_true=y_true, y_pred=y_pred)

In [None]:
# Binary toy example: the true labels and what a model predicted.
y_true = [0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [0, 1, 0, 1, 0, 1, 0, 1]

# For two classes the matrix is 2x2: the first row holds the true negatives
# and false positives, the second row the false negatives and true positives.
(tn, fp), (fn, tp) = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp

In [None]:
from sklearn.metrics import classification_report

# Three-class toy example: the true labels and what a model predicted.
y_true = [0, 1, 2, 2, 0]
y_pred = [0, 0, 2, 1, 0]

# Display names for labels 0, 1 and 2, used as the row headers of the report.
target_names = ['class 0', 'class 1', 'class 2']

# Build the per-class precision / recall / f1-score / support table as text,
# then print it so the column alignment is preserved.
report = classification_report(y_true, y_pred, target_names=target_names)
print(report)

             precision    recall  f1-score   support

    class 0       0.67      1.00      0.80         2
    class 1       0.00      0.00      0.00         1
    class 2       1.00      0.50      0.67         2

avg / total       0.67      0.60      0.59         5


### 4.2 Classification Report

### 4.3 Cross validation