<a href="https://colab.research.google.com/github/hsuemily/CE888_Hsu-Chi-Rou_1900759/blob/master/Assignment/Dataset3_SafeDriver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import Libraries

In [0]:
# Import library
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler

## 2.  Load Dataset

In [0]:
# Load data
# Colab library to upload files to notebook, Ref:https://medium.com/@saedhussain/google-colaboratory-and-kaggle-datasets-b57a83eb6ef8
from google.colab import files

# Install Kaggle library
!pip install -q kaggle

# Load data from Kaggle to Google Colab virtual machine
os.environ['KAGGLE_USERNAME'] = "emilyhsucr" # username from the json file
os.environ['KAGGLE_KEY'] = "3b6aa5aa02b468c89e4cfac52a5d992e" # key from the json file
!kaggle competitions download -c porto-seguro-safe-driver-prediction # api copied from kaggle

In [0]:
## Porto Seguro’s Safe Driver Prediction
# Read the training set directly from the zip archive in the working directory
# (presumably the file fetched by the Kaggle download in the previous cell).
SafeDriver = pd.read_csv('train.csv.zip')

In [0]:
# Preview the first five rows to check that the file was parsed as expected
SafeDriver.head(5)

In [0]:
# Print the column names, dtypes and non-null counts of the loaded frame
SafeDriver.info()

## 3. Data Visualization

In [0]:
# Class balance of the `target` label in the Safe Driver dataset:
# a bar chart of the counts followed by the raw numbers.
target_count = SafeDriver['target'].value_counts()
target_count.plot(kind='bar', title='SafeDriver: Count (target)')

n_class0 = target_count[0]
n_class1 = target_count[1]
print('will not initiate an auto insurance claim (0):', n_class0)
print('will initiate an auto insurance claim (1):', n_class1)
# Share of class 0 among all rows, expressed as a percentage
print('Imbalance rate:', round((n_class0 / (n_class0 + n_class1)) * 100, 2), '%')

## 4. Data Preprocessing
 - 4.1 Missing Value
 - 4.2 Normalization

### 4.1 Missing Value

In [0]:
# Missing-value summary

#*****SafeDriver
null_mask = SafeDriver.isnull()
null_counts = null_mask.sum()

# Total  : number of NaN entries in each attribute, largest first
# Percent: NaN entries divided by the number of entries in that attribute
#          (a fraction between 0 and 1, despite the column name)
# NOTE(review): only NaN is counted; if this dataset marks missing values with
# a sentinel such as -1, they will not show up in this table - confirm.
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head()

### 4.2 Normalization

In [0]:
# Normalize dataset - SafeDriver
# Work on a copy so the frame loaded from disk stays untouched.
SafeDriver_copy = SafeDriver.copy()

# Min-max scale every numeric feature column to [0, 1], one column at a time.
# The `target` label is excluded so that it keeps its original integer class
# values instead of being converted to floats by the scaler.
# NOTE(review): the scaler is fitted on the full dataset before the
# cross-validation splits are made, so held-out folds influence the scaling.
# NOTE(review): every other numeric column is kept as a feature, including any
# row-identifier column the file may contain - confirm that this is intended
# (dropping a column here would shift the column indices used by later plots).
feature_cols = SafeDriver_copy.select_dtypes(include='number').columns.drop('target', errors='ignore')
for col in feature_cols:
    scaled = MinMaxScaler().fit_transform(SafeDriver_copy[[col]])
    SafeDriver_copy[col] = scaled.ravel()

# separate the data into x (feature matrix) and y (label vector)
SafeDriver_X = SafeDriver_copy.drop('target', axis=1).values
SafeDriver_Y = np.array(SafeDriver_copy['target'])
print("SafeDriver:", SafeDriver_X)
print(SafeDriver_X.shape)
print(SafeDriver_Y.shape)

# Summary statistics of the scaled frame. This must be the last expression of
# the cell to be displayed; placed mid-cell its result was silently discarded.
SafeDriver_copy.describe()

## 5. Supervised Learning
### 5.1 Decision tree (cross-validation)
### 5.2 Random forest (cross-validation)

In [0]:
## Decision tree

### Ref_1: https://www.kaggle.com/sudhirnl7/logistic-regression-with-stratifiedkfold
### Ref_2: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#
#Import library
from sklearn import tree
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold

# 10 stratified, shuffled folds with a fixed seed. `Skf` and the metric
# functions imported above are also used by the cells that follow.
Skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=49)

# Per-fold scores of the decision tree (read by the summary and box-plot cells)
cv_percisionDT = []
cv_recallDT = []
cv_F1scoreDT = []
cv_scoreDTRoc = []

for i, (train_index, test_index) in enumerate(Skf.split(SafeDriver_X, SafeDriver_Y), start=1):
    print('{} of KFold {}'.format(i, Skf.n_splits))
    x_tr, x_v = SafeDriver_X[train_index], SafeDriver_X[test_index]
    y_tr, y_v = SafeDriver_Y[train_index], SafeDriver_Y[test_index]

    # model: a fresh tree per fold, trained on the 9 training bins
    modelDT = tree.DecisionTreeClassifier(criterion="entropy", random_state=1)
    modelDT.fit(x_tr, y_tr)

    # Predict the held-out fold once and reuse the result for every metric
    # (it was previously recomputed for each metric and each print).
    y_pred = modelDT.predict(x_v)

    percisionDT = precision_score(y_v, y_pred)
    recallDT = recall_score(y_v, y_pred)
    F1scoreDT = f1_score(y_v, y_pred)
    # NOTE(review): roc_auc_score receives hard class predictions here, not
    # probability scores; left unchanged so the value stays comparable with
    # the other methods evaluated in this notebook.
    scoreDTRoc = roc_auc_score(y_v, y_pred)

    cv_percisionDT.append(percisionDT)
    cv_recallDT.append(recallDT)
    cv_F1scoreDT.append(F1scoreDT)
    cv_scoreDTRoc.append(scoreDTRoc)

    print('Confusion matrix\n', confusion_matrix(y_v, y_pred))
    print('Precision:', percisionDT)
    print('Recall:', recallDT)
    print('F1 score:', F1scoreDT)
    print('ROC AUC score:', scoreDTRoc)

In [0]:
from statistics import mean, stdev

# Mean and sample standard deviation of each decision-tree metric across the
# cross-validation folds, rounded to three decimals.
dt_fold_scores = (
    ("percision", cv_percisionDT),
    ("recall", cv_recallDT),
    ("F1score", cv_F1scoreDT),
    ("Roc score", cv_scoreDTRoc),
)
for metric_label, fold_scores in dt_fold_scores:
    print("Mean of " + metric_label + ":", round(mean(fold_scores), 3),
          '\nStandard Deviation of ' + metric_label + ':', round(stdev(fold_scores), 3))

In [0]:
## Random forest
from sklearn.ensemble import RandomForestClassifier

# Per-fold scores of the random forest (read by the summary and box-plot cells)
cv_percisionRF = []
cv_recallRF = []
cv_F1scoreRF = []
cv_scoreRFRoc = []

# Uses `Skf` and the metric functions set up in the decision-tree cell.
for i, (train_index, test_index) in enumerate(Skf.split(SafeDriver_X, SafeDriver_Y), start=1):
    print('{} of KFold {}'.format(i, Skf.n_splits))
    x_tr, x_v = SafeDriver_X[train_index], SafeDriver_X[test_index]
    y_tr, y_v = SafeDriver_Y[train_index], SafeDriver_Y[test_index]

    # model: a fresh forest per fold, trained on the 9 training bins
    modelRF = RandomForestClassifier(random_state=1)
    modelRF.fit(x_tr, y_tr)

    # Predict the held-out fold once and reuse the result for every metric
    # (it was previously recomputed for each metric and each print).
    y_pred = modelRF.predict(x_v)

    percisionRF = precision_score(y_v, y_pred)
    recallRF = recall_score(y_v, y_pred)
    F1scoreRF = f1_score(y_v, y_pred)
    # NOTE(review): roc_auc_score receives hard class predictions here, not
    # probability scores; left unchanged so the value stays comparable with
    # the other methods evaluated in this notebook.
    scoreRFRoc = roc_auc_score(y_v, y_pred)

    cv_percisionRF.append(percisionRF)
    cv_recallRF.append(recallRF)
    cv_F1scoreRF.append(F1scoreRF)
    cv_scoreRFRoc.append(scoreRFRoc)

    print('Confusion matrix\n', confusion_matrix(y_v, y_pred))
    print('Precision:', percisionRF)
    print('Recall:', recallRF)
    print('F1 score:', F1scoreRF)
    print('ROC AUC score:', scoreRFRoc)

In [0]:
from statistics import mean, stdev

# Mean and sample standard deviation of each random-forest metric across the
# cross-validation folds, rounded to three decimals.
rf_fold_scores = (
    ("percision", cv_percisionRF),
    ("recall", cv_recallRF),
    ("F1score", cv_F1scoreRF),
    ("Roc score", cv_scoreRFRoc),
)
for metric_label, fold_scores in rf_fold_scores:
    print("Mean of " + metric_label + ":", round(mean(fold_scores), 3),
          '\nStandard Deviation of ' + metric_label + ':', round(stdev(fold_scores), 3))

## 6. Unsupervised Learning
6.1  Using the Elbow method and the Silhouette method, identify the number of clusters in the dataset.

6.2 Apply the K-means method and save the cluster information

### 6.1 Using the Elbow method and the Silhouette method, identify the number of clusters in the dataset.

In [0]:
## Elbow method (K-mean method) (stratified cross-validation)
### Ref: https://www.kaggle.com/abhishekyadav5/kmeans-clustering-with-elbow-method-and-silhouette
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

Skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=49)

# Candidate numbers of clusters, shared by the fitting loop and the plot
cluster_range = range(1, 9)

# One list of inertia values (one per candidate k) for every fold
scoreE1_all = []

for i, (train_index, test_index) in enumerate(Skf.split(SafeDriver_X, SafeDriver_Y), start=1):
    print('{} of KFold {}'.format(i, Skf.n_splits))
    # Only the training bins are clustered; the held-out bin and the labels
    # are not needed for the elbow curve. `x_tr` from the last fold stays
    # defined after the loop and is read by the silhouette and K-means cells
    # further down.
    x_tr = SafeDriver_X[train_index]

    scoreEl = []
    for cluster in cluster_range:
        kmeans = KMeans(n_clusters=cluster, init="k-means++", random_state=49)
        kmeans.fit(x_tr)
        # inertia_ is the total within-cluster sum of squares of the fit
        scoreEl.append(kmeans.inertia_)
    scoreE1_all.append(scoreEl)

    # plotting the score
    plt.plot(cluster_range, scoreEl, 'g-o')
    plt.title('The Elbow Method')
    plt.xlabel('no of clusters')
    plt.ylabel('Total within-cluster sum of square')
    plt.show()
    ## Total within-cluster sum of square: https://www.jamleecute.com/partitional-clustering-kmeans-kmedoid/
    ## Total within-cluster sum of square: https://www.jamleecute.com/partitional-clustering-kmeans-kmedoid/

In [0]:
## Silhouette score
### Ref_1: https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
### Ref_2: https://www.kaggle.com/abhishekyadav5/kmeans-clustering-with-elbow-method-and-silhouette

import matplotlib.cm as cm

# NOTE: this cell clusters `x_tr`, i.e. the training bins left over from the
# last fold of a previously executed cross-validation loop.
# NOTE(review): the silhouette needs pairwise distances over all rows of x_tr,
# which can be very slow on a large training set - consider sub-sampling.

# Average silhouette per candidate k (plotted in the next cell)
silhouette_avg_list = []
n_clusters_list = []
for n_clusters in range(2,9):
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1 to 1; the axis is limited
    # to [-0.1, 1] as in the referenced scikit-learn example.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(x_tr) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a fixed random seed
    # (49) for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=49)
    cluster_labels = clusterer.fit_predict(x_tr)

    # Compute the silhouette coefficient of every sample once. The average
    # silhouette score is the mean of these values, so taking the mean here
    # avoids a second full silhouette computation via silhouette_score.
    sample_silhouette_values = silhouette_samples(x_tr, cluster_labels)
    silhouette_avg = float(np.mean(sample_silhouette_values))
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    silhouette_avg_list.append(silhouette_avg)
    n_clusters_list.append(n_clusters)

    y_lower = 10
    for cluster_idx in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # this cluster, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == cluster_idx]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(cluster_idx) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(cluster_idx))

        # Compute the new y_lower for next plot (leave a gap of 10 rows)
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed, projected onto the feature
    # columns at index 2 and index 8
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(x_tr[:, 2], x_tr[:, 8], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 2], centers[:, 8], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Write each cluster's number inside its centre marker
    for center_idx, c in enumerate(centers):
        ax2.scatter(c[2], c[8], marker='$%d$' % center_idx, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 3rd feature")
    ax2.set_ylabel("Feature space for the 9th feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()

In [0]:
# Average silhouette score against the number of clusters tried in the cell above
fig, ax = plt.subplots()
ax.plot(n_clusters_list, silhouette_avg_list, 'g-o')
ax.set_title('The Silhouette method')
ax.set_xlabel('no of clusters')
ax.set_ylabel('The Silhouette score')
plt.show()
print(silhouette_avg_list)

### 6.2 Apply the K-means method and save the cluster information

In [0]:
## Ref: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
from sklearn.cluster import KMeans

# Fit a 2-cluster K-means on `x_tr` (the training bins left over from a
# previously executed cross-validation loop), then record how many samples
# fall in each cluster and where the centroids are.
kmeans = KMeans(n_clusters=2, random_state=49)
kmeans.fit(x_tr)
kmLabels = kmeans.labels_
centroids = kmeans.cluster_centers_
Label = [int((kmLabels == cluster_id).sum()) for cluster_id in (0, 1)]
print('Label:', Label)
print('Centroids', centroids)

## 7. Mixture Method
A new approach to dealing with imbalanced datasets, based on a mixture of
supervised and unsupervised learning.

In [0]:
## Partition each of the datasets into 10 bins
## , keeping the imbalance ratio from the original dataset
## Ref: https://towardsdatascience.com/k-means-clustering-algorithm-applications-evaluation-methods-and-drawbacks-aa03e644b48a
# Imports are done once at the top of the cell instead of inside the loop.
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

Skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=49)

# Per-fold cluster sizes and centroids
Label_all = []
centroids_all = []

# Per-fold scores of the mixture method (read by the summary and box-plot cells)
cv_percisionRFnew = []
cv_recallRFnew = []
cv_F1scoreRFnew = []
cv_scoreRFRocnew = []

# Per-fold confusion matrices
ConfusionMatrix_all = []

## Use different 9 bins in training dataset,
## and remain one to be the testing dataset
for i, (train_index, test_index) in enumerate(Skf.split(SafeDriver_X, SafeDriver_Y), start=1):
    print('{} of KFold {}'.format(i, Skf.n_splits))
    x_tr, x_v = SafeDriver_X[train_index], SafeDriver_X[test_index]
    y_tr, y_v = SafeDriver_Y[train_index], SafeDriver_Y[test_index]

    kmeans = KMeans(n_clusters=2, random_state=49).fit(x_tr)
    kmLabels = kmeans.labels_
    ## save the centroid and the number of samples in each cluster in each 9 bins
    Label = [np.count_nonzero(kmLabels == 0), np.count_nonzero(kmLabels == 1)]
    Label_all.append(Label)
    centroids = kmeans.cluster_centers_
    centroids_all.append(centroids)

    ## plot the distribution of samples and the centroids, projected onto the
    ## feature columns at index 2 and index 8
    in_cluster0 = kmLabels == 0
    in_cluster1 = kmLabels == 1
    plt.figure(figsize=(8,5))
    plt.title("SafeDriver of data points(%d of KFold 10)" % i, fontsize=18)
    plt.grid(True)
    plt.scatter(x_tr[in_cluster0, 2], x_tr[in_cluster0, 8],
                c='purple', label='cluster 0')
    plt.scatter(x_tr[in_cluster1, 2], x_tr[in_cluster1, 8],
                c='yellow', label='cluster 1')
    plt.scatter(centroids[0, 2], centroids[0, 8], marker='*', s=300, c='g', label='centroid 0')
    plt.scatter(centroids[1, 2], centroids[1, 8], marker='*', s=300, c='r', label='centroid 1')
    plt.legend()
    plt.savefig('SafeDriver_Kmean_' + str(i) + 'of K fold 10.png', dpi=300)
    plt.show()

    ## train and test the dataset in random forest model
    # NOTE(review): the forest is trained on the K-means cluster ids
    # (kmLabels), not on y_tr, yet it is scored against the true labels y_v.
    # Cluster numbering is arbitrary, so cluster 1 need not correspond to
    # target 1 - confirm that this is the intended mixture method.
    modelRFnew = RandomForestClassifier(random_state=42)
    modelRFnew.fit(x_tr, kmLabels)

    # Predict the held-out fold once and reuse the result for every metric
    # (it was previously recomputed for each metric and each print).
    y_pred = modelRFnew.predict(x_v)

    percisionRFnew = precision_score(y_v, y_pred)
    recallRFnew = recall_score(y_v, y_pred)
    F1scoreRFnew = f1_score(y_v, y_pred)
    scoreRFRocnew = roc_auc_score(y_v, y_pred)

    cv_percisionRFnew.append(percisionRFnew)
    cv_recallRFnew.append(recallRFnew)
    cv_F1scoreRFnew.append(F1scoreRFnew)
    cv_scoreRFRocnew.append(scoreRFRocnew)

    ConfusionMatrix = confusion_matrix(y_v, y_pred)
    ConfusionMatrix_all.append(ConfusionMatrix)

    print('Confusion matrix\n', ConfusionMatrix)
    print('Precision:', percisionRFnew)
    print('Recall:', recallRFnew)
    print('F1 score:', F1scoreRFnew)
    print('ROC AUC score:', scoreRFRocnew)

In [0]:
from statistics import mean, stdev

# Mean and sample standard deviation of each mixture-method metric across the
# cross-validation folds, rounded to three decimals.
mixture_fold_scores = (
    ("percision", cv_percisionRFnew),
    ("recall", cv_recallRFnew),
    ("F1score", cv_F1scoreRFnew),
    ("Roc score", cv_scoreRFRocnew),
)
for metric_label, fold_scores in mixture_fold_scores:
    print("Mean of " + metric_label + ":", round(mean(fold_scores), 3),
          '\nStandard Deviation of ' + metric_label + ':', round(stdev(fold_scores), 3))

## 8. Compare Results

In [0]:
## Ref:https://plotly.com/python/box-plots/
## A boxplot of the cross-validation results for each method
import plotly.graph_objects as go

# x-axis category of each metric, in the same order as the score lists below
metric_names = ['Percision', 'Recall', 'F1 Score', 'ROC-AUC Score']

# (trace name, box colour, per-fold score lists in metric_names order)
method_results = [
    ('Decision Tree', '#3D9970',
     [cv_percisionDT, cv_recallDT, cv_F1scoreDT, cv_scoreDTRoc]),
    ('Random Forest', '#FF4136',
     [cv_percisionRF, cv_recallRF, cv_F1scoreRF, cv_scoreRFRoc]),
    ('Mixture Method', '#FF851B',
     [cv_percisionRFnew, cv_recallRFnew, cv_F1scoreRFnew, cv_scoreRFRocnew]),
]

fig = go.Figure()

for method_name, box_color, metric_scores in method_results:
    # Each score needs exactly one category label. The labels are generated
    # from the actual length of every score list; the former hand-written
    # list repeated each label 12 times while each list holds one score per
    # fold (10), which assigned scores to the wrong metric.
    x = [name for name, scores in zip(metric_names, metric_scores) for _ in scores]
    y = [score for scores in metric_scores for score in scores]
    fig.add_trace(go.Box(
        y=y,
        x=x,
        name=method_name,
        marker_color=box_color
    ))

fig.update_layout(
    yaxis_title='Score',
    boxmode='group' # group together boxes of the different traces for each value of x
)
fig.show()