In [114]:
##-------------------------------------
## Step 0) Load Libraries
##-------------------------------------
## Array Manipulation
import numpy as np                       
## Data Frame Manipulation
import pandas as pd                      
## Change Directory in Python
import os                                
## To Split into test and train
from sklearn.model_selection import train_test_split 
## SVM
from sklearn.svm import SVC 
## CART
from sklearn import tree
## Use Confusion Matrix
from sklearn.metrics import confusion_matrix 
## Visualize Tree
import graphviz 
## Save Image
from matplotlib.pyplot import *        ## Plot
%matplotlib inline

In [139]:
##-------------------------------------
## Step 2) Functions
##-------------------------------------
## Function to evaluate predictions
def imp_metrics(TP,TN,FP,FN):
    """
   Function Name: imp_metrics
   Input: TP,FN,TP,FP
   Output: Accuracy, Recal, Precision, F1
   Purpose: Find metrics to understand recovery of SVM
    
    """
    ## Calculate Accuracy
    accuracy = (TP + TN) / (TP + TN + FN + FP)
    
    ## Calculate Recall
    if(TP != 0 and FN != 0):
        recall = TP/(TP+FN)
    else:
        recall = np.nan

    ## Calculate Precision
    if(TP != 0 and FP != 0):
        precision = TP/(TP+FP)
    else:
        precision =np.nan

    ## Calculate F1
    if(precision == np.nan and recall == np.nan):
        F1 = np.nan
    else:
        F1 = 2*((precision*recall)/(precision+recall))
        
    ## Concatenate    
    metrics = np.array([accuracy,recall,precision,F1])
    return(metrics.round(2))



## Format Metrics
def provide_results(cf_matrix,algo):
    """Uses confusion matrix to find 4 important metrics"""
    ## Find True Positive & Negative and False Positive & Negative
    TP = cf_matrix[0,0]
    TN = cf_matrix[1,1]
    FP = cf_matrix[0,1]
    FN = cf_matrix[1,0] 
    ## Calculate Accuracy, Recal, Precision, F1
    metrics = imp_metrics(TP,TN,FP,FN)
    print('\n',
          algo + ' Accuracy : ',  metrics[0],'\n',
          algo + ' Recall: ',     metrics[1],'\n',
          algo + ' Precision: ',  metrics[2],'\n',
          algo + ' F1: ',         metrics[3])
    
def split_into_groups(x,type_split):
    """Split cut data based on splits"""
    
    if(type_split == 'median'): ## Based on Median
        median_value = np.median(x)
        new_x = np.where(x > median_value , 1, 0) 
    elif(type_split == 'percentile'):  ## Based on percentile
        percents = np.percentile(x,[25,75])
        new_x = np.where(x < percents[0] , 1,
                         np.where((x > percents[0]) & (x < percents[1]),2,3))
    return(new_x)
        

In [144]:
##-------------------------------------
## Step 1) Load Data
##-------------------------------------
default_path = 'C:/Users/james/OneDrive/Documents/Important_Files/Favorite_5_ML algorithms'
os.chdir(default_path)
king = pd.read_csv('king.csv', index_col='id')

In [145]:
##-------------------------------------
## Step 3) Curate Data
##-------------------------------------
## 1) Make Yes/No features into 1/0 features
king['low'] = king['low'].map(dict(Yes=1, No=0))
king['firstep'] = king['firstep'].map(dict(Yes=1, No=0))
king['smoker'] = king['smoker'].map(dict(Y=1, N=0))
king['drinker'] = king['drinker'].map(dict(Y=1, N=0))

## 2) Make Gender into 1:Female and 0:Male
king['gender'] = king['gender'].map(dict(F=1, M=0))

## 3) Make Race into dummy variables and drop race feature
race_variables = pd.get_dummies(king['race'])
king = pd.concat([king,race_variables],axis = 1)
king = king.drop(columns=['race'])

## 4) Make Educational Level into dummy 
##    variables as well and drop educlv
educlv_variables = pd.get_dummies(king['educlv'])
king = pd.concat([king,educlv_variables],axis = 1)
king = king.drop(columns=['educlv'])

## 5) Make birth weight into a three levels variable 
##    based on percentiles and drop bwt
king['bwt_class'] = split_into_groups(king['bwt'],'percentile')
king = king.drop(columns=['bwt','low'])


In [146]:
##-------------------------------------
## Step 4) Split Data into Training and 
## Test data frames
##-------------------------------------
## Split DF into Features and Labels
response_variable = 'bwt_class'
y = king[response_variable]
X = king.drop(columns=[response_variable])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=44, 
                                                    test_size = 0.26)

In [147]:
##-------------------------------------
## Step 5) Implement SVM and view metrics
##-------------------------------------

## SVM specifications
clf_svm = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape='ovr', degree=2, gamma='auto', kernel='rbf',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.001, verbose=False)
## Run SVM algorithm
clf_svm=  clf_svm.fit(X_train, y_train)
## Predict y_est_svm
y_est_svm = clf_svm.predict(X_test) 
## Determine Confusion matrix
cf_matrix_svm = confusion_matrix(y_est_svm,y_test)
## Display Metrics
provide_results(cf_matrix_svm,'SVM')


 SVM Accuracy :  0.65 
 SVM Recall:  0.1 
 SVM Precision:  0.57 
 SVM F1:  0.17


In [79]:
##-------------------------------------
## Step 6) Visualize SVM
##-------------------------------------

In [148]:
##-------------------------------------
## Step 7) Implement CART and view metrics
##-------------------------------------
## CART specifications
clf = tree.DecisionTreeClassifier(max_depth=2)
## Run CART algorithm
clf_cart = clf.fit(X_train, y_train) 
## Predict y_est_cart
y_est_cart = clf_cart.predict(X_test)
## Determine Confusion matrix
cf_matrix_cart = confusion_matrix(y_est_cart,y_test)
## Display Metrics
provide_results(cf_matrix_cart,'CART')




 CART Accuracy :  0.75 
 CART Recall:  0.32 
 CART Precision:  0.87 
 CART F1:  0.47


In [149]:
##-------------------------------------
## Step 8) Visualize CART
##-------------------------------------
dot_data = tree.export_graphviz(clf_cart, out_file=None, 
                          feature_names=X_train.columns,  
                          class_names=['Light','Medium','Heavy'],  
                          filled=True, rounded=True,  
                          special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('tree_length2')


'tree_length2.pdf'

## Ploting SVM
http://scikit-learn.org/stable/auto_examples/svm/plot_svm_margin.html
http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html
## Data
http://courses.washington.edu/b517/Datasets/datasets.html

## Make better choices with Decision Trees
http://scikit-learn.org/stable/modules/tree.html#classification-criteria

## Data Reduction
http://scikit-learn.org/stable/modules/decomposition.html#ica

## View Decision Trees better
http://scikit-learn.org/stable/auto_examples/tree/plot_iris.html#sphx-glr-auto-examples-tree-plot-iris-py