In [82]:
##-------------------------------------
## Step 0) Load Libraries
##-------------------------------------
## Array Manipulation
import numpy as np                       
## Data Frame Manipulation
import pandas as pd                      
## Change Directory in Python
import os                                
## To Split into test and train
from sklearn.model_selection import train_test_split 
## SVM
from sklearn.svm import SVC 
## Use Confusion Matrix
from sklearn.metrics import confusion_matrix 

In [83]:
##-------------------------------------
## Step 1) Load Data
##-------------------------------------
default_path = 'C:/Users/james/OneDrive/Documents/Important_Files/Favorite_5_ML algorithms'
os.chdir(default_path)
king = pd.read_csv('king.csv', index_col='id')

In [84]:
##-------------------------------------
## Step 2) Curate Data
##-------------------------------------
## 1) Make Yes/No features into 1/0 features
king['low'] = king['low'].map(dict(Yes=1, No=0))
king['firstep'] = king['firstep'].map(dict(Yes=1, No=0))
king['smoker'] = king['smoker'].map(dict(Y=1, N=0))
king['drinker'] = king['drinker'].map(dict(Y=1, N=0))

## 2) Make Gender into 1:Female and 0:Male
king['gender'] = king['gender'].map(dict(F=1, M=0))

## 3) Make Race into dummy variables and drop race feature
race_variables = pd.get_dummies(king['race'])
king = pd.concat([king,race_variables],axis = 1)
king = king.drop(columns=['race'])

## 4) Make Educational Level into dummy 
##    variables as well and drop educlv
educlv_variables = pd.get_dummies(king['educlv'])
king = pd.concat([king,educlv_variables],axis = 1)
king = king.drop(columns=['educlv'])

In [97]:
king.head()

Unnamed: 0_level_0,bwt,low,firstep,gender,age,parity,smoker,drinker,wpre,wgain,educyr,gest,asian,black,hispanic,other,white,High,Low,Middle
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,3118,0,1,1,31,4,0,0,122,22,5,40,0,0,1,0,0,0,1,0
2,3466,0,1,0,23,0,0,0,160,50,12,39,0,0,0,0,1,0,0,1
3,3147,0,1,1,24,1,0,0,150,38,13,40,0,1,0,0,0,0,0,1
4,3969,0,1,0,26,2,0,0,175,15,12,39,0,0,1,0,0,0,0,1
5,3005,0,1,0,34,0,0,0,123,17,17,40,0,0,0,0,1,1,0,0


In [94]:
##-------------------------------------
## Step 3) Implement SVM
##-------------------------------------
## Function to evaluate predictions
def imp_metrics(TP,TN,FP,FN):
    """
   Function Name: imp_metrics
   Input: TP,FN,TP,FP
   Output: Accuracy, Recal, Precision, F1
   Purpose: Find metrics to understand recovery of SVM
    
    """
    ## Calculate Accuracy
    accuracy = (TP + TN) / (TP + TN + FN + FP)
    
    ## Calculate Recall
    if(TP != 0 and FN != 0):
        recall = TP/(TP+FN)
    else:
        recall = np.nan

    ## Calculate Precision
    if(TP != 0 and FP != 0):
        precision = TP/(TP+FP)
    else:
        precision =np.nan

    ## Calculate F1
    if(precision == np.nan and recall == np.nan):
        F1 = np.nan
    else:
        F1 = 2*((precision*recall)/(precision+recall))
        
    ## Concatenate    
    metrics = np.array([accuracy,recall,precision,F1])
    return(metrics.round(2))



## 1) Split data into train/test and feature/label
## Split DF into Features and Labels
y = king['gender'] 
X = king.drop(columns=['gender'])
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=43, 
                                                    test_size = 0.26)

## 2) Implement SVM Algorithm (Expound upon these)
clf_svm = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape='ovr', degree=2, gamma='auto', kernel='rbf',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.001, verbose=False)
## Fit the model on the training data and predict y_est_svm
y_est_svm =  clf_svm.fit(X_train, y_train).predict(X_test) 

## Determine Confusion matrix
cf_matrix = confusion_matrix(y_est_svm,y_test)

## Find True Positive & Negative and False Positive & Negative
TP = cf_matrix[0,0]
TN = cf_matrix[1,1]
FP = cf_matrix[0,1]
FN = cf_matrix[1,0] 

## Calculate Accuracy, Recal, Precision, F1
metrics = imp_metrics(TP,TN,FP,FN)
print('\n','accuracy: ',   metrics[0],'\n',
      'recall: ',     metrics[1],'\n',
      'precision: ',  metrics[2],'\n',
      'F1: ',         metrics[3])




 accuracy:  0.49 
 recall:  0.93 
 precision:  0.49 
 F1:  0.65


## Ploting SVM
http://scikit-learn.org/stable/auto_examples/svm/plot_svm_margin.html
http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html
## Data
http://courses.washington.edu/b517/Datasets/datasets.html