In [1]:
import time
import scipy

import numpy as np 
import pandas as pd

from sklearn import svm 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [2]:
# Load the df_working pickle object for use in a Random Forest
df = pd.read_pickle("df_one_hot.pkl")

In [4]:
# Define the model features
X = df.iloc[:,20:]

# Define the target variable 
y = df["income"]

# Instantiate a random forest model 
svc  = svm.SVC()

# Set a timer to see how long the model takes to run
start = time.time()

# Run the model and score
cross_val = cross_val_score(svc, X, y, cv=5)

# Determine the amount of time it takes to  run the model and score
runtime = f"{(time.time() - start):0.2f}"

# Print the cross validation results
print(cross_val)

# Print the run time
print(f"\nModel run time: {runtime} secs.")

[0.82972517 0.82862408 0.83584152 0.84121622 0.83660934]

Model run time: 580.06 secs.


In [9]:
# Support Vector Machine Classifer train and test
def rf_train_and_test(X,y,splits):
    # Instantiate the model and store in a variable 
    svc = svm.SVC()

    # Set the number of folds (training and test samples)
    kf = KFold(n_splits=splits)

    # Create a parent list to hold test and train scores
    score_lists = list()

    # Train and test using KFolds
    for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
        # Create the train samples for the fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]

        # Create the test samples for the fold
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Locate the appropriate rows in the target series
        train_ref = y.iloc[train_index]

        # Set a timer to see how long fitting took
        start = time.time()
        
        # Train the model using the train and test data
        y_pred_train = svc.fit(X_train,y_train).predict(X_train)

        # Check to see how long the fitting took
        fit_time = f"{(time.time() - start):0.2f}"

        # Convert the confusion matrix to a list for reporting output
        scores = list(metrics.confusion_matrix(y_train, y_pred_train).ravel()) 

        # Add the fold iteration to the scores list
        scores.insert(0,i)

        # Score the folds accuracy
        accuracy = metrics.accuracy_score(y_train, y_pred_train)

        # Append the accuracy to the scores list
        scores.append(accuracy)

        # Append the fit time to the scores list
        scores.append(fit_time)

        # Append the scores to the score_lists parent
        score_lists.append(scores)

        # Precision, recall and f1-score
        precision = pd.DataFrame.from_dict(
            metrics.classification_report(y_train, y_pred_train,output_dict=True)
        )

    # Create a dataframe of training and test results
    df_out = pd.DataFrame(score_lists, columns=["fold", "true_negative", "false_positive", "false_negative", "true_positive", "accuracy", "fit_time (secs.)"])

    # List of output objects
    resluts_output = [df_out,precision]

    return resluts_output

# Model features
X = df.iloc[:, 20:]

# Model target
y = df["income"]

# Define the number groups the data will be split into
splits = 5

# Get the report dataframes from the function return
rf_results, rf_precision = rf_train_and_test(X,y,splits)

# Show the results dataframe
display(rf_results)

# Show the second results dataframe
display(rf_precision)

Unnamed: 0,fold,true_negative,false_positive,false_negative,true_positive,accuracy,fit_time (secs.)
0,1,18575,1204,2857,3412,0.844096,180.7
1,2,18460,1290,2761,3538,0.844485,153.32
2,3,18562,1195,2867,3425,0.844063,144.16
3,4,18564,1222,2918,3345,0.841069,144.73
4,5,18605,1203,2881,3360,0.843219,138.35


Unnamed: 0,<=50K,>50K,accuracy,macro avg,weighted avg
precision,0.865913,0.736358,0.843219,0.801135,0.834873
recall,0.939267,0.538375,0.843219,0.738821,0.843219
f1-score,0.901099,0.621992,0.843219,0.761546,0.834229
support,19808.0,6241.0,0.843219,26049.0,26049.0
