In [2]:
using LIBSVM
using RDatasets
using Printf
using Statistics
using Random
using Plots
using GLM
using StatsBase
using ScikitLearn
using ScikitLearn.GridSearch: RandomizedSearchCV
using JLD2
using PyCall
using CSV
stats = pyimport("scipy.stats")
imblearn = pyimport("imblearn")
@sk_import model_selection: train_test_split

@sk_import svm: SVC


[33m[1m└ [22m[39m[90m@ ScikitLearn.Skcore C:\Users\jaydh\.julia\packages\ScikitLearn\Wvn7B\src\Skcore.jl:240[39m


PyObject <class 'sklearn.svm._classes.SVC'>

In [3]:
function train_svm(df)
    X = Matrix(df[:,4:end]);
    y = df.labels
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split

    sampler(a, b) = stats.loguniform(a, b) # samples from range (a,b) according to log uniform distribution 

    # specify parameters and distributions to sample from
    param_dist = Dict("svc__C" => sampler(1e-6, 1000.0),
    "svc__gamma" => sampler(1e-6, 1000.0))

    # use SMOTE object to randomly oversample training set
    sm = imblearn.over_sampling.SMOTE(random_state=42)

    # SVM classifier
    clf = SVC()

    # add oversampling and SVM to pipeline to only oversample training set during CV
    svm_pipeline =  imblearn.pipeline.make_pipeline(sm,clf)

    # hyperparameter tuning via random search, refit to best model
    n_iter_search = 500
    random_search = RandomizedSearchCV(svm_pipeline,
                                       param_dist,
                                       n_iter=n_iter_search,
                                       random_state=MersenneTwister(41),
                                       refit=true)

    
    ScikitLearn.fit!(random_search, Xtrain, ytrain)

    # get optimal hyperparameters
    best = random_search.best_params_
    C = get(best, :svc__C, "Error")
    G = get(best, :svc__gamma, "Error")

    score = ScikitLearn.score(random_search, Xtest, ytest)

    return [score,C,G]

end

function repeat_svm(df,c,g,n_repeats)
    accuracies=[]
    for i=1:n_repeats
        df=df[shuffle(1:nrow(df)),:];
        X = Matrix(df[:,4:end]);
        y = df.labels
        Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
        clf = SVC(C=c,gamma=g)
        sm = imblearn.over_sampling.SMOTE(random_state=41)

        svm_pipeline =  imblearn.pipeline.make_pipeline(sm,clf)

        svm_pipeline.fit(Xtrain,ytrain)

        ypredict = svm_pipeline.predict(Xtest)

        acc = mean(ypredict .== ytest) * 100

        # uncomment to check that both classes are being predicted
        #println("\nypredict:", countmap(ypredict))
        #println("ytest:", countmap(ytest),"\n")

        append!(accuracies,acc)
    end
    return accuracies
end
function SVM_accuracies(df)
    X = Matrix(df[:,4:end])

    # unit normalise 

    dt = fit(UnitRangeTransform, X; dims=1, unit=true)

    Xn = StatsBase.transform(dt,X)
    for i in 1:size(X)[1]
        df[i,:][4:end] = Xn[i,:]
    end

    # train SVM

    a, C, G = train_svm(df)
    accuracies = repeat_svm(df,C,G,10)
    return accuracies
end

SVM_accuracies (generic function with 1 method)

In [4]:
df_dowker = CSV.read("dowker_combined.csv");
df_VR = CSV.read("vr_combined.csv");

[33m[1m│ [22m[39m  caller = read(source::String) at CSV.jl:46
[33m[1m└ [22m[39m[90m@ CSV C:\Users\jaydh\.julia\packages\CSV\UIgP3\src\CSV.jl:46[39m


In [7]:
accuracies = SVM_accuracies(df_dowker);
#CSV.write("dowker_accuracy.csv",DataFrame(vector=accuracies));

In [6]:
accuracies = SVM_accuracies(df_VR);
#CSV.write("vr_accuracy.csv",DataFrame(vector=accuracies));