In [2]:
using LIBSVM
using RDatasets
using Printf
using Statistics
using Random
using Plots
using GLM
using StatsBase
using ScikitLearn
using ScikitLearn.GridSearch: RandomizedSearchCV
using JLD2
using PyCall
using CSV
stats = pyimport("scipy.stats")
imblearn = pyimport("imblearn")
@sk_import model_selection: train_test_split

@sk_import svm: SVC


[33m[1m└ [22m[39m[90m@ ScikitLearn.Skcore C:\Users\jaydh\.julia\packages\ScikitLearn\Wvn7B\src\Skcore.jl:240[39m


PyObject <class 'sklearn.svm._classes.SVC'>

In [3]:
"""
    train_svm(df)

Tune an `SVC` classifier on `df` with a randomized hyperparameter search and score the
refitted best model on a held-out split.

Columns 4 onward of `df` form the feature matrix and `df.labels` supplies the targets.
30% of the rows are held out as the test split. `C` and `gamma` are each sampled from a
log-uniform distribution over `(1e-6, 1000.0)` for 100 search iterations. SMOTE
oversampling is placed inside the pipeline so that it is applied as part of fitting
rather than to the data as a whole.

# Returns
A three-element vector `[score, C, gamma]`: the value of `ScikitLearn.score` on the test
split, followed by the selected `C` and `gamma`.

# Throws
`ErrorException` if the search result lacks either tuned hyperparameter.
"""
function train_svm(df)
    X = Matrix(df[:, 4:end])
    y = df.labels
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3) # train-test split

    # samples from range (a, b) according to a log-uniform distribution
    sampler(a, b) = stats.loguniform(a, b)

    # parameters to tune and the distributions to sample them from
    param_dist = Dict(
        "svc__C" => sampler(1e-6, 1000.0),
        "svc__gamma" => sampler(1e-6, 1000.0),
    )

    # SMOTE oversampler followed by the SVM classifier; keeping both in one pipeline
    # means oversampling only touches the data the pipeline is fitted on during CV
    sm = imblearn.over_sampling.SMOTE(random_state=42)
    clf = SVC()
    svm_pipeline = imblearn.pipeline.make_pipeline(sm, clf)

    # hyperparameter tuning via random search, refit to best model
    n_iter_search = 100
    random_search = RandomizedSearchCV(svm_pipeline,
                                       param_dist,
                                       n_iter=n_iter_search,
                                       random_state=MersenneTwister(41),
                                       refit=true)

    ScikitLearn.fit!(random_search, Xtrain, ytrain)

    # get optimal hyperparameters; fail loudly instead of returning a placeholder string
    # that would only blow up later when handed to `SVC`
    best = random_search.best_params_
    C = get(best, :svc__C, nothing)
    G = get(best, :svc__gamma, nothing)
    if C === nothing || G === nothing
        error("random search did not report both svc__C and svc__gamma; got: $(best)")
    end

    score = ScikitLearn.score(random_search, Xtest, ytest)

    return [score, C, G]
end

"""
    repeat_svm(df, c, g, n_repeats)

Fit and evaluate an `SVC(C=c, gamma=g)` pipeline `n_repeats` times on freshly shuffled
train/test splits of `df`.

On every repeat the rows of `df` are shuffled, columns 4 onward are taken as features and
`df.labels` as targets, 30% of the rows are held out, and a SMOTE + SVC pipeline is
fitted on the remainder.

# Returns
A `Vector{Float64}` of length `n_repeats` holding the test accuracy of each repeat as a
percentage (0 to 100).
"""
function repeat_svm(df, c, g, n_repeats)
    accuracies = Float64[]
    for _ in 1:n_repeats
        # Shuffle into a separate local so the caller-supplied argument is not rebound.
        shuffled = df[shuffle(1:nrow(df)), :]
        X = Matrix(shuffled[:, 4:end])
        y = shuffled.labels
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3) # train-test split

        clf = SVC(C=c, gamma=g)
        sm = imblearn.over_sampling.SMOTE(random_state=41)
        svm_pipeline = imblearn.pipeline.make_pipeline(sm, clf)

        svm_pipeline.fit(Xtrain, ytrain)
        ypredict = svm_pipeline.predict(Xtest)

        # Enable debug logging to check that both classes are being predicted; the
        # arguments are only evaluated when the message is actually emitted.
        @debug "class counts" predicted = countmap(ypredict) actual = countmap(ytest)

        # Fraction of matching predictions, expressed as a percentage.
        push!(accuracies, mean(ypredict .== ytest) * 100)
    end
    return accuracies
end

repeat_svm (generic function with 1 method)

In [4]:
"""
    populatePHdata(imagepath, size; nrows=972, metadata=simulation_df)

Assemble a `DataFrame` with one row per entry of `metadata`, combining its id, time and
label with the flattened contents of the matching saved image.

# Arguments
- `imagepath`: prefix prepended (by plain string concatenation) to each image file name,
  so it must already end with a path separator.
- `size`: number of `feature_<i>` columns to create; each loaded image must supply
  exactly this many values or `push!` will reject the row.

# Keywords
- `nrows`: number of leading rows of `metadata` to process. The default of 972 keeps the
  row count that used to be hard-coded.
- `metadata`: table with `"id"`, `"time"` and `"label"` columns. Defaults to the global
  `simulation_df`, looked up when the function is called.

# Returns
A `DataFrame` with columns `ids`, `times`, `labels`, `feature_1`, ..., `feature_<size>`.
"""
function populatePHdata(imagepath, size; nrows=972, metadata=simulation_df)
    df = DataFrame(ids=Int64[], times=Int64[], labels=[])
    for i in 1:size
        df[!, string("feature_", i)] = Float64[]
    end

    for i in 1:nrows
        id = metadata[i, "id"]
        t = metadata[i, "time"]
        label = metadata[i, "label"]

        # Image files are named "<id>_<time>_img.jld2".
        imagename = string(id, "_", t, "_img.jld2")
        V = load_object(imagepath * imagename)

        # Build the row from a literal: `append!` iterates its argument, which would
        # split a string-valued field into characters.
        row = Any[id, t, label]
        for x in V
            append!(row, x)  # flattens each element of the image into the row
        end
        push!(df, row)
    end
    return df
end

populatePHdata (generic function with 1 method)

In [5]:
"""
    SVM_accuracies(df)

Rescale the feature columns of `df` (columns 4 onward), tune an SVM on the result, and
return the accuracies from ten repeated evaluations with the tuned hyperparameters.

The features are rescaled with a `UnitRangeTransform` fitted with `dims=1, unit=true`,
and the rescaled values are written back into `df`, so the caller's table is modified.
"""
function SVM_accuracies(df)
    features = Matrix(df[:, 4:end])

    scaler = fit(UnitRangeTransform, features; dims=1, unit=true)
    scaled = StatsBase.transform(scaler, features)

    # Overwrite the feature columns of `df` row by row with the rescaled values.
    for r in 1:size(features, 1)
        df[r, :][4:end] = scaled[r, :]
    end

    # The test score returned by `train_svm` is not needed here, only C and gamma.
    _, C, G = train_svm(df)
    return repeat_svm(df, C, G, 10)
end

SVM_accuracies (generic function with 1 method)

In [6]:
# Table of simulations read from labels.csv. `populatePHdata` reads this global and uses
# its "id", "time" and "label" columns.
simulation_df = CSV.read("labels.csv", DataFrame);


In [7]:
# Build the feature table from the images under persistence_images\vr\tumour_h0\
# (20 feature columns), then run SVM_accuracies on it. SVM_accuracies rescales the
# feature columns of the table in place.
df_vr_tumour_h0 = populatePHdata("persistence_images\\vr\\tumour_h0\\",20);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_vr_tumour_h0);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("vr_tumour_h0_accuracies.csv", DataFrame(vector=accuracies));


In [8]:
# Build the feature table from the images under persistence_images\vr\tumour_h1\
# (400 feature columns), then run SVM_accuracies on it.
df_vr_tumour_h1 = populatePHdata("persistence_images\\vr\\tumour_h1\\",400);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_vr_tumour_h1);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("vr_tumour_h1_accuracies.csv", DataFrame(vector=accuracies));


In [9]:
# Build the feature table from the images under persistence_images\vr\macrophages_h0\
# (20 feature columns), then run SVM_accuracies on it.
df_vr_macrophage_h0 = populatePHdata("persistence_images\\vr\\macrophages_h0\\",20);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_vr_macrophage_h0);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("vr_macrophage_h0_accuracies.csv", DataFrame(vector=accuracies));


In [10]:
# Build the feature table from the images under persistence_images\vr\macrophages_h1\
# (400 feature columns), then run SVM_accuracies on it.
df_vr_macrophage_h1 = populatePHdata("persistence_images\\vr\\macrophages_h1\\",400);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_vr_macrophage_h1);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("vr_macrophage_h1_accuracies.csv", DataFrame(vector=accuracies));


In [11]:
# Build the feature table from the images under
# persistence_images\dowker\macrophage_tumour_h0\ (400 feature columns), then run
# SVM_accuracies on it.
df_dowker_mt_h0 = populatePHdata("persistence_images\\dowker\\macrophage_tumour_h0\\",400);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_dowker_mt_h0);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("dowker_mt_h0_accuracies.csv", DataFrame(vector=accuracies));


In [12]:
# Build the feature table from the images under
# persistence_images\dowker\macrophage_tumour_h1\ (400 feature columns), then run
# SVM_accuracies on it.
df_dowker_mt_h1 = populatePHdata("persistence_images\\dowker\\macrophage_tumour_h1\\",400);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_dowker_mt_h1);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("dowker_mt_h1_accuracies.csv", DataFrame(vector=accuracies));


In [13]:
# Build the feature table from the images under
# persistence_images\dowker\macrophage_vessel_h0\ (400 feature columns), then run
# SVM_accuracies on it.
df_dowker_mv_h0 = populatePHdata("persistence_images\\dowker\\macrophage_vessel_h0\\",400);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_dowker_mv_h0);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("dowker_mv_h0_accuracies.csv", DataFrame(vector=accuracies));


In [14]:
# Build the feature table from the images under
# persistence_images\dowker\macrophage_vessel_h1\ (400 feature columns), then run
# SVM_accuracies on it.
df_dowker_mv_h1 = populatePHdata("persistence_images\\dowker\\macrophage_vessel_h1\\",400);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_dowker_mv_h1);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("dowker_mv_h1_accuracies.csv", DataFrame(vector=accuracies));


In [15]:
# Build the feature table from the images under
# persistence_images\dowker\tumour_vessel_h0\ (400 feature columns), then run
# SVM_accuracies on it.
df_dowker_tv_h0 = populatePHdata("persistence_images\\dowker\\tumour_vessel_h0\\",400);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_dowker_tv_h0);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("dowker_tv_h0_accuracies.csv", DataFrame(vector=accuracies));


In [16]:
# Build the feature table from the images under
# persistence_images\dowker\tumour_vessel_h1\ (400 feature columns), then run
# SVM_accuracies on it.
df_dowker_tv_h1 = populatePHdata("persistence_images\\dowker\\tumour_vessel_h1\\",400);

# `accuracies` is reassigned by every dataset cell, so it only holds the latest result.
accuracies = SVM_accuracies(df_dowker_tv_h1);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("dowker_tv_h1_accuracies.csv", DataFrame(vector=accuracies));


In [17]:
# Feature table read directly from CSV rather than built from saved images; columns 4
# onward are treated as features by SVM_accuracies.
non_top_df = CSV.read("non_toological_data.csv", DataFrame);

# SVM_accuracies writes rescaled floating-point values back into the feature columns, so
# integer-typed columns are promoted to Float64 first. The check uses the column's
# element type instead of its first value, so it also works on an empty table and covers
# every integer type, not only Int64.
for col in names(non_top_df)[4:end]
    if eltype(non_top_df[!, col]) <: Integer
        non_top_df[!, col] = Float64.(non_top_df[!, col])
    end
end

accuracies = SVM_accuracies(non_top_df);

# Uncomment to save the accuracies to a CSV file.
#CSV.write("simple_accuracy.csv", DataFrame(vector=accuracies));