In [40]:
include("compute_ph2.jl")
using LIBSVM
using RDatasets
using Printf
using Statistics
using Random
using .ext
using Plots
using GLM
using StatsBase
using ScikitLearn
using JLD2
@sk_import model_selection: train_test_split
# Directory the notebook was started from; the data folders are resolved against it.
workdir = pwd()

# Fractions of the data used for training, validation and testing (they sum to 1).
train_ratio, validation_ratio, test_ratio = 0.6, 0.2, 0.2

└ @ ScikitLearn.Skcore C:\Users\jaydh\.julia\packages\ScikitLearn\ssekP\src\Skcore.jl:179


0.2

In [41]:
"""
    random_undersample(df)

Return a shuffled copy of `df` in which rows of the larger class in the binary
`labels` column (values `0` and `1`) have been randomly removed.

Each majority-class row is dropped independently with probability
`(n_major - n_minor) / n_major`, so the classes are balanced in expectation
only, not exactly. When both classes already have the same size the shuffled
copy is returned unchanged. `df` itself is never modified.

The global RNG is reseeded with `Random.seed!(1)` on every call, which makes
the result reproducible but also resets the random stream for the caller.

# Throws
- `ArgumentError` if `labels` contains no `0` or no `1`.
"""
function random_undersample(df)
    counts = countmap(df.labels)
    # `get` avoids the opaque KeyError that plain indexing raises for an absent class.
    class_0 = get(counts, 0, 0)
    class_1 = get(counts, 1, 0)
    if class_0 == 0 || class_1 == 0
        throw(ArgumentError(
            "labels must contain both classes 0 and 1 (got $class_0 zeros and $class_1 ones)",
        ))
    end

    Random.seed!(1)
    df = df[shuffle(1:nrow(df)), :]   # row indexing copies, the caller's frame is untouched

    class_0 == class_1 && return df

    majority, n_major, n_minor =
        class_0 > class_1 ? (0, class_0, class_1) : (1, class_1, class_0)
    prob_to_delete = (n_major - n_minor) / n_major

    # `&&` short-circuits, so a random number is drawn only for majority-class
    # rows and in row order.
    keep = Bool[!(label == majority && rand() < prob_to_delete) for label in df.labels]
    return df[keep, :]
end

"""
    parameter_optimisation(Xtrain, ytrain, Xval, yval[, costs[, gammas]])

Grid-search the SVM `cost` and `gamma` parameters and return the best pair as
the tuple `(C, G)`.

For every `G` in `gammas` (outer loop) and every `C` in `costs` (inner loop) a
model is trained with `svmtrain` on `Xtrain'`/`ytrain` and scored by its
percentage accuracy on `Xval'`/`yval`. The first combination that reaches the
highest accuracy wins; later ties do not replace it.

# Arguments
- `Xtrain`, `Xval`: feature matrices; they are transposed before being handed
  to LIBSVM (presumably rows are samples here — confirm against the caller).
- `ytrain`, `yval`: label vectors matching the rows of `Xtrain` / `Xval`.
- `costs`: candidate cost values; defaults to seven powers of ten from `1e-4` to `1e2`.
- `gammas`: candidate gamma values; defaults to `costs`.

# Throws
- `ArgumentError` if `costs` or `gammas` is empty.
"""
function parameter_optimisation(
    Xtrain, ytrain, Xval, yval,
    costs=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    gammas=costs,
)
    if isempty(costs) || isempty(gammas)
        throw(ArgumentError("costs and gammas must each contain at least one value"))
    end

    best_C, best_G = first(costs), first(gammas)
    # Starting below any attainable accuracy guarantees that a pair is always
    # selected, even if every model scores 0 % on the validation set.
    best_acc = -Inf
    for G in gammas, C in costs
        model = svmtrain(Xtrain', ytrain; gamma=G, cost=C)
        ypredict, _ = svmpredict(model, Xval')
        acc = mean(ypredict .== yval) * 100
        if acc > best_acc
            best_acc = acc
            best_C, best_G = C, G
        end
    end
    return (best_C, best_G)
end

parameter_optimisation (generic function with 2 methods)

In [28]:
# Hyperparameter grid: seven powers of ten from 1e-4 to 1e2.
# `costs` is bound to the very same array as `gammas` (an alias, not a copy).
costs = gammas = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0];

VR H0 Tumour Cells

In [81]:
# Load the VR H0 tumour-cell persistence images into a dataframe with one row
# per file: id, time, binary label, then one column per image entry (20 here).
df_vr_tumour_h0 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:20
    df_vr_tumour_h0[!, string("feature_", i)] = Float64[]
end

# `joinpath` picks the platform's separator; the trailing "" keeps the
# separator at the end of the directory string.
location_tumour_h0 = joinpath("persistence_images", "vr", "tumour_h0", "")
files_vr_tumour_h0 = readdir(joinpath(workdir, location_tumour_h0))
for f in files_vr_tumour_h0
    id_str, time_str = parse_file(f)
    id = parse(Int64, id_str)
    time = parse(Int64, time_str)
    # Relative path: relies on the current directory still being `workdir`.
    image = load_object(joinpath(location_tumour_h0, f))
    label = getBinaryLabel(id, time)
    row = Any[id, time, label]
    for x in image
        append!(row, x)
    end
    push!(df_vr_tumour_h0, row)
end

# Rescale every feature column to [0,1].
# NOTE(review): the transform is fitted on all rows before the split below, so
# validation and test rows influence the scaling — consider fitting on the
# training rows only.
X = Matrix(df_vr_tumour_h0[:, 4:end])
dt = fit(UnitRangeTransform, X; dims=1, unit=true)
Xn = StatsBase.transform(dt, X)
for i in axes(Xn, 1)
    df_vr_tumour_h0[i, :][4:end] = Xn[i, :]
end

# randomly undersample imbalanced data
df_vr_tumour_h0 = random_undersample(df_vr_tumour_h0)

# Randomly sample training, validation and test sets: first hold out the
# non-training share, then divide that hold-out between validation and test.
df_vr_tumour_h0 = df_vr_tumour_h0[shuffle(1:nrow(df_vr_tumour_h0)), :];
X = Matrix(df_vr_tumour_h0[:, 4:end]);
y = df_vr_tumour_h0.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=1-train_ratio);
Xval, Xtest, yval, ytest = train_test_split(Xtest, ytest, test_size=(test_ratio)/(test_ratio+validation_ratio));

# Optimise hyperparameters on the validation set. The search grids are passed
# explicitly, as the six-argument definition of `parameter_optimisation` requires.
C, G = parameter_optimisation(Xtrain, ytrain, Xval, yval, costs, gammas)

# report final accuracy on test set
model = svmtrain(Xtrain', ytrain; gamma=G, cost=C)
ypredict, decision_values = svmpredict(model, Xtest');
acc = mean(ypredict .== ytest) * 100;
println("accuracy: ", acc)
println("Cost: ", C, " Gamma: ", G)


accuracy: 79.43925233644859
Cost: 100.0 Gamma: 100.0


VR H1 Tumour Cells

In [82]:
# Load the VR H1 tumour-cell persistence images into a dataframe with one row
# per file: id, time, binary label, then one column per image entry (400 here).
df_vr_tumour_h1 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    df_vr_tumour_h1[!, string("feature_", i)] = Float64[]
end

# `joinpath` picks the platform's separator; the trailing "" keeps the
# separator at the end of the directory string.
location_tumour_h1 = joinpath("persistence_images", "vr", "tumour_h1", "")
files_vr_tumour_h1 = readdir(joinpath(workdir, location_tumour_h1))
for f in files_vr_tumour_h1
    id_str, time_str = parse_file(f)
    id = parse(Int64, id_str)
    time = parse(Int64, time_str)
    # Relative path: relies on the current directory still being `workdir`.
    image = load_object(joinpath(location_tumour_h1, f))
    label = getBinaryLabel(id, time)
    row = Any[id, time, label]
    for x in image
        append!(row, x)
    end
    push!(df_vr_tumour_h1, row)
end

# Rescale every feature column to [0,1].
# NOTE(review): the transform is fitted on all rows before the split below, so
# validation and test rows influence the scaling — consider fitting on the
# training rows only.
X = Matrix(df_vr_tumour_h1[:, 4:end])
dt = fit(UnitRangeTransform, X; dims=1, unit=true)
Xn = StatsBase.transform(dt, X)
for i in axes(Xn, 1)
    df_vr_tumour_h1[i, :][4:end] = Xn[i, :]
end

# randomly undersample imbalanced data
df_vr_tumour_h1 = random_undersample(df_vr_tumour_h1)

# Randomly sample training, validation and test sets: first hold out the
# non-training share, then divide that hold-out between validation and test.
df_vr_tumour_h1 = df_vr_tumour_h1[shuffle(1:nrow(df_vr_tumour_h1)), :];
X = Matrix(df_vr_tumour_h1[:, 4:end]);
y = df_vr_tumour_h1.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=1-train_ratio);
Xval, Xtest, yval, ytest = train_test_split(Xtest, ytest, test_size=(test_ratio)/(test_ratio+validation_ratio));

# Optimise hyperparameters on the validation set. The search grids are passed
# explicitly, as the six-argument definition of `parameter_optimisation` requires.
C, G = parameter_optimisation(Xtrain, ytrain, Xval, yval, costs, gammas)

# report final accuracy on test set
model = svmtrain(Xtrain', ytrain; gamma=G, cost=C)
ypredict, decision_values = svmpredict(model, Xtest');
acc = mean(ypredict .== ytest) * 100;
println("accuracy: ", acc)
println("Cost: ", C, " Gamma: ", G)


accuracy: 84.11214953271028
Cost: 10.0 Gamma: 1.0
