In [98]:
include("compute_ph2.jl")
using LIBSVM
using RDatasets
using Printf
using Statistics
using Random
using .ext
using Plots
using GLM
using StatsBase
using ScikitLearn
using ScikitLearn.GridSearch: RandomizedSearchCV
using JLD2
using PyCall
stats = pyimport("scipy.stats")
@sk_import model_selection: train_test_split
@sk_import svm: SVC
workdir = pwd()

train_ratio = 0.6
validation_ratio = 0.2
test_ratio = 0.2

└ @ ScikitLearn.Skcore C:\Users\jaydh\.julia\packages\ScikitLearn\ssekP\src\Skcore.jl:179


0.2

In [104]:
function random_oversample(df)
    class_0 = countmap(df.labels)[0];
    class_1 = countmap(df.labels)[1];
    Random.seed!(1);
    df=df[shuffle(1:nrow(df)),:]; 
    # EMPTY COPY OF df
    duplicated_rows = copy(df)
    delete!(duplicated_rows,1:length(df.labels))
    if class_0 > class_1
        # class_0 larger -> duplicate class_1
        # need to add class_0-class_1 duplicates -> duplicate a given class 1 row with probability class_0-class_1/class_1
        prob_to_add = (class_0-class_1)/(class_1);
        for row in eachrow(df)
            if row.labels == 1
                if rand() < prob_to_add
                    append!(duplicated_rows,DataFrame(row))
                end
            end
        end
        append!(df,duplicated_rows)
    elseif class_1 > class_0 
        prob_to_add = (class_1-class_0)/(class_0);
        for row in eachrow(df)
            if row.labels == 0
                if rand() < prob_to_add
                    append!(duplicated_rows,DataFrame(row))
                end
            end
        end
        append!(df,duplicated_rows)
    end
    return df
end

random_oversample (generic function with 1 method)

In [105]:
function random_undersample(df)
    class_0 = countmap(df.labels)[0];
    class_1 = countmap(df.labels)[1];

    Random.seed!(1);
    df=df[shuffle(1:nrow(df)),:];   

    if class_0 > class_1 
        # randomly undersample class 0

        prob_to_delete = (class_0-class_1)/(class_0);
        rows_to_delete = [];
        for row in 1:length(df.labels)
            if df[row,:labels] == 0
                if rand() < prob_to_delete
                    append!(rows_to_delete,row);
                end
            end
        end
        delete!(df,rows_to_delete);
    elseif class_1 > class_0 
        # randomly undersample class 1

        prob_to_delete = (class_1-class_0)/(class_1);
        rows_to_delete = [];
        for row in 1:length(df.labels)
            if df[row,:labels] == 1
                if rand() < prob_to_delete
                    append!(rows_to_delete,row);
                end
            end
        end
        delete!(df,rows_to_delete);
    end
    return df
end

function parameter_optimisation(Xtrain,ytrain,Xval,yval, costs, gammas)
    Cs = [] 
    Gs = []
    accs = zeros(7,7)
    maxi = 0
    for i=1:1:7
        G = gammas[i]
        for j=1:1:7
            C = costs[j]
            model = svmtrain(Xtrain', ytrain ; gamma=G , cost=C)
            ypredict, decision_values = svmpredict(model, Xval');
            acc = mean(ypredict .== yval) * 100;
            accs[i,j] = acc
            if acc > maxi 
                #println("current max: ",acc )
                maxi = acc
                append!(Cs,C)
                append!(Gs,G)
            end
        end
    end

    c = [0.0001 0.001 0.01 0.1 1.0 10.0 100.0]
    g = ["Costs->";0.0001;0.001;0.01;0.1;1.0;10.0;100.0]
    accs2=vcat(c,accs)
    accs3=hcat(g,accs2)
    return (Cs[end],Gs[end],accs3)
end
function train_svm(Xtrain,ytrain,Xtest,ytest)
    sampler(a, b) = stats.loguniform(a, b) # samples from range (a,b) according to log uniform distribution 

    # specify parameters and distributions to sample from
    param_dist = Dict("C" => sampler(1e-6, 1000.0),
                    "gamma" => sampler(1e-6, 1000.0))

    # define classifier
    clf = SVC()

    # Do random parameter search
    n_iter_search = 100
    random_search = RandomizedSearchCV(clf, 
                                        param_dist,
                                    n_iter=n_iter_search, 
                                        random_state=MersenneTwister(42),
                                        refit=true)

    #start = time()
    ScikitLearn.fit!(random_search, Xtrain, ytrain)
    #@printf("RandomizedSearchCV took %.2f seconds for %d candidates, parameter settings.\n", (time() - start), n_iter_search)
    #display(random_search.grid_scores_)
    best = random_search.best_params_
    C = get(best, :C, "Error")
    G = get(best, :gamma, "Error")
    return [ScikitLearn.score(random_search, Xtest, ytest),C,G]
end

train_svm (generic function with 1 method)

VR H0 Tumour Cells

In [106]:
# load features into dataframe
df_vr_tumour_h0 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:20
    colname = string("feature_",i)
    df_vr_tumour_h0[!,colname] = Float64[]
end
location_tumour_h0 = "persistence_images\\vr\\tumour_h0\\"
files_vr_tumour_h0 = readdir(workdir*"\\"*location_tumour_h0)
for f in files_vr_tumour_h0
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_tumour_h0*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_vr_tumour_h0,row)
end

# rescale features to [0,1]
X = Matrix(df_vr_tumour_h0[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_vr_tumour_h0[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_vr_tumour_h0 = random_oversample(df_vr_tumour_h0)
df_vr_tumour_h0 = random_oversample(df_vr_tumour_h0)

# randomly sample training, test and validation sets
df_vr_tumour_h0=df_vr_tumour_h0[shuffle(1:nrow(df_vr_tumour_h0)),:];   
X = Matrix(df_vr_tumour_h0[:,4:end]);    
y = df_vr_tumour_h0.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 85.86065573770492
Cost: 355.74116142592516
Gamma: 236.59964959443886


VR H1 Tumour cells

In [107]:
# load features into dataframe
df_vr_tumour_h1 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_vr_tumour_h1[!,colname] = Float64[]
end
location_tumour_h1 = "persistence_images\\vr\\tumour_h1\\"
files_vr_tumour_h1 = readdir(workdir*"\\"*location_tumour_h1)
for f in files_vr_tumour_h1
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_tumour_h1*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_vr_tumour_h1,row)
end

# rescale features to [0,1]
X = Matrix(df_vr_tumour_h1[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_vr_tumour_h1[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_vr_tumour_h1 = random_oversample(df_vr_tumour_h1)
df_vr_tumour_h1 = random_oversample(df_vr_tumour_h1)

# randomly sample training, test and validation sets
df_vr_tumour_h1=df_vr_tumour_h1[shuffle(1:nrow(df_vr_tumour_h1)),:];   
X = Matrix(df_vr_tumour_h1[:,4:end]);    
y = df_vr_tumour_h1.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 92.00819672131148
Cost: 4.716105360449407
Gamma: 478.99066203695105


VR H0 Macrophages

In [108]:
# load features into dataframe
df_vr_macrophage_h0 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:20
    colname = string("feature_",i)
    df_vr_macrophage_h0[!,colname] = Float64[]
end
location_macrophage_h0 = "persistence_images\\vr\\macrophages_h0\\"
files_vr_macrophage_h0 = readdir(workdir*"\\"*location_macrophage_h0)
for f in files_vr_macrophage_h0
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_macrophage_h0*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_vr_macrophage_h0,row)
end

# rescale features to [0,1]
X = Matrix(df_vr_macrophage_h0[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_vr_macrophage_h0[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_vr_macrophage_h0 = random_oversample(df_vr_macrophage_h0)
df_vr_macrophage_h0 = random_oversample(df_vr_macrophage_h0)

# randomly sample training, test and validation sets
df_vr_macrophage_h0=df_vr_macrophage_h0[shuffle(1:nrow(df_vr_macrophage_h0)),:];   
X = Matrix(df_vr_macrophage_h0[:,4:end]);    
y = df_vr_macrophage_h0.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 92.60780287474333
Cost: 355.74116142592516
Gamma: 236.59964959443886


VR H1 Macrophages

In [110]:
# load features into dataframe
df_vr_macrophage_h1 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_vr_macrophage_h1[!,colname] = Float64[]
end
location_macrophage_h1 = "persistence_images\\vr\\macrophages_h1\\"
files_vr_macrophage_h1 = readdir(workdir*"\\"*location_macrophage_h1)
for f in files_vr_macrophage_h1
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_macrophage_h1*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_vr_macrophage_h1,row)
end

# rescale features to [0,1]
X = Matrix(df_vr_macrophage_h1[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_vr_macrophage_h1[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_vr_macrophage_h1 = random_oversample(df_vr_macrophage_h1)
df_vr_macrophage_h1 = random_oversample(df_vr_macrophage_h1)

# randomly sample training, test and validation sets
df_vr_macrophage_h1=df_vr_macrophage_h1[shuffle(1:nrow(df_vr_macrophage_h1)),:];   
X = Matrix(df_vr_macrophage_h1[:,4:end]);    
y = df_vr_macrophage_h1.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 88.50102669404517
Cost: 4.631969129570603
Gamma: 61.98272266930091


Dowker H0 macrophage-tumour

In [112]:
# load features into dataframe
df_dowker_mt_h0 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowker_mt_h0[!,colname] = Float64[]
end
location_mt_h0 = "persistence_images\\dowker\\macrophage_tumour_h0\\"
files_dowker_mt_h0 = readdir(workdir*"\\"*location_mt_h0)
for f in files_dowker_mt_h0
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_mt_h0*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowker_mt_h0,row)
end

# rescale features to [0,1]
X = Matrix(df_dowker_mt_h0[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowker_mt_h0[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowker_mt_h0 = random_oversample(df_dowker_mt_h0)
df_dowker_mt_h0 = random_oversample(df_dowker_mt_h0)

# randomly sample training, test and validation sets
df_dowker_mt_h0=df_dowker_mt_h0[shuffle(1:nrow(df_dowker_mt_h0)),:];   
X = Matrix(df_dowker_mt_h0[:,4:end]);    
y = df_dowker_mt_h0.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)


Accuracy: 97.3305954825462
Cost: 4.716105360449407
Gamma: 478.99066203695105


Dowker H1 macrophage-tumour

In [113]:
# load features into dataframe
df_dowker_mt_h1 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowker_mt_h1[!,colname] = Float64[]
end
location_mt_h1 = "persistence_images\\dowker\\macrophage_tumour_h1\\"
files_dowker_mt_h1 = readdir(workdir*"\\"*location_mt_h1)
for f in files_dowker_mt_h1
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_mt_h1*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowker_mt_h1,row)
end

# rescale features to [0,1]
X = Matrix(df_dowker_mt_h1[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowker_mt_h1[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowker_mt_h1 = random_oversample(df_dowker_mt_h1)
df_dowker_mt_h1 = random_oversample(df_dowker_mt_h1)

# randomly sample training, test and validation sets
df_dowker_mt_h1=df_dowker_mt_h1[shuffle(1:nrow(df_dowker_mt_h1)),:];   
X = Matrix(df_dowker_mt_h1[:,4:end]);    
y = df_dowker_mt_h1.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 95.27720739219713
Cost: 4.716105360449407
Gamma: 478.99066203695105


Dowker H0 Tumour-vessel

In [114]:
# load features into dataframe
df_dowker_tv_h0 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowker_tv_h0[!,colname] = Float64[]
end
location_tv_h0 = "persistence_images\\dowker\\tumour_vessel_h0\\"
files_dowker_tv_h0 = readdir(workdir*"\\"*location_tv_h0)
for f in files_dowker_tv_h0
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_tv_h0*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowker_tv_h0,row)
end

# rescale features to [0,1]
X = Matrix(df_dowker_tv_h0[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowker_tv_h0[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowker_tv_h0 = random_oversample(df_dowker_tv_h0)
df_dowker_tv_h0 = random_oversample(df_dowker_tv_h0)

# randomly sample training, test and validation sets
df_dowker_tv_h0=df_dowker_tv_h0[shuffle(1:nrow(df_dowker_tv_h0)),:];   
X = Matrix(df_dowker_tv_h0[:,4:end]);    
y = df_dowker_tv_h0.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 96.30390143737166
Cost: 4.716105360449407
Gamma: 478.99066203695105


Dowker H1 Tumour-vessel

In [115]:
# load features into dataframe
df_dowker_tv_h1 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowker_tv_h1[!,colname] = Float64[]
end
location_tv_h1 = "persistence_images\\dowker\\tumour_vessel_h1\\"
files_dowker_tv_h1 = readdir(workdir*"\\"*location_tv_h1)
for f in files_dowker_tv_h1
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_tv_h1*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowker_tv_h1,row)
end

# rescale features to [0,1]
X = Matrix(df_dowker_tv_h1[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowker_tv_h1[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowker_tv_h1 = random_oversample(df_dowker_tv_h1)
df_dowker_tv_h1 = random_oversample(df_dowker_tv_h1)

# randomly sample training, test and validation sets
df_dowker_tv_h1=df_dowker_tv_h1[shuffle(1:nrow(df_dowker_tv_h1)),:];   
X = Matrix(df_dowker_tv_h1[:,4:end]);    
y = df_dowker_tv_h1.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 93.63449691991786
Cost: 4.716105360449407
Gamma: 478.99066203695105


Dowker H0 Macrophage-vessel

In [116]:
# load features into dataframe
df_dowker_mv_h0 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowker_mv_h0[!,colname] = Float64[]
end
location_mv_h0 = "persistence_images\\dowker\\macrophage_vessel_h0\\"
files_dowker_mv_h0 = readdir(workdir*"\\"*location_mv_h0)
for f in files_dowker_mv_h0
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_mv_h0*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowker_mv_h0,row)
end

# rescale features to [0,1]
X = Matrix(df_dowker_mv_h0[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowker_mv_h0[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowker_mv_h0 = random_oversample(df_dowker_mv_h0)
df_dowker_mv_h0 = random_oversample(df_dowker_mv_h0)

# randomly sample training, test and validation sets
df_dowker_mv_h0=df_dowker_mv_h0[shuffle(1:nrow(df_dowker_mv_h0)),:];   
X = Matrix(df_dowker_mv_h0[:,4:end]);    
y = df_dowker_mv_h0.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 92.81314168377823
Cost: 302.8199194537099
Gamma: 1.4829712322553499


Dowker H1 Macrophage-vessel

In [117]:
# load features into dataframe
df_dowker_mv_h1 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowker_mv_h1[!,colname] = Float64[]
end
location_mv_h1 = "persistence_images\\dowker\\macrophage_vessel_h1\\"
files_dowker_mv_h1 = readdir(workdir*"\\"*location_mv_h1)
for f in files_dowker_mv_h1
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_mv_h1*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowker_mv_h1,row)
end

# rescale features to [0,1]
X = Matrix(df_dowker_mv_h1[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowker_mv_h1[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowker_mv_h1 = random_oversample(df_dowker_mv_h1)
df_dowker_mv_h1 = random_oversample(df_dowker_mv_h1)

# randomly sample training, test and validation sets
df_dowker_mv_h1=df_dowker_mv_h1[shuffle(1:nrow(df_dowker_mv_h1)),:];   
X = Matrix(df_dowker_mv_h1[:,4:end]);    
y = df_dowker_mv_h1.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 87.88501026694045
Cost: 355.74116142592516
Gamma: 236.59964959443886


Dowker-time Macrophages H0

In [118]:
# load features into dataframe
df_dowkertime_m_h0 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowkertime_m_h0[!,colname] = Float64[]
end
location_m_time_h0 = "persistence_images\\dowkertime\\macrophage_time_h0\\"
files_dowker_m_time_h0 = readdir(workdir*"\\"*location_m_time_h0)
for f in files_dowker_m_time_h0
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_m_time_h0*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowkertime_m_h0,row)
end

# rescale features to [0,1]
X = Matrix(df_dowkertime_m_h0[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowkertime_m_h0[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowkertime_m_h0 = random_oversample(df_dowkertime_m_h0)
df_dowkertime_m_h0 = random_oversample(df_dowkertime_m_h0)

# randomly sample training, test and validation sets
df_dowkertime_m_h0=df_dowkertime_m_h0[shuffle(1:nrow(df_dowkertime_m_h0)),:];   
X = Matrix(df_dowkertime_m_h0[:,4:end]);    
y = df_dowkertime_m_h0.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 90.75975359342917
Cost: 70.9615991894713
Gamma: 4.037568747604432


Dowker-time Macrophages H1

In [119]:
# load features into dataframe
df_dowkertime_m_h1 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowkertime_m_h1[!,colname] = Float64[]
end
location_m_time_h1 = "persistence_images\\dowkertime\\macrophage_time_h1\\"
files_dowker_m_time_h1 = readdir(workdir*"\\"*location_m_time_h1)
for f in files_dowker_m_time_h1
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_m_time_h1*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowkertime_m_h1,row)
end

# rescale features to [0,1]
X = Matrix(df_dowkertime_m_h1[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowkertime_m_h1[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowkertime_m_h1 = random_oversample(df_dowkertime_m_h1)
df_dowkertime_m_h1 = random_oversample(df_dowkertime_m_h1)


# randomly sample training, test and validation sets
df_dowkertime_m_h1=df_dowkertime_m_h1[shuffle(1:nrow(df_dowkertime_m_h1)),:];   
X = Matrix(df_dowkertime_m_h1[:,4:end]);    
y = df_dowkertime_m_h1.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 87.67967145790554
Cost: 4.716105360449407
Gamma: 478.99066203695105


Dowker-time Tumour H0

In [120]:
# load features into dataframe
df_dowkertime_t_h0 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowkertime_t_h0[!,colname] = Float64[]
end
location_t_time_h0 = "persistence_images\\dowkertime\\tumour_time_h0\\"
files_dowker_t_time_h0 = readdir(workdir*"\\"*location_t_time_h0)
for f in files_dowker_t_time_h0
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_t_time_h0*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowkertime_t_h0,row)
end

# rescale features to [0,1]
X = Matrix(df_dowkertime_t_h0[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowkertime_t_h0[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowkertime_t_h0 = random_oversample(df_dowkertime_t_h0)
df_dowkertime_t_h0 = random_oversample(df_dowkertime_t_h0)

# randomly sample training, test and validation sets
df_dowkertime_t_h0=df_dowkertime_t_h0[shuffle(1:nrow(df_dowkertime_t_h0)),:];   
X = Matrix(df_dowkertime_t_h0[:,4:end]);    
y = df_dowkertime_t_h0.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 91.17043121149896
Cost: 355.74116142592516
Gamma: 236.59964959443886


Dowker-time Tumour H1

In [121]:
# load features into dataframe
df_dowkertime_t_h1 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:400
    colname = string("feature_",i)
    df_dowkertime_t_h1[!,colname] = Float64[]
end
location_t_time_h1 = "persistence_images\\dowkertime\\tumour_time_h1\\"
files_dowker_t_time_h1 = readdir(workdir*"\\"*location_t_time_h1)
for f in files_dowker_t_time_h1
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_t_time_h1*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_dowkertime_t_h1,row)
end

# rescale features to [0,1]
X = Matrix(df_dowkertime_t_h1[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_dowkertime_t_h1[i,:][4:end] = Xn[i,:]
end

# randomly undersample imbalanced data 
df_dowkertime_t_h1 = random_oversample(df_dowkertime_t_h1)
df_dowkertime_t_h1 = random_oversample(df_dowkertime_t_h1)

# randomly sample training, test and validation sets
df_dowkertime_t_h1=df_dowkertime_t_h1[shuffle(1:nrow(df_dowkertime_t_h1)),:];   
X = Matrix(df_dowkertime_t_h1[:,4:end]);    
y = df_dowkertime_t_h1.labels;
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.3); # train-test split
accuracy,C,G = train_svm(Xtrain,ytrain,Xtest,ytest)
println("Accuracy: ", accuracy*100)
println("Cost: ", C)
println("Gamma: ", G)

Accuracy: 87.26899383983573
Cost: 70.9615991894713
Gamma: 4.037568747604432


# Example hyperparameter tuning

In [6]:
using ScikitLearn.GridSearch: RandomizedSearchCV
using PyCall
stats = pyimport("scipy.stats")

using ScikitLearn
@sk_import svm: SVC

PyObject <class 'sklearn.svm._classes.SVC'>

<b>To Jay: The following is copy-pasted from your code </b>

In [91]:
# load features into dataframe
df_vr_tumour_h0 = DataFrame(ids=Int64[],times=Int64[],labels=[])
for i in 1:20
    colname = string("feature_",i)
    df_vr_tumour_h0[!,colname] = Float64[]
end
location_tumour_h0 = "persistence_images/vr/tumour_h0/"
files_vr_tumour_h0 = readdir(workdir*"/"*location_tumour_h0)
for f in files_vr_tumour_h0
    id,time = parse_file(f)
    id = parse(Int64,id)
    time = parse(Int64,time)
    image = load_object(location_tumour_h0*f)
    row = Any[]
    label = getBinaryLabel(id,time)
    append!(row,id)
    append!(row,time)
    append!(row,label)
    for x in image
        append!(row,x)
    end
    push!(df_vr_tumour_h0,row)
end

# rescale features to [0,1]
X = Matrix(df_vr_tumour_h0[:,4:end])

dt = fit(UnitRangeTransform, X; dims=1, unit=true)

Xn = StatsBase.transform(dt,X)
for i in 1:size(X)[1]
    df_vr_tumour_h0[i,:][4:end] = Xn[i,:]
end


<b> To Jay: all the labels appear as 0, so I get an error when I run `random_undersample` </b>

In [97]:
countmap(df_vr_tumour_h0.labels)

Dict{Any, Int64} with 2 entries:
  0 => 813
  1 => 812

In [10]:
# randomly undersample imbalanced data 
df_vr_tumour_h0 = random_undersample(df_vr_tumour_h0)

Unnamed: 0_level_0,ids,times,labels,feature_1,feature_2,feature_3,feature_4,feature_5
Unnamed: 0_level_1,Int64,Int64,Any,Float64,Float64,Float64,Float64,Float64
1,91,300,0,0.458359,0.461496,0.013393,2.11021e-6,5.00343e-12
2,1571,450,1,0.675793,0.692533,0.0213001,3.79939e-6,1.16308e-11
3,641,300,0,0.420838,0.430217,0.0137939,3.17459e-6,1.95533e-11
4,781,350,0,0.331712,0.404442,0.0244088,2.34428e-5,7.30758e-10
5,1491,350,1,0.183007,0.254272,0.0189514,1.3159e-5,1.85992e-10
6,1511,450,1,0.374143,0.455698,0.0289253,2.80537e-5,8.30814e-10
7,1551,400,1,0.495057,0.543941,0.0224264,9.55764e-6,1.2368e-10
8,6120,350,0,0.548479,0.545557,0.0155623,2.40914e-6,5.6079e-12
9,531,400,0,0.557761,0.57479,0.0186424,4.28215e-6,2.53164e-11
10,1281,350,1,0.176635,0.277255,0.0366505,8.6886e-5,5.53405e-9


<b>To Jay: Please note I changed the train/ test split ratio to 0.7. This is because we use the RandomizedSearchCV, which will do its own cross validation when doing hyperparameter tuning </b>

In [14]:
sampler(a, b) = stats.loguniform(a, b) # samples from range (a,b) according to log uniform distribution 

# specify parameters and distributions to sample from
param_dist = Dict("C" => sampler(1e-6, 100.0),
                  "gamma" => sampler(1e-6, 100.0))

# define classifier
clf = SVC()

# Do random parameter search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, 
                                    param_dist,
                                   n_iter=n_iter_search, 
                                    random_state=MersenneTwister(42))

start = time()
ScikitLearn.fit!(random_search, Xtrain, ytrain)
@printf("RandomizedSearchCV took %.2f seconds for %d candidates, parameter settings.\n", (time() - start), n_iter_search)

report(random_search.grid_scores_)

LoadError: UndefVarError: Xtrain not defined

<b>We'll then train SVM on the entire training data and report the accuracy on the test set </b>

In [None]:

# report final accuracy on test set

model = svmtrain(Xtrain', ytrain ; gamma=G , cost=C)
ypredict, decision_values = svmpredict(model, Xtest');
acc = mean(ypredict .== ytest) * 100;
println("accuracy: ", acc)
println("Cost: ", C, " Gamma: ", G)
display(accs)