In [36]:
using CSV, DataFrames, Gadfly, GLM, Statistics, Distributions, LinearAlgebra, Random
ENV["COLUMNS"] = 1000;

every = DataFrame(Pourcentage = Float64[], FN = Float64[], FP = Float64[], Seuil = Float64[])

function findCutoff(table, column, newColumn, lowRange, highRange, nIteration, reverse=false)
    tempTable = select(table, column, newColumn, :HeartDisease)
    maxValue = 0
    maxIt = 0
    minValue = 1.0
    minIt = 0
    minmaxValue = 0
    minmaxIt = 0
    if lowRange >= highRange
        error("range invalid")
    end
    n = size(table, 1)
    for l in 0.0:nIteration
        j = lowRange + ((highRange - lowRange)/nIteration * l) # itère nIteration fois entre lowRange et highRange
        for i in 1:n
            if table[!,column] != Missing && table[!,column][i] < j #selon la valeur de j, on classe les données
                table[!,newColumn][i] = reverse ? "1" : "0" #on swap les classes si c'est inversement proportionnel pour les meilleurs seuils
            else
                table[!,newColumn][i] = reverse ? "0" : "1"
            end
        end
        tempTable = combine(groupby(table, [newColumn]), :HeartDisease => mean => :Odds) 

        if size(tempTable, 1) > 1
            if tempTable.Odds[1] < minValue # on trouve le meilleur seuil pour la classe 0
                minIt = j
                minValue = tempTable.Odds[1]
            end 

            if tempTable.Odds[2] > maxValue # on trouve le meilleur seuil pour la classe 1
                maxIt = j
                maxValue = tempTable.Odds[2] 
            end 

            # on trouve le meilleur seuil pour le produit des 2 classes, pour un seuil acceptable à la fois pour les classes 1 et 2
            if (1 - tempTable.Odds[1]) * tempTable.Odds[2] > minmaxValue && tempTable.Odds[1] != 0 && tempTable.Odds[2] != 1
                minmaxIt = j
                minmaxValue = (1 - tempTable.Odds[1]) * tempTable.Odds[2]
            end 
        end 
    end
    return [[
        minIt,
        minValue],[
        maxIt,
        maxValue],[
        minmaxIt,
        minmaxValue
    ]]
end

full = CSV.read("train.csv", DataFrame);

for i in 0:30

    fn_tot = []
    fp_tot = []
    total = []

    testCutoff = (0.3 + i/100)

    for i in 1:50
        full = full[shuffle(1:size(full, 1)),:];
        full[!, :FastingBS] = string.(full[:, :FastingBS]);  # Pour que GLM considère la classe comme une variable qualitative
        number = size(full, 1);
        twoThirds = floor(Int, 2/3*number);
        data = first(full,twoThirds);
        estimate = last(full, number-twoThirds);

        data[!, :FastingBS] = string.(data[:, :FastingBS]);  # Pour que GLM considère la classe comme une variable qualitative
        data[:, :OldpeakGroups] .= "2"
        data[:, :AgeGroups] .= "2"
        data[:, :CholesterolGroups] .= "2"
        data[:, :MaxHRGroups] .= "2"
        maxHRCutoffs = findCutoff(data, :MaxHR, :MaxHRGroups, 50.0, 200.0, 1000.0, true)
        ageCutoff = findCutoff(data, :Age, :AgeGroups, 0.0, 100.0, 1000.0)
        oldPeakCutoffs = findCutoff(data, :Oldpeak, :OldpeakGroups, 0.0, 2.0, 1000.0)
        for i in 1:size(data,1)
            if data.Oldpeak[i] < oldPeakCutoffs[3][1]
                data.OldpeakGroups[i] = "0"
            else
                data.OldpeakGroups[i] = "1"
            end
            if data.Age[i] < ageCutoff[3][1]
                data.AgeGroups[i] = "0"
            else
                data.AgeGroups[i] = "1"
            end
            if ismissing(data.Cholesterol[i])
                data.CholesterolGroups[i] = "0"
            else
                data.CholesterolGroups[i] = "1"
            end
            if(data.MaxHR[i] < maxHRCutoffs[3][1])
                data.MaxHRGroups[i] = "0"
            else
                data.MaxHRGroups[i] = "1"
            end
        end

        #M = glm(@formula(HeartDisease ~ ChestPainType + OldpeakGroups + AgeGroups + CholesterolGroups + MaxHRGroups + Sex + ExerciseAngina + STSlope), data,  Bernoulli(), LogitLink())
        M = glm(@formula(HeartDisease ~  ChestPainType + OldpeakGroups + AgeGroups + CholesterolGroups + MaxHRGroups + Sex + RestingECG + FastingBS + ExerciseAngina + STSlope), data,  Bernoulli(), LogitLink())

        estimate[:, :OldpeakGroups] .= "2"
        estimate[:, :AgeGroups] .= "2"
        estimate[:, :CholesterolGroups] .= "2"
        estimate[:, :MaxHRGroups] .= "2"
        for i in 1:size(estimate,1)
            if estimate.Oldpeak[i] < oldPeakCutoffs[3][1]
                estimate.OldpeakGroups[i] = "0"
            else
                estimate.OldpeakGroups[i] = "1"
            end
            if estimate.Age[i] < ageCutoff[3][1]
                estimate.AgeGroups[i] = "0"
            else
                estimate.AgeGroups[i] = "1"
            end
            if ismissing(estimate.Cholesterol[i])
                estimate.CholesterolGroups[i] = "0"
            else
                estimate.CholesterolGroups[i] = "1"
            end
            if(estimate.MaxHR[i] < maxHRCutoffs[3][1])
                estimate.MaxHRGroups[i] = "0"
            else
                estimate.MaxHRGroups[i] = "1"
            end
        end

        θestim = predict(M, estimate);

        evalua = zeros(Int64, size(estimate,1))
        n = size(estimate, 1)
        counter = 0

        vp = 0
        vn = 0
        fn = 0
        fp = 0

        arrayfn = []
        arrayfp = []

        for i in 1:n
            if (θestim[i] >= testCutoff && estimate[i, :].HeartDisease == 1)
                counter = counter + 1
                vp = vp + 1
            elseif (θestim[i] < testCutoff && estimate[i, :].HeartDisease == 0)
                counter = counter + 1
                vn = vn + 1
            elseif (θestim[i] < testCutoff && estimate[i, :].HeartDisease == 1)
                fn = fn + 1
                push!(arrayfn, i)
            elseif (θestim[i] >= testCutoff && estimate[i, :].HeartDisease == 0)
                fp = fp + 1
                push!(arrayfp, i)
            end
        end

        push!(fp_tot, fp)
        push!(fn_tot, fn)

        push!(total, counter/n)
    end

    tot = 0
    for i in 1:size(total,1)
        tot = tot + total[i]
    end

    dab1 = tot/size(total,1)

    fpmean = 0
    for i in 1:size(fp_tot,1)
        fpmean = fpmean + fp_tot[i]
    end

    dab2 = fpmean/size(fp_tot,1)

    fnmean = 0
    for i in 1:size(fn_tot,1)
        fnmean = fnmean + fn_tot[i]
    end

    dab3 = fnmean/size(fn_tot,1)
    
    push!(every, [dab1, dab2, dab3, testCutoff])
    
end


In [37]:
every

Unnamed: 0_level_0,Pourcentage,FN,FP,Seuil
Unnamed: 0_level_1,Float64,Float64,Float64,Float64
1,0.861565,24.04,7.8,0.3
2,0.861391,24.04,7.84,0.31
3,0.864087,23.7,7.56,0.32
4,0.864,23.14,8.14,0.33
5,0.867652,22.06,8.38,0.34
6,0.864522,21.82,9.34,0.35
7,0.869652,21.22,8.76,0.36
8,0.865391,21.1,9.86,0.37
9,0.867217,20.1,10.44,0.38
10,0.864957,21.22,9.84,0.39
