# Intro to Machine Learning with Julia 

In [1]:
iris_datapath = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

In [2]:
using CSV, DataFrames, Downloads

In [3]:
# Import iris data
iris_data = DataFrame(CSV.File(Downloads.download(iris_datapath), header = 0));
first(iris_data, 5)

Unnamed: 0_level_0,Column1,Column2,Column3,Column4,Column5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


## Logistic Regression

In [4]:
# Clean species name
iris_data.Column5 = replace(species -> contains(species, "-") ? replace(species, "-" => "_") : species, iris_data[!, "Column5"]);
first(iris_data, 10)

Unnamed: 0_level_0,Column1,Column2,Column3,Column4,Column5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String
1,5.1,3.5,1.4,0.2,Iris_setosa
2,4.9,3.0,1.4,0.2,Iris_setosa
3,4.7,3.2,1.3,0.2,Iris_setosa
4,4.6,3.1,1.5,0.2,Iris_setosa
5,5.0,3.6,1.4,0.2,Iris_setosa
6,5.4,3.9,1.7,0.4,Iris_setosa
7,4.6,3.4,1.4,0.3,Iris_setosa
8,5.0,3.4,1.5,0.2,Iris_setosa
9,4.4,2.9,1.4,0.2,Iris_setosa
10,4.9,3.1,1.5,0.1,Iris_setosa


In [5]:
# Rename columns
col_names = ["Sepal_Length","Sepal_Width","Petal_Length","Petal_Width","Species"];
rename!(iris_data, col_names);
first(iris_data, 5)

Unnamed: 0_level_0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String
1,5.1,3.5,1.4,0.2,Iris_setosa
2,4.9,3.0,1.4,0.2,Iris_setosa
3,4.7,3.2,1.3,0.2,Iris_setosa
4,4.6,3.1,1.5,0.2,Iris_setosa
5,5.0,3.6,1.4,0.2,Iris_setosa


In [6]:
# One Hot encode the dependent variable
unique_species = unique(iris_data.Species);
iris_data = transform(iris_data, @. :Species => ByRow(isequal(unique_species)) .=> Symbol(:Species_, unique_species));
iris_data = select!(iris_data, Not([:Species]));
first(iris_data, 5)

Unnamed: 0_level_0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species_Iris_setosa,Species_Iris_versicolor
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Bool,Bool
1,5.1,3.5,1.4,0.2,1,0
2,4.9,3.0,1.4,0.2,1,0
3,4.7,3.2,1.3,0.2,1,0
4,4.6,3.1,1.5,0.2,1,0
5,5.0,3.6,1.4,0.2,1,0


In [7]:
# Train and test split
using Random
sample = randsubseq(1:size(iris_data, 1), 0.75)
notsample = [i for i in 1:size(iris_data, 1) if isempty(searchsorted(sample, i))]
train = iris_data[sample, :]
test = iris_data[notsample, :]

Unnamed: 0_level_0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species_Iris_setosa,Species_Iris_versicolor
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Bool,Bool
1,5.0,3.6,1.4,0.2,1,0
2,4.4,2.9,1.4,0.2,1,0
3,5.4,3.7,1.5,0.2,1,0
4,4.8,3.4,1.6,0.2,1,0
5,5.4,3.4,1.7,0.2,1,0
6,4.6,3.6,1.0,0.2,1,0
7,4.8,3.4,1.9,0.2,1,0
8,5.2,3.5,1.5,0.2,1,0
9,5.2,3.4,1.4,0.2,1,0
10,5.5,4.2,1.4,0.2,1,0


In [8]:
# Logistic regression
using GLM

fm_setosa = @formula(Species_Iris_setosa ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width)
lm_setosa = glm(fm_setosa, train, Binomial(), LogitLink())
pred_setosa = predict(lm_setosa, test)

fm_virginica = @formula(Species_Iris_virginica ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width)
lm_virginica = glm(fm_virginica, train, Binomial(), LogitLink())
pred_virginica = predict(lm_virginica, test)

fm_versicolor = @formula(Species_Iris_versicolor ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width)
lm_versicolor = glm(fm_versicolor, train, Binomial(), LogitLink())
pred_versicolor = predict(lm_versicolor, test)

preds = hcat(pred_setosa, pred_virginica, pred_versicolor)

34×3 Matrix{Float64}:
 1.0          0.0           0.0940977
 1.0          0.0           0.352918
 1.0          0.0           0.0776083
 1.0          0.0           0.173498
 1.0          0.0           0.166322
 1.0          0.0           0.0690912
 1.0          0.0           0.226535
 1.0          0.0           0.119859
 1.0          0.0           0.13113
 1.0          0.0           0.0245498
 1.0          0.0           0.163623
 1.0          0.0           0.282715
 1.0          0.0           0.0842091
 ⋮                          
 4.61947e-15  5.36734e-133  0.694729
 9.46312e-11  3.35738e-256  0.414942
 1.44936e-12  7.54896e-239  0.445985
 1.36769e-24  1.0           0.455392
 4.52969e-20  1.0           0.148014
 8.89702e-29  1.0           0.167305
 3.62247e-19  1.0           0.40128
 8.58807e-20  1.0           0.379731
 7.4295e-21   1.0           0.177116
 6.66337e-21  1.0           0.0896135
 3.12492e-26  1.0           0.165617
 6.92614e-23  1.0           0.125692

In [9]:
species_map = Dict([1 => "setosa", 2 => "virginica", 3 => "versicolor"]);
pred_class = String[];
for i in 1:size(preds, 1)
    A = preds[i, :]
    index = findall(x-> x == maximum(A), A)[1]
    pred_class = vcat(pred_class, species_map[index])
end
pred_class

34-element Vector{String}:
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 ⋮
 "versicolor"
 "versicolor"
 "versicolor"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"

In [10]:
# Compute model accuracy
species_map = Dict([1 => "setosa", 2 => "virginica", 3 => "versicolor"]);
orig_class = String[]
M = Matrix(test[!, 5:7])
for i in 1:size(M, 1)
    A = M[i, :]
    index = findall(x-> x == maximum(A), A)[1]
    orig_class = vcat(orig_class, species_map[index])
end
orig_class

34-element Vector{String}:
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 "setosa"
 ⋮
 "virginica"
 "virginica"
 "virginica"
 "versicolor"
 "versicolor"
 "versicolor"
 "versicolor"
 "versicolor"
 "versicolor"
 "versicolor"
 "versicolor"
 "versicolor"

In [11]:
correct = 0
for i in 1:length(orig_class)
   correct += pred_class[i] == orig_class[i] ? 1 : 0
end
println("Accuracy: ", round(correct / length(orig_class), digits = 4))

Accuracy: 0.4118


## Scikit Learn

In [12]:
# Import iris data
iris_data = DataFrame(CSV.File(Downloads.download(iris_datapath), header = 0));

# Clean species name
iris_data.Column5 = replace(species -> contains(species, "-") ? replace(species, "-" => "_") : species, iris_data[!, "Column5"]);

# Rename columns
col_names = ["Sepal_Length","Sepal_Width","Petal_Length","Petal_Width","Species"];
rename!(iris_data, col_names);

# Train and test split
train = iris_data[sample, :];
test = iris_data[notsample, :];

x_train, y_train = Array(train[:, 1:4]), Array(train[:, 5]);

x_test, y_test = Array(test[:, 1:4]), Array(test[:, 5]);

In [13]:
first(train, 5)

Unnamed: 0_level_0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String
1,5.1,3.5,1.4,0.2,Iris_setosa
2,4.9,3.0,1.4,0.2,Iris_setosa
3,4.7,3.2,1.3,0.2,Iris_setosa
4,4.6,3.1,1.5,0.2,Iris_setosa
5,5.4,3.9,1.7,0.4,Iris_setosa


In [15]:
using ScikitLearn
using ScikitLearn: fit!

# Fit model
@sk_import linear_model: LogisticRegression

log_reg = fit!(LogisticRegression(), x_train, y_train)

PyObject LogisticRegression()

In [17]:
# Predict
sklearn_pred = log_reg.predict(x_test)

PyObject array(['Iris_setosa', 'Iris_setosa', 'Iris_setosa', 'Iris_setosa',
       'Iris_setosa', 'Iris_setosa', 'Iris_setosa', 'Iris_setosa',
       'Iris_setosa', 'Iris_setosa', 'Iris_setosa', 'Iris_setosa',
       'Iris_setosa', 'Iris_versicolor', 'Iris_versicolor',
       'Iris_versicolor', 'Iris_versicolor', 'Iris_versicolor',
       'Iris_versicolor', 'Iris_versicolor', 'Iris_virginica',
       'Iris_versicolor', 'Iris_versicolor', 'Iris_versicolor',
       'Iris_versicolor', 'Iris_virginica', 'Iris_virginica',
       'Iris_virginica', 'Iris_virginica', 'Iris_virginica',
       'Iris_virginica', 'Iris_virginica', 'Iris_virginica',
       'Iris_virginica'], dtype='<U15')

In [18]:
# Compute accuracy
correct = 0
for i in 1:length(y_test)
    correct += y_test[i] == sklearn_pred[i] ? 1 : 0
end
println("Accuracy: ", round(correct / length(y_test), digits = 4))

Accuracy: 0.9706


## MLJ

In [40]:
# load the model
using MLJ
@load SVC pkg=LIBSVM verbosity=1

import MLJLIBSVMInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\isossa\.julia\packages\MLJModels\w0uSt\src\loading.jl:168


MLJLIBSVMInterface.SVC

In [42]:
# create a so-called machine
svc_mdl = MLJLIBSVMInterface.SVC()
svc = machine(svc_mdl, x_train, categorical(y_train))

# fit the model
MLJ.fit!(svc);

# predict on the test set
yhat = MLJ.predict(svc, x_test);

│ scitype(X) = AbstractMatrix{Continuous}
│ input_scitype(model) = Table{<:AbstractVector{<:Continuous}}.
└ @ MLJBase C:\Users\isossa\.julia\packages\MLJBase\KWyqX\src\machines.jl:91
┌ Info: Training [34mMachine{SVC,…} @872[39m.
└ @ MLJBase C:\Users\isossa\.julia\packages\MLJBase\KWyqX\src\machines.jl:342


In [43]:
# Compute accuracy
correct = 0
for i in 1:length(y_test)
    correct += y_test[i] == yhat[i] ? 1 : 0
end
println("Accuracy: ", round(correct / length(y_test), digits = 4))

Accuracy: 0.9706
