# Dependencies

In [1]:
using ScikitLearn, CSV, DataFrames, PyCall;
@sk_import preprocessing: LabelEncoder;
@sk_import naive_bayes: (CategoricalNB, GaussianNB);
@sk_import metrics: accuracy_score;
@sk_import tree: DecisionTreeClassifier;
@sk_import svm: SVC;
@sk_import neural_network: MLPClassifier;
@sk_import feature_selection: (SelectKBest, chi2);

In [2]:
train_test_split = ScikitLearn.CrossValidation.train_test_split;
le = LabelEncoder();
pickle = pyimport("pickle");

# Car Eval Data Set
## ETL

In [3]:
# Read car data csv into DataFrame
car_columns = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"];
car_df = CSV.File("./data/car.data", header=car_columns) |> DataFrame;
first(car_df, 5)

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety,class
Unnamed: 0_level_1,String,String,String,String,String,String,String
1,vhigh,vhigh,2,2,small,low,unacc
2,vhigh,vhigh,2,2,small,med,unacc
3,vhigh,vhigh,2,2,small,high,unacc
4,vhigh,vhigh,2,2,med,low,unacc
5,vhigh,vhigh,2,2,med,med,unacc


## Preprocessing

In [4]:
# Encode values for car data
for col in car_columns
    car_df[!,col] = le.fit_transform(car_df[!,col]);
end
first(car_df, 5)

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety,class
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,3,3,0,0,2,1,2
2,3,3,0,0,2,2,2
3,3,3,0,0,2,0,2
4,3,3,0,0,1,1,2
5,3,3,0,0,1,2,2


In [5]:
# Feature and Label extraction
car_X = convert(Array, car_df[:,1:6]);
car_y = convert(Array, car_df[:, 7]);

In [6]:
# Split data into testing and training subsets
car_X_train, car_X_test, car_y_train, car_y_test = train_test_split(car_X, car_y, test_size=0.3, random_state=100);

## Model Testing

In [7]:
# Naive Bayesian Classifier
nb_car_model = CategoricalNB();
@time fit!(nb_car_model, car_X_train, car_y_train);
@time nb_car_predictions = predict(nb_car_model, car_X_test);
nb_car_accuracy = accuracy_score(nb_car_predictions, car_y_test);
println("Naive Bayesian Classifier Accuracy: $(round(nb_car_accuracy * 100, digits=1))%");
io = open("./model/nb_car_model.sav", "w");
pickle.dump(nb_car_model, io);
close(io);

  0.034393 seconds (45.01 k allocations: 2.378 MiB)
  0.014342 seconds (28.78 k allocations: 1.558 MiB)
Naive Bayesian Classifier Accuracy: 85.5%


In [8]:
# Decision Tree Classifier
tree_car_model = DecisionTreeClassifier();
@time fit!(tree_car_model, car_X_train, car_y_train);
@time tree_car_predictions = predict(tree_car_model, car_X_test);
tree_car_accuracy = accuracy_score(tree_car_predictions, car_y_test);
println("Decision Tree Classifier Accuracy: $(round(tree_car_accuracy * 100, digits=1))%");
io = open("./model/tree_car_model.sav", "w");
pickle.dump(tree_car_model, io);
close(io);

  0.001249 seconds (21 allocations: 1.109 KiB)
  0.000344 seconds (36 allocations: 5.875 KiB)
Decision Tree Classifier Accuracy: 98.1%


In [9]:
# Support Vector Machine
svm_car_model = SVC();
@time fit!(svm_car_model, car_X_train, car_y_train);
@time svm_car_predictions = predict(svm_car_model, car_X_test);
svm_car_accuracy = accuracy_score(svm_car_predictions, car_y_test);
println("Support Vector Machine Accuracy: $(round(svm_car_accuracy * 100, digits=1))%");
io = open("./model/svm_car_model.sav", "w");
pickle.dump(svm_car_model, io);
close(io);

  0.056278 seconds (21 allocations: 1.109 KiB)
  0.035555 seconds (36 allocations: 5.875 KiB)
Support Vector Machine Accuracy: 93.1%


In [10]:
# Neural Network
nn_car_model = MLPClassifier(max_iter=1000);
@time fit!(nn_car_model, car_X_train, car_y_train);
@time nn_car_predictions = predict(nn_car_model, car_X_test);
nn_car_accuracy = accuracy_score(nn_car_predictions, car_y_test);
println("Neural Network Accuracy: $(round(nn_car_accuracy * 100, digits=1))%");
io = open("./model/nn_car_model.sav", "w");
pickle.dump(nn_car_model, io);
close(io);

  2.823061 seconds (21 allocations: 1.109 KiB)
  0.000854 seconds (36 allocations: 5.875 KiB)
Neural Network Accuracy: 97.3%


# Abalone Data Set
## ETL

In [11]:
# Read abalone data csv into DataFrame
abalone_columns = ["sex", "length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight", "rings"];
abalone_df = CSV.File("./data/abalone.data", header=abalone_columns) |> DataFrame;
first(abalone_df, 5)

Unnamed: 0_level_0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64,Float64
1,M,0.455,0.365,0.095,0.514,0.2245,0.101
2,M,0.35,0.265,0.09,0.2255,0.0995,0.0485
3,F,0.53,0.42,0.135,0.677,0.2565,0.1415
4,M,0.44,0.365,0.125,0.516,0.2155,0.114
5,I,0.33,0.255,0.08,0.205,0.0895,0.0395


## Preprocessing

In [12]:
# Encode sex
abalone_df[!,:sex] = le.fit_transform(abalone_df[!,:sex]);
first(abalone_df, 5);

In [13]:
# Categorize rings into 3 groups based on age
index = 1;
for ring in abalone_df[!,:rings]
    age = ring + 1.5;
    if 1 < age < 9
        abalone_df[index,:rings] = 1;
    elseif age < 10
        abalone_df[index,:rings] = 2;
    else
        abalone_df[index,:rings] = 3;
    end
    index += 1;
end

In [14]:
# Feature and label extraction
abalone_X_features = convert(Array, abalone_df[:,1:8]);
abalone_y_labels = convert(Array, abalone_df[:, 9]);

In [15]:
# Split data into testing and training subsets
abalone_X_train, abalone_X_test, abalone_y_train, abalone_y_test = train_test_split(abalone_X_features, abalone_y_labels, test_size=0.3);

## Model Testing

In [16]:
# Naive Bayesian Classifier
nb_abalone_model = GaussianNB();
@time fit!(nb_abalone_model, abalone_X_train, abalone_y_train);
@time nb_abalone_predictions = predict(nb_abalone_model, abalone_X_test);
nb_abalone_accuracy = accuracy_score(nb_abalone_predictions, abalone_y_test);
println("Naive Bayesian Classifier Accuracy: $(round(nb_abalone_accuracy * 100, digits=1))%");
io = open("./model/nb_abalone_model.sav", "w");
pickle.dump(nb_abalone_model, io);
close(io);

  0.019388 seconds (35.37 k allocations: 1.882 MiB)
  0.017520 seconds (28.43 k allocations: 1.538 MiB)
Naive Bayesian Classifier Accuracy: 67.9%


In [17]:
# Decision Tree Classifier
tree_abalone_model = DecisionTreeClassifier();
@time fit!(tree_abalone_model, abalone_X_train, abalone_y_train);
@time tree_abalone_predictions = predict(tree_abalone_model, abalone_X_test);
tree_abalone_accuracy = accuracy_score(tree_abalone_predictions, abalone_y_test);
println("Decision Tree Classifier Accuracy: $(round(tree_abalone_accuracy * 100, digits=1))%");
io = open("./model/tree_abalone_model.sav", "w");
pickle.dump(tree_abalone_model, io);
close(io);

  0.013147 seconds (21 allocations: 1.109 KiB)
  0.000452 seconds (36 allocations: 11.625 KiB)
Decision Tree Classifier Accuracy: 73.5%


In [18]:
# Support Vector Machine
svm_abalone_model = SVC();
@time fit!(svm_abalone_model, abalone_X_train, abalone_y_train);
@time svm_abalone_predictions = predict(svm_abalone_model, abalone_X_test);
svm_abalone_accuracy = accuracy_score(svm_abalone_predictions, abalone_y_test);
println("Support Vector Machine Accuracy: $(round(svm_abalone_accuracy * 100, digits=1))%");
io = open("./model/svm_abalone_model.sav", "w");
pickle.dump(svm_abalone_model, io);
close(io);

  0.200879 seconds (21 allocations: 1.109 KiB)
  0.149749 seconds (36 allocations: 11.625 KiB)
Support Vector Machine Accuracy: 78.7%


In [19]:
# Neural Network
nn_abalone_model = MLPClassifier(max_iter=1000);
@time fit!(nn_abalone_model, abalone_X_train, abalone_y_train);
@time nn_abalone_predictions = predict(nn_abalone_model, abalone_X_test);
nn_abalone_accuracy = accuracy_score(nn_abalone_predictions, abalone_y_test);
println("Neural Network Accuracy: $(round(nn_abalone_accuracy * 100, digits=1))%");
io = open("./model/nn_abalone_model.sav", "w");
pickle.dump(nn_abalone_model, io);
close(io);

  1.396414 seconds (21 allocations: 1.109 KiB)
  0.001581 seconds (39 allocations: 16.234 KiB)
Neural Network Accuracy: 80.1%


# Madelon Data Set
## ETL

In [20]:
# Read madelon data csv into DataFrame
madelon_test_df = CSV.File("./data/madelon/madelon_test.data") |> DataFrame;
madelon_train_df = CSV.File("./data/madelon/madelon_train.data") |> DataFrame;
madleon_valid_df = CSV.File("./data/madelon/madelon_valid.data") |> DataFrame;
madelon_train_labels = CSV.File("./data/madelon/madelon_train.labels") |> DataFrame;

## Preprocessing

In [21]:
# Merge training features with training labels
madelon_train_df[!,501] = madelon_train_labels[!,1];

In [22]:
# Feature and label extraction
madelon_X_features = convert(Array, madelon_train_df[:,1:500]);
madelon_y_labels = convert(Array, madelon_train_df[:, 501]);

In [23]:
# Drop irrelevant features to avoid overfitting
madelon_X_features = SelectKBest(chi2, k=20).fit_transform(madelon_X_features, madelon_y_labels);

In [24]:
# Split data into testing and training subsets
madelon_X_train, madelon_X_test, madelon_y_train, madelon_y_test = train_test_split(madelon_X_features, madelon_y_labels, test_size=0.3);

## Model Testing

In [25]:
# Naive Bayesian Classifier
nb_madelon_model = GaussianNB()
@time fit!(nb_madelon_model, madelon_X_train, madelon_y_train)
@time nb_madelon_predictions = predict(nb_madelon_model, madelon_X_test)
nb_madelon_accuracy = accuracy_score(nb_madelon_predictions, madelon_y_test)
println("Naive Bayesian Classifier Accuracy: $(round(nb_madelon_accuracy * 100, digits=1))%")
io = open("./model/nb_madelon_model.sav", "w")
pickle.dump(nb_madelon_model, io)
close(io)

  0.001221 seconds (21 allocations: 1.109 KiB)
  0.000501 seconds (36 allocations: 6.500 KiB)
Naive Bayesian Classifier Accuracy: 56.0%


In [26]:
# Decision Tree Classifier
tree_madelon_model = DecisionTreeClassifier()
@time fit!(tree_madelon_model, madelon_X_train, madelon_y_train)
@time tree_madelon_predictions = predict(tree_madelon_model, madelon_X_test)
tree_madelon_accuracy = accuracy_score(tree_madelon_predictions, madelon_y_test)
println("Decision Tree Classifier Accuracy: $(round(tree_madelon_accuracy * 100, digits=1))%")
io = open("./model/tree_madelon_model.sav", "w")
pickle.dump(tree_madelon_model, io)
close(io)

  0.016816 seconds (21 allocations: 1.109 KiB)
  0.000367 seconds (36 allocations: 6.500 KiB)
Decision Tree Classifier Accuracy: 76.3%


In [27]:
# Support Vector Machine
svm_abalone_model = SVC()
@time fit!(svm_abalone_model, madelon_X_train, madelon_y_train)
@time svm_abalone_predictions = predict(svm_abalone_model, madelon_X_test)
svm_abalone_accuracy = accuracy_score(svm_abalone_predictions, madelon_y_test)
println("Support Vector Machine Accuracy: $(round(svm_abalone_accuracy * 100, digits=1))%")
io = open("./model/svm_abalone_model.sav", "w")
pickle.dump(svm_abalone_model, io)
close(io)

  0.076111 seconds (21 allocations: 1.109 KiB)
  0.053031 seconds (36 allocations: 6.500 KiB)
Support Vector Machine Accuracy: 78.8%


In [28]:
# Neural Network
nn_abalone_model = MLPClassifier(max_iter=1000)
@time fit!(nn_abalone_model, madelon_X_train, madelon_y_train)
@time nn_abalone_predictions = predict(nn_abalone_model, madelon_X_test)
nn_abalone_accuracy = accuracy_score(nn_abalone_predictions, madelon_y_test)
println("Neural Network Accuracy: $(round(nn_abalone_accuracy * 100, digits=1))%")
io = open("./model/nn_abalone_model.sav", "w")
pickle.dump(nn_abalone_model, io)
close(io)

  0.273388 seconds (21 allocations: 1.109 KiB)
  0.000504 seconds (36 allocations: 6.500 KiB)
Neural Network Accuracy: 64.0%


# KDD Data Set
## ETL

In [29]:
kdd_df = CSV.File("./data/kddcup.data") |> DataFrame;
first(kdd_df, 5)

Unnamed: 0_level_0,0,tcp,http,SF,181,5450,0_1,0_2,0_3,0_4,0_5
Unnamed: 0_level_1,Int64,String,String,String,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,0,tcp,http,SF,239,486,0,0,0,0,0
2,0,tcp,http,SF,235,1337,0,0,0,0,0
3,0,tcp,http,SF,219,1337,0,0,0,0,0
4,0,tcp,http,SF,217,2032,0,0,0,0,0
5,0,tcp,http,SF,217,2032,0,0,0,0,0


## Preprocessing

In [30]:
kdd_df[!,2] = le.fit_transform(kdd_df[!,2])
kdd_df[!,3] = le.fit_transform(kdd_df[!,3])
kdd_df[!,4] = le.fit_transform(kdd_df[!,4])
first(kdd_df, 5)

Unnamed: 0_level_0,0,tcp,http,SF,181,5450,0_1,0_2,0_3,0_4,0_5,1
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,0,1,22,9,239,486,0,0,0,0,0,1
2,0,1,22,9,235,1337,0,0,0,0,0,1
3,0,1,22,9,219,1337,0,0,0,0,0,1
4,0,1,22,9,217,2032,0,0,0,0,0,1
5,0,1,22,9,217,2032,0,0,0,0,0,1


In [31]:
kdd_X_features = convert(Array, kdd_df[!,1:41]);
kdd_y_labels = convert(Array, kdd_df[!,42]);

In [32]:
kdd_X_train, kdd_X_test, kdd_y_train, kdd_y_test = train_test_split(kdd_X_features, kdd_y_labels, test_size=0.3, random_state=42);

## Model Testing

In [33]:
# Naive Bayesian Classifier
nb_kdd_model = GaussianNB()
@time fit!(nb_kdd_model, kdd_X_train, kdd_y_train)
@time nb_kdd_preditions = predict(nb_kdd_model, kdd_X_test)
nb_kdd_accuracy = accuracy_score(nb_kdd_preditions, kdd_y_test)
println("Naive Bayesian Classifier Accuracy: $(round(nb_kdd_accuracy * 100, digits=1))%")
io = open("./model/nb_kdd_model.sav", "w")
pickle.dump(nb_kdd_model, io)
close(io)

  0.694945 seconds (351.06 k allocations: 5.559 MiB)
  1.824012 seconds (1.06 M allocations: 34.208 MiB, 14.89% gc time)
Naive Bayesian Classifier Accuracy: 93.7%


In [34]:
# Decision Tree Classifier
tree_kdd_model = DecisionTreeClassifier()
@time fit!(tree_kdd_model, kdd_X_train, kdd_y_train)
@time tree_kdd_predictions = predict(tree_kdd_model, kdd_X_test)
tree_kdd_accuracy = accuracy_score(tree_kdd_predictions, kdd_y_test)
println("Decision Tree Classifier Accuracy: $(round(tree_kdd_accuracy * 100, digits=1))%")
io = open("./model/tree_kdd_model.sav", "w")
pickle.dump(tree_kdd_model, io)
close(io)

  1.652645 seconds (345.83 k allocations: 5.278 MiB)
  0.212319 seconds (889.29 k allocations: 25.617 MiB)
Decision Tree Classifier Accuracy: 100.0%


In [35]:
# Support Vector Machine
svm_kdd_model = SVC()
@time fit!(svm_kdd_model, madelon_X_train, madelon_y_train)
@time svm_kdd_predictions = predict(svm_kdd_model, madelon_X_test)
svm_kdd_accuracy = accuracy_score(svm_kdd_predictions, madelon_y_test)
println("Support Vector Machine Accuracy: $(round(svm_kdd_accuracy * 100, digits=1))%")
io = open("./model/svm_kdd_model.sav", "w")
pickle.dump(svm_kdd_model, io)
close(io)

  0.074761 seconds (21 allocations: 1.109 KiB)
  0.058261 seconds (36 allocations: 6.500 KiB)
Support Vector Machine Accuracy: 78.8%


In [36]:
# Neural Network
nn_kdd_model = MLPClassifier(max_iter=1000)
@time fit!(nn_kdd_model, madelon_X_train, madelon_y_train)
@time nn_kdd_predictions = predict(nn_kdd_model, madelon_X_test)
nn_kdd_accuracy = accuracy_score(nn_kdd_predictions, madelon_y_test)
println("Neural Network Accuracy: $(round(nn_kdd_accuracy * 100, digits=1))%")
io = open("./model/nn_kdd_model.sav", "w")
pickle.dump(nn_kdd_model, io)
close(io)

  0.356498 seconds (21 allocations: 1.109 KiB)
  0.000580 seconds (36 allocations: 6.500 KiB)
Neural Network Accuracy: 60.5%
