# Dependencies

In [None]:
using ScikitLearn, CSV, DataFrames, PyCall;
@sk_import preprocessing: LabelEncoder;
@sk_import naive_bayes: (CategoricalNB, GaussianNB);
@sk_import metrics: accuracy_score;
@sk_import tree: DecisionTreeClassifier;
@sk_import svm: SVC;
@sk_import neural_network: MLPClassifier;
@sk_import feature_selection: (SelectKBest, chi2);

In [None]:
# Shared helpers used by the data set sections below.
# Python's pickle module (via PyCall) is used to persist the fitted models.
pickle = pyimport("pickle");
# A single LabelEncoder instance that the preprocessing cells re-fit per column.
le = LabelEncoder();
# Short alias for the ScikitLearn.jl train/test splitting function.
train_test_split = ScikitLearn.CrossValidation.train_test_split;

# Car Evaluation Data Set
## ETL

In [None]:
# Load the car data file into a DataFrame.
# Column names are supplied explicitly through `header`.
car_columns = [
    "buying", "maint", "doors", "persons", "lug_boot", "safety", "class",
];
car_df = DataFrame(CSV.File("./data/car.data", header=car_columns));
# Preview the first five rows
first(car_df, 5)

## Preprocessing

In [None]:
# Label-encode every column of the car data, replacing each column with its codes.
# The shared encoder `le` is re-fit on each column in turn.
for name in car_columns
    encoded = le.fit_transform(car_df[!, name]);
    car_df[!, name] = encoded;
end
# Preview the encoded frame
first(car_df, 5)

In [None]:
# Feature and Label extraction
# Columns 1-6 are the features, column 7 is the label.
# `Matrix(df)` is the supported DataFrame-to-array conversion; `convert(Array, df)`
# is deprecated/removed in newer DataFrames releases.
car_X = Matrix(car_df[:, 1:6]);
# `car_df[:, 7]` already yields a freshly copied Vector, so no conversion is needed.
car_y = car_df[:, 7];

In [None]:
# Hold out 30% of the car rows for testing; the fixed seed keeps the split repeatable.
car_X_train, car_X_test, car_y_train, car_y_test = train_test_split(
    car_X, car_y; test_size=0.3, random_state=100
);

## Model Testing

In [None]:
# Naive Bayesian Classifier
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
nb_car_model = CategoricalNB();
@time fit!(nb_car_model, car_X_train, car_y_train);
@time nb_car_predictions = predict(nb_car_model, car_X_test);
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
nb_car_accuracy = accuracy_score(car_y_test, nb_car_predictions);
println("Naive Bayesian Classifier Accuracy: $(round(nb_car_accuracy * 100, digits=1))%");
# The do-block form closes the file even if pickling throws.
open("./model/nb_car_model.sav", "w") do io
    pickle.dump(nb_car_model, io);
end;

In [None]:
# Decision Tree Classifier
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
tree_car_model = DecisionTreeClassifier();
@time fit!(tree_car_model, car_X_train, car_y_train);
@time tree_car_predictions = predict(tree_car_model, car_X_test);
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
tree_car_accuracy = accuracy_score(car_y_test, tree_car_predictions);
println("Decision Tree Classifier Accuracy: $(round(tree_car_accuracy * 100, digits=1))%");
# The do-block form closes the file even if pickling throws.
open("./model/tree_car_model.sav", "w") do io
    pickle.dump(tree_car_model, io);
end;

In [None]:
# Support Vector Machine
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
svm_car_model = SVC();
@time fit!(svm_car_model, car_X_train, car_y_train);
@time svm_car_predictions = predict(svm_car_model, car_X_test);
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
svm_car_accuracy = accuracy_score(car_y_test, svm_car_predictions);
println("Support Vector Machine Accuracy: $(round(svm_car_accuracy * 100, digits=1))%");
# The do-block form closes the file even if pickling throws.
open("./model/svm_car_model.sav", "w") do io
    pickle.dump(svm_car_model, io);
end;

In [None]:
# Neural Network
# Fit on the training split (at most 1000 iterations), time fit/predict, report test
# accuracy, then persist the model.
nn_car_model = MLPClassifier(max_iter=1000);
@time fit!(nn_car_model, car_X_train, car_y_train);
@time nn_car_predictions = predict(nn_car_model, car_X_test);
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
nn_car_accuracy = accuracy_score(car_y_test, nn_car_predictions);
println("Neural Network Accuracy: $(round(nn_car_accuracy * 100, digits=1))%");
# The do-block form closes the file even if pickling throws.
open("./model/nn_car_model.sav", "w") do io
    pickle.dump(nn_car_model, io);
end;

# Abalone Data Set
## ETL

In [None]:
# Load the abalone data file into a DataFrame.
# Column names are supplied explicitly through `header`.
abalone_columns = [
    "sex", "length", "diameter", "height", "whole_weight",
    "shucked_weight", "viscera_weight", "shell_weight", "rings",
];
abalone_df = DataFrame(CSV.File("./data/abalone.data", header=abalone_columns));
# Preview the first five rows
first(abalone_df, 5)

## Preprocessing

In [None]:
# Encode sex
# Replace the `sex` column with its label-encoded integer codes.
abalone_df[!,:sex] = le.fit_transform(abalone_df[!,:sex]);
# No trailing semicolon: the preview is the cell's displayed result, as in the other
# preprocessing cells.
first(abalone_df, 5)

In [None]:
# Categorize rings into 3 groups based on age (age = rings + 1.5):
#   1 -> 1 < age < 9
#   2 -> age < 10 (and not in group 1)
#   3 -> everything else
function abalone_age_group(ring)
    age = ring + 1.5;
    if 1 < age < 9
        return 1;
    elseif age < 10
        return 2;
    else
        return 3;
    end
end
# Apply the mapping with `map` instead of a top-level loop that bumps a global counter;
# that pattern only works under the notebook/REPL soft-scope rules.
# NOTE: not idempotent - re-running this cell on already-binned values (1, 2, 3) maps
# all of them to group 1.
abalone_df[!, :rings] = map(abalone_age_group, abalone_df[!, :rings]);

In [None]:
# Feature and label extraction
# Columns 1-8 are the features, column 9 (rings) is the label.
# `Matrix(df)` is the supported DataFrame-to-array conversion; `convert(Array, df)`
# is deprecated/removed in newer DataFrames releases.
abalone_X_features = Matrix(abalone_df[:, 1:8]);
# `abalone_df[:, 9]` already yields a freshly copied Vector, so no conversion is needed.
abalone_y_labels = abalone_df[:, 9];

In [None]:
# Split data into testing and training subsets
# 30% of the rows are held out for testing. A fixed `random_state` makes the split (and
# therefore the reported accuracies) reproducible, matching the car and KDD splits.
abalone_X_train, abalone_X_test, abalone_y_train, abalone_y_test = train_test_split(
    abalone_X_features, abalone_y_labels, test_size=0.3, random_state=42
);

## Model Testing

In [None]:
# Naive Bayesian Classifier
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
nb_abalone_model = GaussianNB();
@time fit!(nb_abalone_model, abalone_X_train, abalone_y_train);
@time nb_abalone_predictions = predict(nb_abalone_model, abalone_X_test);
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
nb_abalone_accuracy = accuracy_score(abalone_y_test, nb_abalone_predictions);
println("Naive Bayesian Classifier Accuracy: $(round(nb_abalone_accuracy * 100, digits=1))%");
# The do-block form closes the file even if pickling throws.
open("./model/nb_abalone_model.sav", "w") do io
    pickle.dump(nb_abalone_model, io);
end;

In [None]:
# Decision Tree Classifier
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
tree_abalone_model = DecisionTreeClassifier();
@time fit!(tree_abalone_model, abalone_X_train, abalone_y_train);
@time tree_abalone_predictions = predict(tree_abalone_model, abalone_X_test);
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
tree_abalone_accuracy = accuracy_score(abalone_y_test, tree_abalone_predictions);
println("Decision Tree Classifier Accuracy: $(round(tree_abalone_accuracy * 100, digits=1))%");
# The do-block form closes the file even if pickling throws.
open("./model/tree_abalone_model.sav", "w") do io
    pickle.dump(tree_abalone_model, io);
end;

In [None]:
# Support Vector Machine
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
svm_abalone_model = SVC();
@time fit!(svm_abalone_model, abalone_X_train, abalone_y_train);
@time svm_abalone_predictions = predict(svm_abalone_model, abalone_X_test);
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
svm_abalone_accuracy = accuracy_score(abalone_y_test, svm_abalone_predictions);
println("Support Vector Machine Accuracy: $(round(svm_abalone_accuracy * 100, digits=1))%");
# The do-block form closes the file even if pickling throws.
open("./model/svm_abalone_model.sav", "w") do io
    pickle.dump(svm_abalone_model, io);
end;

In [None]:
# Neural Network
# Fit on the training split (at most 1000 iterations), time fit/predict, report test
# accuracy, then persist the model.
nn_abalone_model = MLPClassifier(max_iter=1000);
@time fit!(nn_abalone_model, abalone_X_train, abalone_y_train);
@time nn_abalone_predictions = predict(nn_abalone_model, abalone_X_test);
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
nn_abalone_accuracy = accuracy_score(abalone_y_test, nn_abalone_predictions);
println("Neural Network Accuracy: $(round(nn_abalone_accuracy * 100, digits=1))%");
# The do-block form closes the file even if pickling throws.
open("./model/nn_abalone_model.sav", "w") do io
    pickle.dump(nn_abalone_model, io);
end;

# Madelon Data Set
## ETL

In [None]:
# Read madelon data csv into DataFrame
# `header=false` keeps the first record as data; by default CSV.File treats row 1 as
# column names, which would silently drop one sample from every file.
# NOTE(review): assumes the raw Madelon files have no header row - confirm.
madelon_test_df = CSV.File("./data/madelon/madelon_test.data", header=false) |> DataFrame;
madelon_train_df = CSV.File("./data/madelon/madelon_train.data", header=false) |> DataFrame;
madelon_valid_df = CSV.File("./data/madelon/madelon_valid.data", header=false) |> DataFrame;
madelon_train_labels = CSV.File("./data/madelon/madelon_train.labels", header=false) |> DataFrame;
# Legacy alias for the previously misspelled name, kept so existing references still work.
madleon_valid_df = madelon_valid_df;

## Preprocessing

In [None]:
# Merge training features with training labels
# The label vector must end up in column 501, which the extraction cell reads.
# Current DataFrames releases reject integer assignment to a column that does not exist
# yet, so the column is appended by name when the frame has exactly 500 columns.
madelon_label_column = madelon_train_labels[!, 1];
if ncol(madelon_train_df) >= 501
    # Column 501 already exists: overwrite it, exactly as the integer assignment did.
    madelon_train_df[!, 501] = madelon_label_column;
elseif ncol(madelon_train_df) == 500
    # Appending by name creates the new column at position 501.
    madelon_train_df[!, :label] = madelon_label_column;
else
    throw(DimensionMismatch(
        "expected at least 500 feature columns, got $(ncol(madelon_train_df))"
    ));
end

In [None]:
# Feature and label extraction
# Columns 1-500 are the features, column 501 holds the merged training labels.
# `Matrix(df)` is the supported DataFrame-to-array conversion; `convert(Array, df)`
# is deprecated/removed in newer DataFrames releases.
madelon_X_features = Matrix(madelon_train_df[:, 1:500]);
# `madelon_train_df[:, 501]` already yields a freshly copied Vector.
madelon_y_labels = madelon_train_df[:, 501];

In [None]:
# Keep only the 20 features ranked best by the chi-squared score against the labels,
# dropping the rest to reduce overfitting.
# NOTE(review): the selector is fit on all rows before the train/test split, so the
# held-out rows influence which features are kept - consider fitting on the training
# split only.
madelon_selector = SelectKBest(chi2, k=20);
madelon_X_features = madelon_selector.fit_transform(madelon_X_features, madelon_y_labels);

In [None]:
# Split data into testing and training subsets
# 30% of the rows are held out for testing. A fixed `random_state` makes the split (and
# therefore the reported accuracies) reproducible, matching the car and KDD splits.
madelon_X_train, madelon_X_test, madelon_y_train, madelon_y_test = train_test_split(
    madelon_X_features, madelon_y_labels, test_size=0.3, random_state=42
);

## Model Testing

In [None]:
# Naive Bayesian Classifier
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
nb_madelon_model = GaussianNB()
@time fit!(nb_madelon_model, madelon_X_train, madelon_y_train)
@time nb_madelon_predictions = predict(nb_madelon_model, madelon_X_test)
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
nb_madelon_accuracy = accuracy_score(madelon_y_test, nb_madelon_predictions)
println("Naive Bayesian Classifier Accuracy: $(round(nb_madelon_accuracy * 100, digits=1))%")
# The do-block form closes the file even if pickling throws.
open("./model/nb_madelon_model.sav", "w") do io
    pickle.dump(nb_madelon_model, io)
end

In [None]:
# Decision Tree Classifier
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
tree_madelon_model = DecisionTreeClassifier()
@time fit!(tree_madelon_model, madelon_X_train, madelon_y_train)
@time tree_madelon_predictions = predict(tree_madelon_model, madelon_X_test)
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
tree_madelon_accuracy = accuracy_score(madelon_y_test, tree_madelon_predictions)
println("Decision Tree Classifier Accuracy: $(round(tree_madelon_accuracy * 100, digits=1))%")
# The do-block form closes the file even if pickling throws.
open("./model/tree_madelon_model.sav", "w") do io
    pickle.dump(tree_madelon_model, io)
end

In [None]:
# Support Vector Machine
# Fit on the madelon training split, time fit/predict, report test accuracy, then persist
# the model. The madelon-specific names and output path keep this cell from clobbering
# the abalone SVM variables and "./model/svm_abalone_model.sav".
svm_madelon_model = SVC()
@time fit!(svm_madelon_model, madelon_X_train, madelon_y_train)
@time svm_madelon_predictions = predict(svm_madelon_model, madelon_X_test)
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
svm_madelon_accuracy = accuracy_score(madelon_y_test, svm_madelon_predictions)
println("Support Vector Machine Accuracy: $(round(svm_madelon_accuracy * 100, digits=1))%")
# The do-block form closes the file even if pickling throws.
open("./model/svm_madelon_model.sav", "w") do io
    pickle.dump(svm_madelon_model, io)
end

In [None]:
# Neural Network
# Fit on the madelon training split (at most 1000 iterations), time fit/predict, report
# test accuracy, then persist the model. The madelon-specific names and output path keep
# this cell from clobbering the abalone network variables and
# "./model/nn_abalone_model.sav".
nn_madelon_model = MLPClassifier(max_iter=1000)
@time fit!(nn_madelon_model, madelon_X_train, madelon_y_train)
@time nn_madelon_predictions = predict(nn_madelon_model, madelon_X_test)
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
nn_madelon_accuracy = accuracy_score(madelon_y_test, nn_madelon_predictions)
println("Neural Network Accuracy: $(round(nn_madelon_accuracy * 100, digits=1))%")
# The do-block form closes the file even if pickling throws.
open("./model/nn_madelon_model.sav", "w") do io
    pickle.dump(nn_madelon_model, io)
end

# KDD Data Set
## ETL

In [None]:
# Read KDD Cup data csv into DataFrame
# `header=false` keeps the first record as data; by default CSV.File treats row 1 as
# column names, which would silently drop one sample.
# NOTE(review): assumes the raw kddcup.data file has no header row - confirm.
kdd_df = CSV.File("./data/kddcup.data", header=false) |> DataFrame;
first(kdd_df, 5)

## Preprocessing

In [None]:
# Label-encode columns 2 through 4, replacing each with its integer codes.
# NOTE(review): presumably these are the categorical fields of the KDD records - confirm.
for col in 2:4
    kdd_df[!, col] = le.fit_transform(kdd_df[!, col])
end
# Preview the encoded frame
first(kdd_df, 5)

In [None]:
# Feature and label extraction
# Columns 1-41 are the features, column 42 is the label.
# `Matrix(df)` is the supported DataFrame-to-array conversion; `convert(Array, df)`
# is deprecated/removed in newer DataFrames releases.
kdd_X_features = Matrix(kdd_df[!, 1:41]);
# `kdd_df[!, 42]` is already a Vector (the column itself, not a copy).
kdd_y_labels = kdd_df[!, 42];

In [None]:
# Hold out 30% of the KDD rows for testing; the fixed seed keeps the split repeatable.
kdd_X_train, kdd_X_test, kdd_y_train, kdd_y_test = train_test_split(
    kdd_X_features, kdd_y_labels; test_size=0.3, random_state=42
);

## Model Testing

In [None]:
# Naive Bayesian Classifier
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
nb_kdd_model = GaussianNB()
@time fit!(nb_kdd_model, kdd_X_train, kdd_y_train)
@time nb_kdd_predictions = predict(nb_kdd_model, kdd_X_test)
# Legacy alias for the previously misspelled name, kept so existing references still work.
nb_kdd_preditions = nb_kdd_predictions
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
nb_kdd_accuracy = accuracy_score(kdd_y_test, nb_kdd_predictions)
println("Naive Bayesian Classifier Accuracy: $(round(nb_kdd_accuracy * 100, digits=1))%")
# The do-block form closes the file even if pickling throws.
open("./model/nb_kdd_model.sav", "w") do io
    pickle.dump(nb_kdd_model, io)
end

In [None]:
# Decision Tree Classifier
# Fit on the training split, time fit/predict, report test accuracy, then persist the model.
tree_kdd_model = DecisionTreeClassifier()
@time fit!(tree_kdd_model, kdd_X_train, kdd_y_train)
@time tree_kdd_predictions = predict(tree_kdd_model, kdd_X_test)
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
tree_kdd_accuracy = accuracy_score(kdd_y_test, tree_kdd_predictions)
println("Decision Tree Classifier Accuracy: $(round(tree_kdd_accuracy * 100, digits=1))%")
# The do-block form closes the file even if pickling throws.
open("./model/tree_kdd_model.sav", "w") do io
    pickle.dump(tree_kdd_model, io)
end

In [None]:
# Support Vector Machine
# Fit and evaluate on the KDD splits, so the reported accuracy and the saved
# "svm_kdd_model.sav" describe a KDD model rather than a second madelon one.
svm_kdd_model = SVC()
@time fit!(svm_kdd_model, kdd_X_train, kdd_y_train)
@time svm_kdd_predictions = predict(svm_kdd_model, kdd_X_test)
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
svm_kdd_accuracy = accuracy_score(kdd_y_test, svm_kdd_predictions)
println("Support Vector Machine Accuracy: $(round(svm_kdd_accuracy * 100, digits=1))%")
# The do-block form closes the file even if pickling throws.
open("./model/svm_kdd_model.sav", "w") do io
    pickle.dump(svm_kdd_model, io)
end

In [None]:
# Neural Network
# Fit (at most 1000 iterations) and evaluate on the KDD splits, so the reported accuracy
# and the saved "nn_kdd_model.sav" describe a KDD model rather than a second madelon one.
nn_kdd_model = MLPClassifier(max_iter=1000)
@time fit!(nn_kdd_model, kdd_X_train, kdd_y_train)
@time nn_kdd_predictions = predict(nn_kdd_model, kdd_X_test)
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two.
nn_kdd_accuracy = accuracy_score(kdd_y_test, nn_kdd_predictions)
println("Neural Network Accuracy: $(round(nn_kdd_accuracy * 100, digits=1))%")
# The do-block form closes the file even if pickling throws.
open("./model/nn_kdd_model.sav", "w") do io
    pickle.dump(nn_kdd_model, io)
end