<a href="https://colab.research.google.com/github/kmlporto/machinelearning/blob/dev-teste/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import math
import statistics
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from collections import Counter
import arff
from sklearn.model_selection import cross_val_score
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold

In [0]:

def cross_validation(X, y, classifier):
    skf = StratifiedKFold(n_splits=5)
    acc_sum = 0
    for train, test in skf.split(X, y):
        acc_sum += get_accuracy(X.loc[train, :], X.loc[test, :], y[train], y[test], classifier)
    return round(acc_sum / 5)

def get_accuracy(x_train, x_test, y_train, y_test, classifier):    
    classifier = classifier.fit(x_train, y_train)
    result = classifier.predict(x_test)
    acc = metrics.accuracy_score(result, y_test)
    return acc * 100


In [5]:
# ========== ABALONE ===========
url_abalone = "https://raw.githubusercontent.com/kmlporto/machinelearning/master/database/abalone.data"
col_names_abalone = ['label', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']
feature_cols_abalone = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']

abalone_results = {
    'tree': 0,
    '5nn': 0,
    '10nn': 0,
    'mlp': 0,
    'mlp2': 0,
    'kmeans': 0,
}


dataset_abalone = pd.read_csv(url_abalone, header=None, names=col_names_abalone)

x = dataset_abalone[feature_cols_abalone]
y = dataset_abalone.label

# TREE DECISION
model_tree = tree.DecisionTreeClassifier(criterion="entropy")
abalone_results['tree'] = cross_validation(x, y, model_tree)

# 5-NN
model_5nn = KNeighborsClassifier(n_neighbors=5, metric='euclidean', algorithm='brute')
abalone_results['5nn'] = cross_validation(x, y, model_5nn)

# 10-NN
model_10nn = KNeighborsClassifier(n_neighbors=10, metric='euclidean', algorithm='brute')
abalone_results['10nn'] = cross_validation(x, y, model_10nn)

# MLP
model_mlp = MLPClassifier(hidden_layer_sizes=(4, 2), activation='tanh',max_iter=2000)
abalone_results['mlp'] = cross_validation(x, y, model_mlp)

# MLP2
model_mlp2 = MLPClassifier(hidden_layer_sizes=(7, 3), activation='tanh',max_iter=2000)
abalone_results['mlp2'] = cross_validation(x, y, model_mlp2)

print(abalone_results)

{'tree': 49.0, '5nn': 52.0, '10nn': 52.0, 'mlp': 54.0, 'mlp2': 54.0, 'kmeans': 0}


In [6]:
# =========== WINE =============
url_wine = "https://raw.githubusercontent.com/kmlporto/machinelearning/master/database/wine.data"
col_names_wine = ['label', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
feature_cols_wine = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']

wine_results = {
    'tree': 0,
    '5nn': 0,
    '10nn': 0,
    'mlp': 0,
    'mlp2': 0,
    'kmeans': 0
}

dataset_wine = pd.read_csv(url_wine, header=None, names=col_names_wine)

x = dataset_wine[feature_cols_wine]
y = dataset_wine.label

# TREE DECISION
model_tree = tree.DecisionTreeClassifier(criterion="entropy")
wine_results['tree'] = cross_validation(x, y, model_tree)

# 5-NN
model_5nn = KNeighborsClassifier(n_neighbors=5, metric='euclidean', algorithm='brute')
wine_results['5nn'] = cross_validation(x, y, model_5nn)

# 10-NN
model_10nn = KNeighborsClassifier(n_neighbors=10, metric='euclidean', algorithm='brute')
wine_results['10nn'] = cross_validation(x, y, model_10nn)

# MLP
model_mlp = MLPClassifier(hidden_layer_sizes=(10, 5), activation='tanh',max_iter=2000)
wine_results['mlp'] = cross_validation(x, y, model_mlp)

# MLP
model_mlp2 = MLPClassifier(hidden_layer_sizes=(6, 3), activation='tanh',max_iter=2000)
wine_results['mlp2'] = cross_validation(x, y, model_mlp2)

print(wine_results)


{'tree': 92.0, '5nn': 69.0, '10nn': 69.0, 'mlp': 60.0, 'mlp2': 61.0, 'kmeans': 0}


In [12]:
# =========== WEBSITE PHISHING ============

url_phishing = "https://raw.githubusercontent.com/kmlporto/machinelearning/master/database/PhishingData.arff"
col_names_phishing = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'label']
feature_cols_phishing = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9']

phishing_results = {
    'tree': 0,
    '5nn': 0,
    '10nn': 0,
    'mlp': 0,
    'mlp2': 0,
    'kmeans': 0,
}

list_phishing = list()
for row in arff.load(url_phishing):
    list_phishing.append(list(row))
dataset_phishing = pd.DataFrame(list_phishing, columns=col_names_phishing)

x = dataset_phishing[feature_cols_phishing]
y = dataset_phishing.label

# TREE DECISION
model_tree = tree.DecisionTreeClassifier(criterion="entropy")
phishing_results['tree'] = cross_validation(x, y, model_tree)

# 5-NN
model_5nn = KNeighborsClassifier(n_neighbors=5, metric='euclidean', algorithm='brute')
phishing_results['5nn'] = cross_validation(x, y, model_5nn)

# 10-NN
model_10nn = KNeighborsClassifier(n_neighbors=10, metric='euclidean', algorithm='brute')
phishing_results['10nn'] = cross_validation(x, y, model_10nn)

# MLP
model_mlp = MLPClassifier(hidden_layer_sizes=(4, 2), activation='tanh',max_iter=2000)
phishing_results['mlp'] = cross_validation(x, y, model_mlp)

# MLP2
model_mlp2 = MLPClassifier(hidden_layer_sizes=(10, 5), activation='tanh',max_iter=2000)
phishing_results['mlp2'] = cross_validation(x, y, model_mlp2)

print(phishing_results)

FileNotFoundError: ignored