In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.preprocessing import RobustScaler
from time import time

In [2]:
# Load the UCI red-wine quality data set (semicolon-separated CSV).
# The address lives in a named constant and no longer carries the stray
# trailing space that followed ".csv" in the original literal.
WINE_QUALITY_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
df = pd.read_csv(WINE_QUALITY_URL, sep=';')

In [3]:
# Feature matrix: every column except the raw score and the derived binary
# target. 'binary_quality' is dropped too (and ignored when it does not exist
# yet) so that re-running this cell after the column has been added to df does
# not leak the label into the features. The axis is passed by keyword because
# the positional form is no longer accepted by recent pandas releases.
features = df.drop(['quality', 'binary_quality'], axis=1, errors='ignore').values
def isGood(quality):
    """Turn a wine quality score into a binary label.

    Returns 1 when ``quality`` is at least 6, otherwise 0.
    """
    return 1 if quality >= 6 else 0
# Store the binarised score on the frame next to the original columns, then
# pull the same column out as the label vector.
df['binary_quality'] = df['quality'].map(isGood)
label = df['binary_quality'].values

In [4]:
# Hold out 20% of the samples for testing. The split is stratified on the
# label and seeded so that it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=42,
    stratify=label,
)

In [5]:
# Rescale the features: the scaler is fitted on the training split only and
# that fitted transformation is then applied to both splits.
scaler = RobustScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [6]:
def random_grid_search_result(random_search, n_iter_search):
    """Run a randomized hyperparameter search and print a summary of it.

    The search is fitted on the notebook-level ``X_train`` / ``y_train``.
    Printed, in order: the elapsed fitting time, the best cross-validated
    score, the best parameter setting, and the accuracy of the best estimator
    on the notebook-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    random_search : search object
        Must provide ``fit``, ``best_score_``, ``best_params_`` and
        ``best_estimator_`` (e.g. a ``RandomizedSearchCV`` left at its
        default ``refit=True``).
    n_iter_search : int
        Number of sampled candidates; only used in the timing message.
    """
    start = time()
    random_search.fit(X_train, y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
    print(random_search.best_score_)
    print(random_search.best_params_)
    # With the default refit=True the search has already refitted the winning
    # configuration on the whole training set, so best_estimator_ is used as
    # is instead of being fitted a second time.
    best_clf = random_search.best_estimator_
    predicted = best_clf.predict(X_test)
    print("Accuracy "+ " : "+str(np.mean(predicted == y_test)))

In [None]:
def random_grid_search_knn():
    """Randomized hyperparameter search for a k-nearest-neighbours classifier.

    Samples 60 candidates (seeded with ``random_state=42``) from the grid
    below and hands the search to ``random_grid_search_result`` for fitting
    and reporting.
    """
    # Distribution of hyperparameter.
    # n_jobs is deliberately not part of the grid: it only controls
    # parallelism, so sampling it produced candidates that differ in nothing
    # that affects predictions and a meaningless entry in best_params_.
    param_dist = {"n_neighbors": list(range(1, 31)),
                  "weights": ['uniform', 'distance'],
                  "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
                  "leaf_size": list(range(15, 45)),
                  "metric": ['euclidean', 'manhattan', 'chebyshev'],
                 }
    clf = KNeighborsClassifier()
    n_iter_search = 60
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, random_state=42)
    random_grid_search_result(random_search, n_iter_search)


random_grid_search_knn()

In [None]:
def random_grid_search_svm():
    """Randomized search over ``C`` and ``gamma`` for a default ``SVC``.

    Both parameters are drawn from powers of ten; 60 candidates are sampled
    with a fixed seed and the search is passed to
    ``random_grid_search_result`` for fitting and reporting.
    """
    c_exponents = np.arange(-3, 8)
    gamma_exponents = np.arange(-5, 4)
    search_space = {
        'C': 10. ** c_exponents,
        'gamma': 10. ** gamma_exponents,
    }
    n_candidates = 60
    search = RandomizedSearchCV(
        SVC(),
        param_distributions=search_space,
        n_iter=n_candidates,
        random_state=42,
    )
    random_grid_search_result(search, n_candidates)


random_grid_search_svm()

In [None]:
def random_grid_search_decision_tree():
    """Randomized hyperparameter search for a seeded decision tree.

    Samples 60 candidates with a fixed seed from the grid below and passes
    the search to ``random_grid_search_result`` for fitting and reporting.
    """
    search_space = {
        "criterion": ["gini", "entropy"],
        "min_samples_split": list(range(2, 80)),
        "max_depth": [None, 1, 5, 10, 1000, 1500],
        "min_samples_leaf": [1, 5, 10],
        "max_leaf_nodes": [None, 5, 10, 20, 100, 1000],
    }
    n_candidates = 60
    search = RandomizedSearchCV(
        DecisionTreeClassifier(random_state=42),
        param_distributions=search_space,
        n_iter=n_candidates,
        random_state=42,
    )
    random_grid_search_result(search, n_candidates)


random_grid_search_decision_tree()

In [12]:
def random_grid_search_ann():
    """Randomized hyperparameter search for a seeded ``MLPClassifier``.

    Samples 60 candidates from the grid below and hands the search to
    ``random_grid_search_result`` for fitting and reporting. The search is
    seeded with ``random_state=42`` like the other searches in this notebook,
    so the same candidates are drawn on every run.
    """
    # Distribution of hyperparameter
    param_dist = {'hidden_layer_sizes': np.arange(5, 12),
                  'activation': ['identity', 'logistic', 'tanh', 'relu'],
                  'solver': ['lbfgs', 'sgd', 'adam'],
                  'learning_rate_init': [0.005, 0.05, 0.001],
                  'max_iter': [500, 1000, 1500],
                  'alpha': 10.0 ** -np.arange(1, 7),
                 }
    clf = MLPClassifier(random_state=42)
    n_iter_search = 60
    # Previously unseeded: the sampled candidates (and therefore the reported
    # best parameters) changed from run to run.
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, random_state=42)
    random_grid_search_result(random_search, n_iter_search)


random_grid_search_ann()

RandomizedSearchCV took 49.27 seconds for 60 candidates parameter settings.
0.755277560594
{'solver': 'adam', 'max_iter': 1000, 'learning_rate_init': 0.05, 'hidden_layer_sizes': 5, 'alpha': 0.10000000000000001, 'activation': 'logistic'}
Accuracy  : 0.75


In [7]:
# Baseline models with default hyperparameters, keyed by their display name.
# SVC, the decision tree and the MLP are seeded with random_state=42.
dict_classifiers = dict([
    ("Nearest Neighbors", KNeighborsClassifier()),
    ("SVC", SVC(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Naive Bayes", GaussianNB()),
    ("Neural Net", MLPClassifier(random_state=42)),
])

In [8]:
# Fit every baseline model on the training split, predict the test split, and
# print its accuracy together with the wall-clock training and testing times.
for name, classifier in dict_classifiers.items():
    start_training = time()
    clf = classifier.fit(X_train, y_train)
    end_training = time()

    start_testing = time()
    predicted = clf.predict(X_test)
    end_testing = time()

    report = (
        ("Accuracy of ", str(np.mean(predicted == y_test))),
        ("Training time of ", str(end_training - start_training)),
        ("Testing time of ", str(end_testing - start_testing)),
    )
    for prefix, value in report:
        print(prefix + name + " : " + value)

Accuracy of Nearest Neighbors : 0.725
Training time of Nearest Neighbors : 0.0031719207763671875
Testing time of Nearest Neighbors : 0.010621786117553711
Accuracy of SVC : 0.74375
Training time of SVC : 0.05344796180725098
Testing time of SVC : 0.0069789886474609375
Accuracy of Decision Tree : 0.75625
Training time of Decision Tree : 0.008275985717773438
Testing time of Decision Tree : 0.0001628398895263672
Accuracy of Naive Bayes : 0.721875
Training time of Naive Bayes : 0.0019431114196777344
Testing time of Naive Bayes : 0.0003120899200439453
Accuracy of Neural Net : 0.74375
Training time of Neural Net : 1.3418009281158447
Testing time of Neural Net : 0.0009109973907470703




In [None]:
# One histogram per column of df; the trailing semicolon hides the returned
# axes array from the cell output.
df.hist(figsize=[15, 15]);

In [None]:
# Heatmap of the pairwise correlations between the columns of df, with each
# coefficient written into its cell.
plt.style.use('ggplot')
correlation_matrix = df.corr()
plt.figure(figsize=(10, 8))
ax = sns.heatmap(
    correlation_matrix,
    vmax=1,
    square=True,
    annot=True,
    cmap='coolwarm',
)
plt.show()

In [None]:
%%time
from sklearn import neighbors, linear_model
# 3-nearest-neighbours classifier: fit on the training split and report the
# accuracy on the test split.
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn_model = knn.fit(X_train, y_train)
print('k-NN accuracy for test set: %f' % (knn_model.score(X_test, y_test),))

In [None]:
%%time
# SVM
from sklearn import svm
# Support vector classifier with default settings; fit() returns the
# estimator itself, so construction and fitting are chained.
svm_model = svm.SVC().fit(X_train, y_train)
print('SVM accuracy for test set: %f' % (svm_model.score(X_test, y_test),))

In [None]:
# Preview only the first rows of the training matrix rather than displaying
# the whole array.
X_train[:5]

In [None]:
%%time
from sklearn import neighbors, linear_model

# 3-nearest-neighbours classifier on the current X_train / X_test; reports
# the test-set accuracy.
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn_model = knn.fit(X_train, y_train)
print('k-NN accuracy for test set after rescaling: %f' % (knn_model.score(X_test, y_test),))

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the k-NN model on the test split.
y_true = y_test
y_pred = knn_model.predict(X_test)
print(classification_report(y_true, y_pred))

In [None]:
%%time
# SVM
from sklearn import svm
# svm.SVC() is left at its defaults, and the default kernel is RBF, not
# linear. The printed label used to say "Linear SVM"; it now names the kernel
# that is actually used.
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
print('SVM (RBF kernel) accuracy for test set after rescaling: %f' % svm_model.score(X_test, y_test))

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the SVM model on the test split.
y_true = y_test
y_pred = svm_model.predict(X_test)
print(classification_report(y_true, y_pred))

In [None]:
%%time
# Nonlinear SVM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import LinearSVC

# Degree-3 polynomial feature expansion, standardisation, then a linear SVM
# with hinge loss. The steps are passed as a list: Pipeline documents `steps`
# as a list of (name, estimator) tuples, and current scikit-learn releases
# fail when given a tuple of steps as the original did.
polynomial_svm_model = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(C=10, loss="hinge")),
])
polynomial_svm_model.fit(X_train, y_train)
print('Nonlinear SVM accuracy for test set: %f' % polynomial_svm_model.score(X_test, y_test))

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the polynomial SVM pipeline on the
# test split.
y_true = y_test
y_pred = polynomial_svm_model.predict(X_test)
print(classification_report(y_true, y_pred))

In [None]:
%%time
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Shallow (depth-2) decision tree. It is seeded with random_state=42 like the
# other decision trees in this notebook so that the fitted tree and the
# reported accuracy are the same on every run.
tree_model = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_model.fit(X_train, y_train)
print('Decision Tree accuracy for test set: %f' % tree_model.score(X_test, y_test))

In [None]:
from sklearn.tree import export_graphviz

# With out_file=None the DOT description of the tree is returned as a string
# (and shown as the cell output) instead of being written to a file.
export_graphviz(tree_model, out_file=None, class_names=['0', '1'])

In [None]:
# Plot the relationship between tree depths and prediction power:
# test-set accuracy of a decision tree for every max_depth from 1 to 199.
# Note that `tree_model` is rebound on each iteration, so after this cell it
# refers to the deepest tree.
depth = []
accuracy = []
for x in range(1, 200):
    # Seeded so that the curve is identical on every run.
    tree_model = DecisionTreeClassifier(max_depth=x, random_state=42)
    tree_model.fit(X_train, y_train)
    accu = tree_model.score(X_test, y_test)
    depth.append(x)
    accuracy.append(accu)
# The axis labels do not depend on the loop variable, so they are set once
# here instead of on every iteration.
plt.xlabel("Depth", fontsize=14)
plt.ylabel("Accuracy", fontsize=14)
plt.plot(depth, accuracy)
plt.show()

In [None]:
%%time
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings; fit() returns the estimator
# itself, so construction and fitting are chained.
nb_model = GaussianNB().fit(X_train, y_train)
print('Naive Bayes accuracy for test set: %f' % (nb_model.score(X_test, y_test),))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# MultinomialNB refuses to fit on negative feature values, and X_train was
# centred by RobustScaler in the scaling cell, so it contains them. Mapping
# the features into [0, 1] first (range learned from the training split only)
# lets the model fit instead of raising a ValueError.
nb_model = make_pipeline(MinMaxScaler(), MultinomialNB())
nb_model.fit(X_train, y_train)
print('Naive Bayes accuracy for test set: %f' % nb_model.score(X_test, y_test))

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes with default settings; fit() returns the estimator
# itself, so construction and fitting are chained.
nb_model = BernoulliNB().fit(X_train, y_train)
print('Naive Bayes accuracy for test set: %f' % (nb_model.score(X_test, y_test),))

In [None]:
%%time
# Neural Network
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with three hidden layers of 13 units each. It is
# seeded with random_state=42, like the other MLPs in this notebook, so that
# weight initialisation and the reported accuracy are reproducible.
mlp_model = MLPClassifier(hidden_layer_sizes=(13, 13, 13), max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)
print('Neural Network accuracy for test set: %f' % mlp_model.score(X_test, y_test))

In [None]:
%%time
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix

# Seeded multi-layer perceptron with two hidden layers of 10 units, trained
# with plain SGD at an initial learning rate of 0.1.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(10, 10),
    solver='sgd',
    learning_rate_init=0.1,
    max_iter=500,
    random_state=42,
)
mlp_model.fit(X_train, y_train)
print('Neural Network accuracy for test set: %f' % (mlp_model.score(X_test, y_test),))

In [None]:
# Left out


def random_grid_search_naive_bayes():
    """Randomized search over two class-prior settings for ``GaussianNB``.

    Two candidates are requested from a two-entry grid and the search is
    passed to ``random_grid_search_result`` for fitting and reporting.
    """
    search_space = {
        'priors': [[0.1, 0.9], [0.9, 0.1]],
    }
    n_candidates = 2
    search = RandomizedSearchCV(
        GaussianNB(),
        param_distributions=search_space,
        n_iter=n_candidates,
    )
    random_grid_search_result(search, n_candidates)


random_grid_search_naive_bayes()