In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
import pyspark

# Build the Spark configuration in one fluent chain: the application name first,
# then the remaining settings as (key, value) pairs applied in the listed order.
# NOTE(review): the "spark.yarn.executor.memory" / "spark.yarn.driver.memory"
# keys are passed through verbatim; confirm against the Spark configuration
# reference that these are property names Spark actually reads.
conf = (
    pyspark.SparkConf()
    .setAppName('Sample machine learning')
    .setAll([
        ("spark.cores.max", "16"),
        ("spark.yarn.executor.memoryOverhead", "0"),
        ("spark.yarn.executor.memory", "512M"),
        ("spark.yarn.driver.memory", "512M"),
        ("spark.submit.deployMode", "client"),
    ])
)

# Obtain the SparkContext through getOrCreate rather than constructing one directly.
# NOTE(review): if a context is already running in this kernel, getOrCreate
# presumably hands it back as-is, so the settings above may not take effect
# until that context is stopped and a new one is created - confirm.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [2]:
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``.

    Accepts any positional and keyword arguments, does nothing with them,
    and returns ``None``.
    """
    return None


# Swap the module attribute so that anything calling ``warnings.warn(...)``
# from here on reaches the no-op above instead of emitting a warning.
warnings.warn = warn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import metrics

from sklearn.preprocessing import MinMaxScaler

import numpy as np
import seaborn as sb
from time import time

from pmlb import fetch_data, classification_dataset_names

# Report how many classification datasets pmlb lists.
print('Total: {}'.format(len(classification_dataset_names)))

# Classifiers to benchmark. The trailing number is each entry's position in
# the list; the benchmarking loop refers to classifiers by that position.
clf_list = [
    AdaBoostClassifier(),                                                     # 0
    BaggingClassifier(),                                                      # 1
    BernoulliNB(alpha=0.01),                                                  # 2
    DecisionTreeClassifier(max_depth=5),                                      # 3
    ExtraTreeClassifier(max_depth=5),                                         # 4
    ExtraTreesClassifier(max_depth=5),                                        # 5
    RidgeClassifier(solver="lsqr", tol=0.01),                                 # 6
    PassiveAggressiveClassifier(max_iter=50),                                 # 7
    Perceptron(max_iter=50),                                                  # 8
    GaussianNB(),                                                             # 9
    GradientBoostingClassifier(),                                             # 10
    KNeighborsClassifier(n_neighbors=3),                                      # 11
    SVC(C=0.025, kernel="linear"),                                            # 12
    SVC(C=1, gamma=2),                                                        # 13
    # GaussianProcessClassifier(1.0 * RBF(1.0)) is commented out and therefore
    # not part of the list (it does not occupy an index).
    RandomForestClassifier(n_estimators=100, max_depth=5, max_features=1),    # 14
    MLPClassifier(alpha=1),                                                   # 15
    QuadraticDiscriminantAnalysis(),                                          # 16
    LinearSVC(penalty='l2', dual=False, tol=0.001),                           # 17
    SGDClassifier(penalty='l2', alpha=0.0001, max_iter=50),                   # 18
    SGDClassifier(penalty="elasticnet", alpha=0.0001, max_iter=50),           # 19
    NearestCentroid(),                                                        # 20
    MultinomialNB(alpha=0.01),                                                # 21
]

def _round2(value):
    """Round ``value`` to two decimals via string formatting; return a float."""
    return float("{0:.2f}".format(value))


def benchmark(clf, X_train=None, y_train=None, X_test=None, y_test=None):
    """Fit ``clf``, predict on the test data, and report accuracy and timings.

    Parameters
    ----------
    clf
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train, X_test, y_test : optional
        Data to fit and evaluate on. Any argument left as ``None`` falls back
        to the module-level ``X_train_scaled``, ``train_y``, ``X_test_scaled``
        and ``test_y`` respectively, so a bare ``benchmark(clf)`` call reads
        the same globals it always did.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)``:
        ``clf_descr`` is the text of ``str(clf)`` before its first ``(``;
        ``score`` is ``metrics.accuracy_score`` on the test data;
        ``train_time`` / ``test_time`` are the elapsed times of ``fit`` and
        ``predict`` in seconds. The three numbers are rounded to two decimals.
    """
    # Local import: perf_counter is monotonic, so the measured intervals cannot
    # be distorted by system clock adjustments the way wall-clock time() can.
    from time import perf_counter

    # Resolve omitted arguments from the notebook globals (legacy call style).
    if X_train is None:
        X_train = X_train_scaled
    if y_train is None:
        y_train = train_y
    if X_test is None:
        X_test = X_test_scaled
    if y_test is None:
        y_test = test_y

    started = perf_counter()
    clf.fit(X_train, y_train)
    train_time = _round2(perf_counter() - started)

    started = perf_counter()
    pred = clf.predict(X_test)
    test_time = _round2(perf_counter() - started)

    score = _round2(metrics.accuracy_score(y_test, pred))

    # Keep only what precedes the first "(" of the estimator's string form.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

number_of_datasets = len(classification_dataset_names)
number_of_clfs = len(clf_list)
results_list = []

# Position in classification_dataset_names of the first dataset to benchmark.
# It stays constant; the running position is tracked by dataset_index below.
start_index = 127

# Datasets that are skipped for every classifier, by name and by position.
skipped_dataset_names = {'mnist', 'adult'}
skipped_dataset_indices = {91, 126}

for dataset_index, classification_dataset in enumerate(
        classification_dataset_names[start_index:number_of_datasets],
        start=start_index):
    print(classification_dataset)

    if (classification_dataset in skipped_dataset_names
            or dataset_index in skipped_dataset_indices):
        # Nothing is fetched or fitted for a skipped dataset; only the name and
        # the blank separator line are printed.
        print('')
        continue

    # Fetch, split and scale ONCE per dataset. These steps do not depend on the
    # classifier, and sharing one split means every classifier is scored on
    # the same train/test partition.
    # NOTE(review): train_test_split is called without random_state, so the
    # partition still differs between runs of this cell.
    X, y = fetch_data(classification_dataset, return_X_y=True, local_cache_dir='./data')
    train_X, test_X, train_y, test_y = train_test_split(X, y)
    scaler = MinMaxScaler()
    scaler.fit(train_X)  # fitted on the training part only
    # benchmark(clf) reads these four module-level names.
    X_train_scaled = scaler.transform(train_X)
    X_test_scaled = scaler.transform(test_X)

    for j, clf in enumerate(clf_list[0:number_of_clfs]):
        # Per-classifier exclusions, keyed on position j in clf_list.
        if dataset_index == 72 and j == 13:
            continue
        if dataset_index >= 100 and j == 16:
            continue

        results = benchmark(clf)
        # Columns: dataset index, clf index, score, train_time, test_time
        row = [dataset_index, j, results[1], results[2], results[3]]
        print('{}, {}, {}, {}, {}'.format(*row))
        results_list.append(row)
    print('')

# reshape(-1, 5) keeps the array two-dimensional with five columns even when no
# rows were collected, so the five-field fmt below always matches the column
# count and an empty run writes an empty file instead of raising.
np_results_list = np.array(results_list).reshape(-1, 5)
np.savetxt("out.txt", np_results_list, delimiter=",", fmt='% 1d, % 2d, %1.3f, %1.3f, %1.3f')


Total: 166
postoperative-patient-data
127, 0, 0.64, 0.33, 0.03
127, 1, 0.59, 0.06, 0.01
127, 2, 0.82, 0.02, 0.0
127, 3, 0.55, 0.0, 0.0
127, 4, 0.55, 0.0, 0.0
127, 5, 0.77, 0.06, 0.01
127, 6, 0.73, 0.21, 0.0
127, 7, 0.77, 0.0, 0.0
127, 8, 0.77, 0.0, 0.0
127, 9, 0.68, 0.0, 0.0
127, 10, 0.59, 0.27, 0.0
127, 11, 0.64, 0.01, 0.0
127, 12, 0.68, 0.0, 0.0
127, 13, 0.68, 0.0, 0.0
127, 14, 0.77, 0.54, 0.04
127, 15, 0.68, 0.85, 0.0
127, 17, 0.59, 0.0, 0.0
127, 18, 0.68, 0.0, 0.0
127, 19, 0.68, 0.01, 0.0
127, 20, 0.68, 0.0, 0.0
127, 21, 0.68, 0.0, 0.0

prnn_crabs
128, 0, 0.84, 0.4, 0.03
128, 1, 0.96, 0.06, 0.01
128, 2, 0.46, 0.0, 0.0
128, 3, 0.94, 0.0, 0.0
128, 4, 0.72, 0.0, 0.0
128, 5, 0.78, 0.06, 0.01
128, 6, 0.96, 0.01, 0.0
128, 7, 1.0, 0.01, 0.0
128, 8, 1.0, 0.01, 0.0
128, 9, 0.62, 0.0, 0.0
128, 10, 0.86, 0.34, 0.0
128, 11, 0.96, 0.0, 0.0
128, 12, 0.48, 0.01, 0.0
128, 13, 0.94, 0.01, 0.0
128, 14, 0.9, 0.56, 0.04
128, 15, 0.92, 1.31, 0.0
128, 17, 0.98, 0.0, 0.0
128, 18, 0.98, 0.01, 0.0
128, 19,

142, 9, 0.65, 0.01, 0.0
142, 10, 0.81, 0.99, 0.0
142, 11, 0.73, 0.0, 0.01
142, 12, 0.77, 0.01, 0.0
142, 13, 0.77, 0.02, 0.01
142, 14, 0.77, 0.58, 0.04
142, 15, 0.87, 4.23, 0.0
142, 17, 0.77, 0.01, 0.0
142, 18, 0.88, 0.01, 0.0
142, 19, 0.79, 0.04, 0.0
142, 20, 0.62, 0.0, 0.0
142, 21, 0.81, 0.0, 0.0

soybean
143, 0, 0.36, 0.57, 0.09
143, 1, 0.94, 0.12, 0.01
143, 2, 0.31, 0.01, 0.0
143, 3, 0.78, 0.01, 0.0
143, 4, 0.62, 0.0, 0.0
143, 5, 0.75, 0.06, 0.01
143, 6, 0.88, 0.21, 0.0
143, 7, 0.83, 0.38, 0.0
143, 8, 0.87, 0.3, 0.0
143, 9, 0.83, 0.01, 0.01
143, 10, 0.95, 14.0, 0.03
143, 11, 0.88, 0.01, 0.05
143, 12, 0.48, 0.11, 0.03
143, 13, 0.82, 0.33, 0.04
143, 14, 0.81, 0.62, 0.07
143, 15, 0.89, 14.41, 0.01
143, 17, 0.92, 0.2, 0.0
143, 18, 0.88, 0.33, 0.0
143, 19, 0.88, 1.19, 0.0
143, 20, 0.85, 0.01, 0.0
143, 21, 0.57, 0.01, 0.0

spambase
144, 0, 0.93, 2.76, 0.11
144, 1, 0.93, 2.64, 0.05
144, 2, 0.89, 0.03, 0.01
144, 3, 0.9, 0.15, 0.0
144, 4, 0.75, 0.02, 0.0
144, 5, 0.75, 0.2, 0.01
144, 6, 0.89,

158, 0, 0.85, 2.02, 0.12
158, 1, 0.82, 2.56, 0.03
158, 2, 0.31, 0.01, 0.0
158, 3, 0.76, 0.15, 0.0
158, 4, 0.61, 0.01, 0.0
158, 5, 0.81, 0.13, 0.01
158, 6, 0.87, 0.05, 0.0
158, 7, 0.86, 0.52, 0.0
158, 8, 0.85, 0.44, 0.0
158, 9, 0.82, 0.02, 0.01
158, 10, 0.84, 16.59, 0.04
158, 11, 0.81, 0.03, 1.82
158, 12, 0.86, 2.57, 0.62
158, 13, 0.86, 2.41, 0.73
158, 14, 0.81, 1.41, 0.13
158, 15, 0.87, 36.87, 0.03
158, 17, 0.86, 0.26, 0.0
158, 18, 0.84, 0.47, 0.0
158, 19, 0.88, 1.13, 0.0
158, 20, 0.82, 0.01, 0.0
158, 21, 0.82, 0.01, 0.0

waveform-40
159, 0, 0.82, 3.54, 0.13
159, 1, 0.81, 5.28, 0.04
159, 2, 0.35, 0.02, 0.0
159, 3, 0.76, 0.29, 0.0
159, 4, 0.63, 0.02, 0.0
159, 5, 0.8, 0.18, 0.02
159, 6, 0.86, 0.07, 0.0
159, 7, 0.85, 0.78, 0.0
159, 8, 0.73, 0.63, 0.0
159, 9, 0.79, 0.03, 0.01
159, 10, 0.85, 32.5, 0.04
159, 11, 0.78, 0.06, 3.92
159, 12, 0.85, 4.73, 0.98
159, 13, 0.85, 5.94, 1.59
159, 14, 0.79, 1.41, 0.13
159, 15, 0.87, 34.15, 0.05
159, 17, 0.86, 0.4, 0.0
159, 18, 0.85, 0.68, 0.0
159, 19, 0.