In [14]:

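# This chapter uses the MNIST dataset: a collection of 70,000 images of handwritten digits, each labelled with the digit it represents. The dataset is often called the "hello world" of machine learning.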



# Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble
from sklearn import discriminant_analysis, gaussian_process
from xgboost import XGBClassifier


# Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection, model_selection, metrics


# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# `pandas.tools.plotting` is the legacy location of scatter_matrix and no longer
# exists in current pandas releases; prefer the supported `pandas.plotting` path
# and fall back to the old one only on very old installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix


# Data manipulation
import numpy as np
import pandas as pd
# fetch_mldata was deprecated in scikit-learn 0.20 and removed in 0.22. Guard the
# import so the whole import cell does not fail on newer installations; the name
# is None when the loader is unavailable (use fetch_openml in that case).
try:
    from sklearn.datasets import fetch_mldata
except ImportError:
    fetch_mldata = None
import warnings

#Configure Virusalization Defaults
color = sns.color_palette()


warnings.filterwarnings("ignore")
np.random.seed(42)


%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12, 8
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999


  return f(*args, **kwds)


In [16]:
def sort_by_target(data, target, n_train=60000):
    """Return (data, target) ordered by label inside the train and test partitions.

    The first ``n_train`` rows and the remaining rows are sorted independently, so
    no sample moves across the train/test boundary. A stable sort is used so that
    samples with the same label keep their relative order.
    """
    train_order = np.argsort(target[:n_train], kind='mergesort')
    test_order = np.argsort(target[n_train:], kind='mergesort') + n_train
    order = np.concatenate([train_order, test_order])
    return data[order], target[order]


try:
    from sklearn.datasets import fetch_openml
except ImportError:
    # Old scikit-learn without fetch_openml: fall back to the legacy loader below.
    fetch_openml = None

if fetch_openml is not None:
    mnist = fetch_openml('mnist_784', version=1, data_home='./datasets')
    # np.asarray copes with both return types (NumPy arrays or pandas objects,
    # depending on the scikit-learn version).
    X = np.asarray(mnist['data'])
    y = np.asarray(mnist['target']).astype(np.int8)  # fetch_openml() returns targets as strings
    X, y = sort_by_target(X, y)  # fetch_openml() returns an unsorted dataset
else:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original', data_home='./datasets')
    X, y = mnist['data'], mnist['target']

# First 60,000 samples form the training set, the remainder the test set.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Shuffle the training features and labels with the SAME permutation so they stay
# aligned (the original assigned the shuffled labels to X_test by mistake).
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [30]:
# Machine Learning Algorithm (MLA) selection and initialization.
# Every estimator is built with its default arguments, except the two kernel
# SVMs, which are created with probability=True.
MLA = (
    # Ensemble methods
    [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
    ]
    # Gaussian process
    + [gaussian_process.GaussianProcessClassifier()]
    # Generalized linear models (GLM)
    + [
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
    ]
    # Naive Bayes
    + [
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
    ]
    # Nearest neighbor
    + [neighbors.KNeighborsClassifier()]
    # Support vector machines
    + [
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
    ]
    # Trees
    + [
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
    ]
    # Discriminant analysis
    + [
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]
    # Gradient boosting (xgboost)
    + [XGBClassifier()]
)

In [31]:

def run_MLA(mla, Xtrain, ytrain):
    """Cross-validate every estimator in ``mla`` and collect scores and predictions.

    Parameters
    ----------
    mla : iterable of estimators
        Objects exposing ``get_params``, ``fit`` and ``predict``.
    Xtrain, ytrain : array-like
        Features and labels used for cross-validation, for the final fit and for
        the predictions stored in the second return value.

    Returns
    -------
    MLA_compare : pandas.DataFrame
        One row per estimator (name, parameters, mean train/test score, 3x the
        standard deviation of the test score, mean fit time), sorted by mean test
        score in descending order.
    MLA_predict : pandas.DataFrame
        Column ``'Target'`` holding ``ytrain`` plus one column per estimator class
        name with that estimator's predictions on ``Xtrain``. Two estimators of
        the same class share a column; the later one overwrites the earlier.
    """
    # 10 random splits; each uses 60% of the rows for training and 30% for testing,
    # so 10% of the rows are left out of every split.
    cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3, train_size=.6, random_state=42)
    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA  Test Accuracy 3*STD', 'MLA Time']
    MLA_predict = pd.DataFrame({'Target': ytrain})

    # Collect one record per estimator and build the frame once at the end instead
    # of growing it row by row.
    records = []
    for i, alg in enumerate(mla):
        MLA_name = alg.__class__.__name__
        print(i, MLA_name)
        params = str(alg.get_params())

        cv_results = model_selection.cross_validate(alg, Xtrain, ytrain, cv=cv_split, return_train_score=True)
        records.append({
            'MLA Name': MLA_name,
            'MLA Parameters': params,
            'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
            'MLA Test Accuracy Mean': cv_results['test_score'].mean(),
            # Fix: this value used to be written to a misspelled column
            # ('MLA test Accuracy Mean'), leaving the declared column empty.
            'MLA  Test Accuracy 3*STD': cv_results['test_score'].std() * 3,
            'MLA Time': cv_results['fit_time'].mean(),
        })

        # Fix: fit on the data passed in, not on the notebook globals
        # X_train / y_train, so the predictions below match the 'Target' column.
        alg.fit(Xtrain, ytrain)
        MLA_predict[MLA_name] = alg.predict(Xtrain)

    MLA_compare = pd.DataFrame(records, columns=MLA_columns)
    MLA_compare = MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False)
    return MLA_compare, MLA_predict

In [None]:
# NOTE(review): the three lines below look like scores noted from earlier runs — confirm.
# cross val : [0.97144557 0.96846364 0.96606739 0.96725021 0.97236779]
# data augmented : [0.98378571 0.9842619  0.98407143 0.98261905 0.98540476]
# data noise: [0.98221429 0.98111905 0.9822619  0.98316667 0.98216667]

# Evaluate every algorithm on the first 10,000 training samples. Keep both returned
# frames: the following cell plots MLA_compare, which was previously never assigned
# at notebook level (it only existed as a local variable inside run_MLA).
MLA_compare, MLA_predict = run_MLA(MLA, X_train[:10000], y_train[:10000])
MLA_compare

0 AdaBoostClassifier
1 BaggingClassifier
2 ExtraTreesClassifier
3 GradientBoostingClassifier
4 RandomForestClassifier
5 GaussianProcessClassifier


In [None]:
# Horizontal bar chart of the mean test score per algorithm, drawn through the
# Axes object that seaborn returns instead of the pyplot state machine.
ax = sns.barplot(x='MLA Test Accuracy Mean', y='MLA Name', data=MLA_compare, color='m')
ax.set_title('Machine Learn Algorithm Accuracy Score\n')
ax.set_xlabel('Accuracy Score (%)')
ax.set_ylabel('Algorithm')

In [None]:
def correlation_heatmap(df, annot=True):
    """Draw a heatmap of the pairwise Pearson correlations between the columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Samples in rows, features in columns. Anything that is not already a
        DataFrame (e.g. a NumPy array) is wrapped in one so ``.corr()`` is available.
    annot : bool, default True
        Whether to print the correlation value inside each cell. Turn this off for
        wide inputs, where per-cell text is illegible and slow to render.
    """
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)

    _, ax = plt.subplots(figsize=(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        frame.corr(),
        cmap=colormap,
        square=True,
        ax=ax,
        annot=annot,
        linewidths=0.1,
        # Fix: the keyword was misspelled 'vax'; 'vmax' anchors the top of the
        # colour scale at a correlation of 1.0.
        vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12},
    )
    # Fix: y=0.05 drew the title over the bottom of the plot; 1.05 puts it just
    # above the axes.
    ax.set_title('Pearson Correlation of Feature', y=1.05, size=15)


# X_train is a NumPy array with one column per pixel (presumably 784 for MNIST —
# confirm), so annotations are disabled for this call.
correlation_heatmap(X_train[:1000], annot=False)