In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, resolution = 0.02):
    """Draw a classifier's decision regions over two features and overlay the samples.

    The classifier is evaluated on a regular grid spanning columns 0 and 1 of
    ``X`` (padded by 1 on every side). The grid predictions are drawn as
    filled contours with the pyplot state machine (current figure/axes), and
    the samples are scattered on top with one colour/marker pair per class.
    The caller is expected to add axis labels and the legend.

    Parameters
    ----------
    X : 2-D array
        Sample coordinates; must support ``X[:, 0]`` and ``X[mask, 0]``.
        Only columns 0 and 1 are used.
    y : array-like
        Class label of each row of ``X``; ``y == label`` must produce a
        boolean mask that can index the rows of ``X``.
    classifier : object
        Anything exposing ``predict`` that accepts an ``(n_points, 2)`` array
        and returns one value per point.
    resolution : float, optional
        Step of the evaluation grid along both axes.

    Raises
    ------
    ValueError
        If ``y`` contains more distinct classes than there are predefined
        marker/colour pairs.
    """
    # One marker/colour pair per class, assigned in the order of np.unique(y).
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')

    class_labels = np.unique(y)
    if len(class_labels) > len(markers):
        # Fail early with a clear message instead of an IndexError on
        # markers[idx] after the (expensive) grid prediction has already run.
        raise ValueError("plot_decision_regions supports at most %d classes, got %d"
                         % (len(markers), len(class_labels)))
    cmap = ListedColormap(colors[:len(class_labels)])

    # Decision surface: predict on every node of a grid around the data.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    grid_points = np.array([xx1.ravel(), xx2.ravel()]).T
    Z = classifier.predict(grid_points)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Class samples on top of the surface.
    for idx, cl in enumerate(class_labels):
        in_class = (y == cl)
        # `color=` (not `c=`): a bare RGBA tuple passed as `c` is ambiguous
        # with per-point values when a class has 3 or 4 samples.
        plt.scatter(x=X[in_class, 0], y=X[in_class, 1], alpha=0.9,
                    color=cmap(idx), marker=markers[idx], label=cl)
        
# Project the standardised features onto the first two principal components,
# fit a multinomial logistic regression in that 2-D space and plot its
# decision regions on a subset of the validation data.
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
sc = StandardScaler()

X_train, X_valid, y_train, y_valid, expl_variables, _ = get_train_valid_sets(model_frames[2], animal='both', include_svd=False)
# Call form prints the same text on Python 2 and also parses on Python 3.
print(expl_variables)

# Scaler and PCA are fitted on the training split only and then applied to
# the validation split.
X_train_std = sc.fit_transform(X_train)
X_valid_std = sc.transform(X_valid)
lr = LogisticRegression(random_state=22, multi_class='multinomial', solver='lbfgs')
X_train_pca = pca.fit_transform(X_train_std)
X_valid_pca = pca.transform(X_valid_std)
lr.fit(X_train_pca, y_train)

# Only the first 200 validation samples are drawn.
plot_decision_regions(X_valid_pca[:200,:], y_valid[:200], classifier=lr)
plt.xlabel('PC1')
# Second component belongs on the y axis; a second xlabel call would
# overwrite 'PC1' and leave the y axis unlabelled.
plt.ylabel('PC2')
plt.legend(loc = 'lower left')

# Cell output: score of the 2-component model on the full validation split.
lr.score(X_valid_pca, y_valid)

In [None]:
# Prepare the inputs for ROC / AUC analysis: binarised validation labels plus
# the hard predictions and class probabilities of the fitted `gs_rf` model.
# NOTE(review): nothing is plotted in this cell, and roc_auc_score, roc_curve
# and auc are imported but not used here - presumably a later cell consumes
# them; confirm.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

from sklearn.preprocessing import label_binarize

# get set most suited for RF
X_train, X_valid, y_train, y_valid, expl_variables, classes = get_train_valid_sets(model_frames[3], 'both')

# One indicator column per class id in the hard-coded list 0..4.
# NOTE(review): assumes y_valid is encoded as the integers 0-4 - confirm
# against get_train_valid_sets.
y_valid_bin = label_binarize(y_valid, classes=[0,1,2,3,4])
# NOTE(review): gs_rf is not defined in this cell; it must already exist in
# the kernel (presumably a search object built in an earlier cell).
gs_rf = gs_rf.fit(X_train, y_train)
roc_predictions = gs_rf.predict(X_valid)
roc_prob = gs_rf.predict_proba(X_valid)


In [None]:
# Nested cross-validation: estimate the generalisation error of a
# hyperparameter-tuned logistic regression. The inner loop is the grid search
# (cv=2); the outer loop is cross_val_score with its default splitting.
pipe_lr = Pipeline([('min_max', MinMaxScaler()),
                    ('clf', LogisticRegression(random_state=22))])

# Two grids: one-vs-rest with liblinear (with and without class weighting)
# and multinomial with sag, each over the same values of C.
param_range = [1.0, 10.0, 100.0]
param_grid = [{'clf__multi_class':['ovr'],
               'clf__solver':['liblinear'],
               'clf__class_weight':[None,'balanced'],
               'clf__C':param_range},
              {'clf__multi_class':['multinomial'],
               'clf__solver':['sag'],
               'clf__C':param_range}]

gs = GridSearchCV(estimator=pipe_lr, param_grid=param_grid, scoring='neg_log_loss', cv=2)
# get set most suited for LR
# NOTE(review): model_df is not used anywhere in this cell; the split below
# reads model_frames[2] instead - confirm whether this call is still needed.
model_df = get_features(df, mode=3)
X_train, X_valid, y_train, y_valid, expl_variables, _ = get_train_valid_sets(model_frames[2], 'both')
# Call form prints the same text on Python 2 and also parses on Python 3.
print(expl_variables)

# scoring='neg_log_loss' yields the negated log loss, so the reported mean is
# negative and values closer to zero are better.
scores = cross_val_score(gs, X_train, y_train, scoring='neg_log_loss', verbose=True)
print("Cross val Logistic Regression log loss: %.3f +/- %.3f" % (np.mean(scores), np.std(scores)))

# Recorded results from earlier runs:
# Cross val log loss: -0.812 +/- 0.010

# Cross val Logistic Regression log loss: -0.888 +/- 0.012 # with et's age categories

In [None]:
# Fit the grid search on the training split, then report per-class metrics
# and the log loss on the validation split.
# NOTE(review): gs, X_train, y_train, X_valid, y_valid and classes are not
# assigned in this cell and must already exist in the kernel. The split in
# the preceding cell discards its class list as `_`, so `classes` comes from
# some other cell - confirm it matches the labels in y_valid.
gs = gs.fit(X_train, y_train)
valid_prediction = gs.predict(X_valid)
valid_prediction_prob = gs.predict_proba(X_valid)
print(classification_report(y_valid, valid_prediction, target_names=classes))
# Call form, consistent with the line above; identical output on Python 2.
print(log_loss(y_valid, valid_prediction_prob))