In [1]:
import os, sys
# Derive the repository root by cutting the absolute path at the first
# "mvesc" substring, then put <root>/ETL at the front of sys.path.
# NOTE(review): the starting point is a hard-coded absolute path inside one
# user's home directory -- confirm/adjust before running elsewhere.
pathname = os.path.dirname("/home/jgutman/mvesc/Models_Results/")
full_pathname = os.path.abspath(pathname)
# split_pathname[0] is everything that precedes the first "mvesc"
split_pathname = full_pathname.split(sep="mvesc")
base_pathname = os.path.join(split_pathname[0], "mvesc")
# presumably the ETL directory holds mvesc_utility_functions -- TODO confirm
parentdir = os.path.join(base_pathname, "ETL")
sys.path.insert(0,parentdir)

In [2]:
from mvesc_utility_functions import *

In [3]:
# all model import statements
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [4]:
#from sklearn.grid_search import ParameterGrid
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import *
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler

import yaml
import numpy as np
import pandas as pd

In [5]:
# SQL text for the connectivity check: every row of model.outcome.
query = "select * from model.outcome"

In [6]:
# Run `query`, keep the fetched rows in `results` and print how many came back.
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute(query)
        results = cursor.fetchall()
        print(len(results))
    connection.commit()

11777


In [7]:
def df2num(rawdf):
    """ Convert data frame with numeric variables and strings to numeric dataframe

    :param pd.dataframe rawdf: raw data frame
    :returns pd.dataframe df: a data frame with strings converted to dummies, other columns unchanged
    :rtype: pd.dataframe
    Rules:
    - 1. numeric columns unchanged;
    - 2. strings converted to dummies, including a "ColumnName_nan" dummy
         for missing values;
    - 3. the most frequent string is taken as reference and its dummy is dropped
    - 4. new column name is: "ColumnName_Category"
    (e.g., column 'gender' with 80 'M' and 79 'F'; the dummy columns left are
    'gender_F' and 'gender_nan')

    A frame without any non-numeric column is returned as its numeric part.
    """
    numeric_df = rawdf.select_dtypes(include=[np.number])
    str_columns = [col for col in rawdf.columns if col not in numeric_df.columns]
    if not str_columns:
        # Nothing to encode; taking row 0 of the mode of a column-less frame
        # would raise, so return the numeric columns as they are.
        return numeric_df

    dummy_col_df = pd.get_dummies(rawdf[str_columns], dummy_na=True)
    numeric_df = numeric_df.join(dummy_col_df)

    # The reference category is found per column so that one column without
    # any observed value cannot break the lookup for the others; such a column
    # falls back to its "<column>_nan" dummy as the reference.
    reference_cols = []
    for col in str_columns:
        modes = rawdf[col].mode()
        most_frequent = modes.iloc[0] if len(modes) > 0 else np.nan
        reference_cols.append("{}_{}".format(col, most_frequent))
    return numeric_df.drop(reference_cols, axis=1)

In [8]:
def define_clfs_params():
    """Return the candidate estimators and their hyperparameter grids.

    model_options[model_classes_selected] determines which of these models
    are actually run; all parameter options in the grid are run for each
    selected model.

    :returns: ``(clfs, grid)`` -- ``clfs`` maps a model nickname to an
        unfitted estimator, ``grid`` maps the same nicknames to the
        parameter grid searched for that estimator (an empty dict means the
        estimator is fitted with its constructor settings only)
    :rtype: tuple(dict(str: estimator), dict(str: dict))
    """
    clfs = {
        'logit': LogisticRegression(),
        'LR_no_penalty': LogisticRegression(C=1e6),
        'DT': DecisionTreeClassifier(),
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'SVM': svm.SVC(kernel='linear', probability=False),
        'GB': GradientBoostingClassifier(
            learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'SGD': SGDClassifier(loss="hinge", penalty="l2"),
        'KNN': KNeighborsClassifier(n_neighbors=3)
    }

    # One entry per nickname in clfs. Each key must appear only once: a
    # repeated key in a dict literal silently replaces the earlier entry.
    grid = {
        'logit': {'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1.0,10.0]},
        'LR_no_penalty': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],
            'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
        'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100],
            'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
        'SGD': {'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
        'ET': {'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,
            'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
        'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],
            'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
        'NB' : {},
        'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
        'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],
            'algorithm': ['auto','ball_tree','kd_tree']}
    }
    return clfs, grid

In [9]:
def clf_loop(clfs, params, train_X, train_y,
        criterion, models_to_run, cv_folds):
    """
    Grid-search every requested model and return the fitted searches.

    The result maps each model nickname (string) to its fitted GridSearchCV
    object, which exposes attributes such as best_score_ and best_estimator_.
    The nickname and the cross-validated best score are printed per model.

    :param dict(str:estimator) clfs: estimators keyed by nickname, as
        returned by define_clfs_params
    :param dict(str:dict) params: hyperparameter grid per nickname, as
        returned by define_clfs_params
    :param pandas.DataFrame train_X: training features, indexed by
        student_lookup
    :param pandas.Series(int) train_y: 0/1 outcome label, indexed by
        student_lookup
    :param string criterion: scoring name handed to GridSearchCV for model
        selection, read from model_options (e.g. 'f1')
    :param list[string] models_to_run: nicknames of the models to fit, read
        from model_options (e.g. ['logit', 'DT'])
    :param cv_folds: cross-validation specification forwarded unchanged to
        GridSearchCV as ``cv``
    :rtype dict(string: GridSearchCV)
    """
    # Look up all requested estimators first, so an unknown nickname raises
    # KeyError before any model is fitted.
    selected = [(nickname, clfs[nickname]) for nickname in models_to_run]

    fitted_searches = {}
    for model_name, estimator in selected:
        print(model_name)
        search = GridSearchCV(estimator, params[model_name],
            scoring=criterion, cv=cv_folds)
        fitted_searches[model_name] = search
        search.fit(train_X, train_y)
        print("model: {model} cv_score: {score}".format(
            model=model_name, score=search.best_score_))
    return fitted_searches

In [10]:
def temporal_cohort_test_split(joint_df, cohort_grade_level_begin,
    cohorts_held_out, cohorts_training):
    """ Split a joint features & outcomes frame into train and test by cohort.

    :param pd.DataFrame joint_df: frame containing the cohort column
    :param str cohort_grade_level_begin: name of the cohort column
    :param list[int] cohorts_held_out: cohort values that form the test set
    :param cohorts_training: either the string 'all' (train on every row
        not in a held-out cohort) or a list of cohort values to train on
    :returns: (train, test) row subsets of joint_df
    """
    cohort_of_row = joint_df[cohort_grade_level_begin]
    if cohorts_training == 'all':
        training_mask = ~cohort_of_row.isin(cohorts_held_out)
    else:
        training_mask = cohort_of_row.isin(cohorts_training)
    held_out_mask = cohort_of_row.isin(cohorts_held_out)
    return joint_df[training_mask], joint_df[held_out_mask]

In [11]:
def measure_performance(outcomes, predictions):
    """ Collect model performance objects into a dict.

    :param list[int] outcomes: true labels
    :param list[float] predictions: predicted scores
    :returns: dict with 'pr_curve' (result of precision_recall_curve) and
        'roc_curve' (result of roc_curve)
    """
    return {
        'pr_curve': precision_recall_curve(outcomes, predictions),
        'roc_curve': roc_curve(outcomes, predictions),
        # 'confusion_matrix': confusion_matrix(outcomes, predictions) is
        # currently disabled
    }

In [12]:
def build_outcomes_plus_features(model_options):
    """Build the modelling frame: labeled outcomes joined to requested features.

    Reads 'student_lookup', the outcome column (model_options['outcome_name'])
    and the cohort column (model_options['cohort_grade_level_begin']) from
    model.outcome, drops rows with a missing value in any of the three, and
    left-joins every table in model_options['features_included'] on
    'student_lookup', so only students with labeled outcomes are kept.

    Assumes (not checked here):
    - the cohort column holds the year each student is seen in that grade,
      e.g. 'cohort_9th'
    - every feature table is in the 'model' schema and contains
      'student_lookup' plus the requested feature columns

    :param dict model_options: options as returned by read_in_yaml
    :returns: frame indexed by student_lookup holding outcome, cohort and all
        features, after conversion by df2num
    :rtype: pd.DataFrame
    """
    with postgres_pgconnection_generator() as connection:
        wanted_columns = ['student_lookup', model_options['outcome_name'],
            model_options['cohort_grade_level_begin']]
        labeled = read_table_to_df(connection, table_name='outcome',
            schema='model', nrows=-1, columns=wanted_columns)
        # a student without lookup id, outcome or cohort cannot be used
        combined = labeled.dropna()

        for table, column_names in model_options['features_included'].items():
            table_features = read_table_to_df(connection, table_name=table,
                schema='model', nrows=-1,
                columns=['student_lookup'] + column_names)
            combined = pd.merge(combined, table_features,
                how='left', on='student_lookup')

    combined = combined.set_index('student_lookup')
    return df2num(combined)

In [13]:
def read_in_yaml(filename=os.path.join(base_pathname,
        'Models_Results', 'model_options.yaml')):
    """Read the model options YAML file and validate its basic structure.

    :param str filename: path of the YAML file; defaults to
        <base_pathname>/Models_Results/model_options.yaml
    :returns: the parsed options
    :rtype: dict
    :raises AssertionError: if the file is not a mapping, a required key is
        missing, or one of the checked options has the wrong type
    """
    with open(filename, 'r') as f:
        # safe_load only builds plain Python objects. A bare yaml.load(f) can
        # construct arbitrary objects and, on recent PyYAML, requires an
        # explicit Loader argument.
        model_options = yaml.safe_load(f)

    # Maybe we want to have default values for these options and replace
    # from a new yaml file as necessary
    assert isinstance(model_options, dict), "bad formatting in yaml file"
    # NOTE(review): later cells also read 'missing_impute_strategy',
    # 'feature_scaling', 'parameter_cross_validation_scheme' and 'n_folds',
    # none of which is validated here.
    required_keys = set(('validation_criterion', 'features_included', 'cohorts_training',
        'cohorts_held_out', 'file_save_name', 'model_classes_selected', 'outcome_name',
        'cohort_grade_level_begin', 'model_test_holdout', 'random_seed'))
    assert all(key in model_options for key in required_keys), \
        "missing model specifications in yaml file"

    assert isinstance(model_options['features_included'], dict), "bad formatting in yaml file"
    assert isinstance(model_options['model_classes_selected'], list), "bad formatting in yaml file"
    assert isinstance(model_options['cohorts_held_out'], list), "bad formatting in yaml file"
    assert (isinstance(model_options['cohorts_training'], list) or
        model_options['cohorts_training'] == 'all'), "bad formatting in yaml file"
    return model_options

In [14]:
def add_null_dummies(data):
    """Append a "<column>_isnull" indicator for every column with missing values.

    :param pd.DataFrame data: frame that may contain missing values; column
        names must be strings
    :returns: a new frame with the original columns followed by one 0.0/1.0
        indicator column per column of ``data`` that has at least one null;
        it has exactly one row per input row
    :rtype: pd.DataFrame
    """
    null_columns = data.columns[data.isnull().sum() > 0]
    null_flags = data[null_columns].isnull() * 1.0
    null_flags = null_flags.rename(columns=lambda name: name + '_isnull')
    # The flags share data's index, so place them side by side. Merging on
    # the index instead multiplies rows whenever an index label is repeated.
    return pd.concat([data, null_flags], axis=1)

def impute_missing_values(train, test, strategy):
    """Impute missing feature values, fitting the imputer on train only.

    :param pd.DataFrame train: training features
    :param pd.DataFrame test: test features with the same columns as train
    :param str strategy: 'none' (return the inputs unchanged),
        'mean_plus_dummies' or 'median_plus_dummies' (add "<column>_isnull"
        indicators, then fill nulls with the train mean/median)
    :returns: (train, test); for an unknown strategy a message is printed and
        the inputs are returned unchanged
    :rtype: tuple(pd.DataFrame, pd.DataFrame)
    """
    if strategy == 'none':
        return train, test

    if strategy in ('mean_plus_dummies', 'median_plus_dummies'):
        # Indicator columns are decided by the TRAIN missingness pattern and
        # applied to both frames. Deriving them separately would give train
        # and test different columns whenever their patterns differ.
        null_columns = train.columns[train.isnull().sum() > 0]
        train = add_null_dummies(train) # add feature_isnull columns 0 or 1
        test_flags = test[null_columns].isnull() * 1.0
        test_flags = test_flags.rename(columns=lambda name: name + '_isnull')
        test = pd.concat([test, test_flags], axis=1)

        imputer = Imputer(strategy=strategy.split("_")[0])
        imputer.fit(train)
        # Columns with no observed value in train get a NaN statistic and are
        # discarded by transform(), so label the output with the survivors.
        kept_columns = train.columns[~np.isnan(imputer.statistics_)]
        train = pd.DataFrame(imputer.transform(train), columns = kept_columns, index = train.index)
        test = pd.DataFrame(imputer.transform(test), columns = kept_columns, index = test.index)
        return train, test

    print('unknown imputation strategy. try "{}", "{}", or "{}"'.format(
        'mean_plus_dummies', 'median_plus_dummies', 'none'))
    return train, test

In [15]:
def scale_features(train, test, strategy):
    """Drop constant columns and scale the non-binary features.

    Columns with a single distinct value in train are removed from both
    frames for every strategy. The scaler is fitted on train only and applied
    to the columns with more than two distinct values in train; the scaled
    columns are placed after the untouched ones. The input frames are not
    modified.

    :param pd.DataFrame train: training features
    :param pd.DataFrame test: test features with the same columns as train
    :param str strategy: 'none', 'standard' (StandardScaler) or 'robust'
        (RobustScaler)
    :returns: (train, test); for an unknown strategy a message is printed and
        the frames are returned unscaled (constant columns still removed)
    :rtype: tuple(pd.DataFrame, pd.DataFrame)
    """
    num_values_by_column = {x: len(train[x].unique()) for x in train.columns}
    zero_variance_columns = [k for k,v in num_values_by_column.items() if v == 1]
    # drop into new frames so the caller's objects are left untouched
    train = train.drop(zero_variance_columns, axis=1)
    test = test.drop(zero_variance_columns, axis=1)

    if strategy == 'none':
        return train, test

    if strategy in ('standard', 'robust'):
        non_binary_columns = [k for k, v in num_values_by_column.items() if v > 2]
        if not non_binary_columns:
            # nothing to scale; a scaler cannot be fitted on zero columns
            return train, test

        scaler = StandardScaler() if strategy == 'standard' else RobustScaler()
        scaler.fit(train[non_binary_columns])
        train_non_binary = pd.DataFrame(scaler.transform(train[non_binary_columns]),
            columns = non_binary_columns, index = train.index)
        test_non_binary = pd.DataFrame(scaler.transform(test[non_binary_columns]),
            columns = non_binary_columns, index = test.index)

        # The scaled parts keep the index of their source frame, so join
        # column-wise. An index merge would multiply rows for any repeated
        # index label.
        train_scaled = pd.concat(
            [train.drop(non_binary_columns, axis=1), train_non_binary], axis=1)
        test_scaled = pd.concat(
            [test.drop(non_binary_columns, axis=1), test_non_binary], axis=1)
        return train_scaled, test_scaled

    print('unknown feature scaling strategy. try "{}", "{}", or "{}"'.format(
        'standard', 'robust', 'none'))
    return train, test

In [16]:
# Load the run configuration from the default YAML path and echo it so the
# options used for this run are recorded in the cell output.
model_options = read_in_yaml()
print(model_options)

{'cohorts_training': 'all', 'features_included': {'grades': ['gpa_gr_3', 'gpa_gr_4', 'gpa_gr_5', 'gpa_gr_6', 'gpa_gr_7', 'gpa_gr_8', 'gpa_gr_9'], 'demographics': ['ethnicity', 'gender']}, 'validation_criterion': 'accuracy', 'cohorts_held_out': [2012], 'cohort_grade_level_begin': 'cohort_9th', 'outcome_name': 'not_on_time', 'missing_impute_strategy': 'median_plus_dummies', 'model_test_holdout': 'temporal_cohort', 'user_description': 'initial_skeleton_pipeline_test', 'model_classes_selected': ['logit', 'DT'], 'feature_scaling': 'robust', 'n_folds': 10, 'write_predictions_to_database': False, 'random_seed': 2187, 'parameter_cross_validation_scheme': 'leave_cohort_out', 'file_save_name': 'test_gpa_gender_ethnicity_imputed'}


In [17]:
# Seed numpy's global random number generator from model_options.
# NOTE(review): the estimators built in define_clfs_params are not given an
# explicit random_state -- presumably they rely on this global seed; confirm.
np.random.seed(model_options['random_seed'])

In [18]:
# Check which Python type the YAML loader produced for this flag.
type(model_options['write_predictions_to_database'])

bool

In [19]:
# Based on model_options, read the data from the database and keep:
# - the labeled outcome column (outcome_name)
# - the cohort identification column (cohort_grade_level_begin)
# - the requested feature columns from each feature table (features_included)

outcome_plus_features = build_outcomes_plus_features(model_options)

In [20]:
# Preview the first rows of the assembled outcome + feature frame.
outcome_plus_features.head()

Unnamed: 0_level_0,not_on_time,cohort_9th,gpa_gr_3,gpa_gr_4,gpa_gr_5,gpa_gr_6,gpa_gr_7,gpa_gr_8,gpa_gr_9,ethnicity_A,ethnicity_B,ethnicity_H,ethnicity_I,ethnicity_M,ethnicity_nan,gender_F,gender_nan
student_lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
57296.0,0,2006,,,,,,,1.266667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58652.0,0,2006,,,,,,,3.529032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57294.0,0,2006,,,,,,,1.205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69065.0,1,2006,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63909.0,1,2006,,,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Split the full frame into train and test according to model_test_holdout.
if model_options['model_test_holdout'] == 'temporal_cohort':
    # if using temporal cohort model performance validation,
    # we choose the cohorts in cohorts_held_out for the test set
    train, test = temporal_cohort_test_split(outcome_plus_features,
        model_options['cohort_grade_level_begin'],
        model_options['cohorts_held_out'],
        model_options['cohorts_training'])

else:
    # if not using temporal test set, split randomly
    # (any other model_test_holdout value ends up here; 20% of rows go to test)
    train, test = train_test_split(outcome_plus_features, test_size=0.20,
        random_state=model_options['random_seed'])

In [22]:
# Show which cohorts ended up in train and in test. The cohort column is
# taken from model_options so the check follows the configured column.
print(pd.unique(train[model_options['cohort_grade_level_begin']]))
print(pd.unique(test[model_options['cohort_grade_level_begin']]))

[2006 2007 2008 2009 2010 2011]
[2012]


In [23]:
# Separate features (X) from the outcome label (y) for train and test; the
# outcome and cohort columns are not features.
non_feature_columns = [model_options['outcome_name'],
    model_options['cohort_grade_level_begin']]
train_X = train.drop(non_feature_columns, axis=1)
test_X = test.drop(non_feature_columns, axis=1)
train_y = train[model_options['outcome_name']]
test_y = test[model_options['outcome_name']]

In [24]:
# do missing value feature imputation here
train_X, test_X = impute_missing_values(train_X, test_X,
    model_options['missing_impute_strategy'])
# Index.equals also covers a differing number of columns; an element-wise
# == between indexes of different length raises before the assertion
# message can be shown.
assert train_X.columns.equals(test_X.columns), "train and test have different columns"

# do feature scaling here
train_X, test_X = scale_features(train_X, test_X,
    model_options['feature_scaling'])
assert train_X.columns.equals(test_X.columns), "train and test have different columns"

In [25]:
# All candidate estimators and their hyperparameter grids, keyed by nickname.
clfs, params = define_clfs_params()

In [26]:
# Pick the cross-validation splitter that GridSearchCV uses on the training
# set, according to parameter_cross_validation_scheme.
if model_options['parameter_cross_validation_scheme'] == 'none':
    # no need to further manipulate train dataset
    cohort_kfolds = 2 # hacky way to have GridSearchCV fit to 2 k-folds
elif model_options['parameter_cross_validation_scheme'] == 'leave_cohort_out':
    # choose another validation set amongst the training set to
    # estimate parameters and model selection across cohort folds
    print('leave_cohort_out')
    cohort_kfolds = LeaveOneLabelOut(train[model_options['cohort_grade_level_begin']])
elif model_options['parameter_cross_validation_scheme'] == 'k_fold':
    # ignore cohorts and split into n_folds folds, using the row index
    # (student_lookup) as the grouping label
    print('k_fold_parameter_estimation')
    cohort_kfolds = LabelKFold(train.index, n_folds=model_options['n_folds'])
else:
    # Stop here: continuing would leave cohort_kfolds undefined, or silently
    # reuse the value from an earlier run of this cell.
    raise ValueError('unknown cross-validation strategy: {!r}'.format(
        model_options['parameter_cross_validation_scheme']))

leave_cohort_out


In [27]:
# List the cohorts in the training and validation part of every CV fold.
# NOTE(review): this cell expects a splitter object; with the 'none' scheme
# cohort_kfolds is the integer 2, which has no .labels and is not iterable.
print(cohort_kfolds.labels)
for train_fold, val_fold in cohort_kfolds:
    # read the configured cohort column instead of a fixed column name
    print('train: ', np.unique(
        train.iloc[train_fold][model_options['cohort_grade_level_begin']]))
    print('validation: ', np.unique(
        train.iloc[val_fold][model_options['cohort_grade_level_begin']]))

[2006 2006 2006 ..., 2011 2011 2011]
train:  [2007 2008 2009 2010 2011]
validation:  [2006]
train:  [2006 2008 2009 2010 2011]
validation:  [2007]
train:  [2006 2007 2009 2010 2011]
validation:  [2008]
train:  [2006 2007 2008 2010 2011]
validation:  [2009]
train:  [2006 2007 2008 2009 2011]
validation:  [2010]
train:  [2006 2007 2008 2009 2010]
validation:  [2011]


In [28]:
# NOTE(review): this overrides the validation_criterion loaded from the YAML
# file for the rest of the notebook; the trailing comment lists alternatives.
model_options['validation_criterion'] = 'accuracy' # 'average_precision' #' log_loss' # 'f1'

In [29]:
# Grid-search every model listed in model_classes_selected on the training
# data, scoring with validation_criterion over the folds in cohort_kfolds.
best_validated_models = clf_loop(clfs, params, train_X, train_y,
    criterion=model_options['validation_criterion'],
    models_to_run=model_options['model_classes_selected'],
    cv_folds=cohort_kfolds)

logit
model: logit cv_score: 0.7719510851602952
DT
model: DT cv_score: 0.7806544012338879


In [30]:
# Score the held-out test set with the best estimator of every model.
# Scores are stored per model nickname so that no model's result is lost;
# test_set_scores still holds the scores of the last model in the loop.
test_set_scores_by_model = {}
for model_name, model in best_validated_models.items():
    clf = model.best_estimator_
    if hasattr(clf, "decision_function"):
        test_set_scores = clf.decision_function(test_X)
    else:
        # no decision_function: use the predicted probability of class 1
        test_set_scores = clf.predict_proba(test_X)[:,1]
    test_set_scores_by_model[model_name] = test_set_scores

In [31]:
# Class probabilities from the fitted logit grid search for train and test.
# NOTE(review): requires 'logit' to be in model_classes_selected.
predicted_train = best_validated_models['logit'].predict_proba(train_X)
predicted_test = best_validated_models['logit'].predict_proba(test_X)

In [33]:
# Share of predicted training probabilities that are exactly 0.5
# (presumably a check for uninformative 50/50 predictions -- confirm).
np.mean([probs == [0.5, 0.5] for probs in predicted_train])

0.0

In [34]:
# Same check on the test set: share of probabilities exactly equal to 0.5.
np.mean([probs == [0.5, 0.5] for probs in predicted_test])

0.0

In [35]:
# Display the logit class probabilities for the training rows.
predicted_train

array([[ 0.61073809,  0.38926191],
       [ 0.90002904,  0.09997096],
       [ 0.59935857,  0.40064143],
       ..., 
       [ 0.62209806,  0.37790194],
       [ 0.62209806,  0.37790194],
       [ 0.62209806,  0.37790194]])

In [36]:
# Display the logit class probabilities for the test rows.
predicted_test

array([[ 0.89440181,  0.10559819],
       [ 0.79661955,  0.20338045],
       [ 0.91974039,  0.08025961],
       ..., 
       [ 0.89720486,  0.10279514],
       [ 0.85081305,  0.14918695],
       [ 0.76903548,  0.23096452]])

In [46]:
# First row of the coefficient matrix of the best logit estimator.
logit_coefs = best_validated_models['logit'].best_estimator_.coef_[0]

In [39]:
# Feature names in the order used for fitting (after imputation and scaling).
train_X.columns

Index(['ethnicity_A', 'ethnicity_B', 'ethnicity_H', 'ethnicity_I',
       'ethnicity_M', 'ethnicity_nan', 'gender_F', 'gpa_gr_3_isnull',
       'gpa_gr_4_isnull', 'gpa_gr_5_isnull', 'gpa_gr_6_isnull',
       'gpa_gr_7_isnull', 'gpa_gr_8_isnull', 'gpa_gr_9_isnull', 'gpa_gr_9',
       'gpa_gr_7', 'gpa_gr_8', 'gpa_gr_6', 'gpa_gr_5', 'gpa_gr_4'],
      dtype='object')

In [50]:
# Pair each feature name with its logit coefficient, by position.
dict(zip(train_X.columns, logit_coefs))

{'ethnicity_A': 0.0,
 'ethnicity_B': 0.0,
 'ethnicity_H': 0.0,
 'ethnicity_I': 0.0,
 'ethnicity_M': 0.0,
 'ethnicity_nan': 0.0,
 'gender_F': 0.0,
 'gpa_gr_3_isnull': -1.6012998923350741,
 'gpa_gr_4': 0.0,
 'gpa_gr_4_isnull': 0.0,
 'gpa_gr_5': 0.0,
 'gpa_gr_5_isnull': 0.0,
 'gpa_gr_6': 0.0,
 'gpa_gr_6_isnull': 0.0,
 'gpa_gr_7': 0.0,
 'gpa_gr_7_isnull': 0.0,
 'gpa_gr_8': -0.14462472382581903,
 'gpa_gr_8_isnull': 0.0,
 'gpa_gr_9': -0.58884723669969719,
 'gpa_gr_9_isnull': 1.1028368947351592}

In [48]:
# Check that there is exactly one coefficient per feature column.
len(train_X.columns) == len(logit_coefs)

True

In [56]:
# Cross-validation score summary for every parameter combination tried in
# the logit grid search.
best_validated_models['logit'].grid_scores_

[mean: 0.75245, std: 0.01053, params: {'C': 1e-05, 'penalty': 'l1'},
 mean: 0.75708, std: 0.01358, params: {'C': 1e-05, 'penalty': 'l2'},
 mean: 0.75245, std: 0.01053, params: {'C': 0.0001, 'penalty': 'l1'},
 mean: 0.75719, std: 0.01497, params: {'C': 0.0001, 'penalty': 'l2'},
 mean: 0.75432, std: 0.01214, params: {'C': 0.001, 'penalty': 'l1'},
 mean: 0.76270, std: 0.01661, params: {'C': 0.001, 'penalty': 'l2'},
 mean: 0.77195, std: 0.01636, params: {'C': 0.01, 'penalty': 'l1'},
 mean: 0.77173, std: 0.01571, params: {'C': 0.01, 'penalty': 'l2'},
 mean: 0.72711, std: 0.07758, params: {'C': 0.1, 'penalty': 'l1'},
 mean: 0.75047, std: 0.04894, params: {'C': 0.1, 'penalty': 'l2'},
 mean: 0.73020, std: 0.07834, params: {'C': 1.0, 'penalty': 'l1'},
 mean: 0.72976, std: 0.07986, params: {'C': 1.0, 'penalty': 'l2'},
 mean: 0.73009, std: 0.07900, params: {'C': 10.0, 'penalty': 'l1'},
 mean: 0.72943, std: 0.07988, params: {'C': 10.0, 'penalty': 'l2'}]

In [58]:
# Sum of the predicted labels on the test set; with 0/1 labels this is the
# number of test rows predicted as class 1.
sum(best_validated_models['logit'].predict(test_X))

299

In [60]:
# Class probabilities for the test rows, straight from the grid search object.
best_validated_models['logit'].predict_proba(test_X)

array([[ 0.89440181,  0.10559819],
       [ 0.79661955,  0.20338045],
       [ 0.91974039,  0.08025961],
       ..., 
       [ 0.89720486,  0.10279514],
       [ 0.85081305,  0.14918695],
       [ 0.76903548,  0.23096452]])