In [1]:
import MDP_function as mf
import numpy as np 
import pandas as pd
import matplotlib.pylab as plt
import scipy.stats as stats
import random as rnd
import collections
import progressbar as pgb
import pickle
import time
import sys
import os
%matplotlib inline
pd.options.mode.chained_assignment = None # disable unnecessary warning

In [2]:
# load the complete raw data set from the CSV file sitting next to this notebook
data_file = "MDP_Original_data2.csv"
original_data = pd.read_csv(filepath_or_buffer=data_file)

In [3]:
# the feature columns are the label range 'Interaction' .. 'CurrPro_medianProbTime' of the raw data;
# their column names (in file order) are used as the feature space
feature_data = original_data.loc[:, 'Interaction':'CurrPro_medianProbTime']
feature_space = list(feature_data.columns)

In [4]:
# parameters and bookkeeping containers used by the feature selection search below
MAX_NUM_OF_FEATURES = 8           # size limit for a feature set to count as "valid"
initial_num_of_features = 1       # how many top-ranked features seed the optimal set
extra_num_of_features = 1         # how far beyond the limit the search may grow the working set
max_total_ECR = 0                 # ECR of the current working feature set
optimal_feature_set = []          # current working feature set
ECR_list_of_single_feature = []   # [feature, ECR] pairs, one per individual feature

In [5]:
# discretization feature values by median
all_data_discretized = original_data.loc[:, "student":"reward"]
print ">>> Feature discretization ... "
bar = pgb.ProgressBar()
for ft in bar(feature_space):
    ft_data = original_data.loc[:, ft]
    all_data_discretized[ft] = mf.feature_discretization_by_median(ft_data)
del original_data



>>> Feature discretization ... 


100% |########################################################################|


In [6]:
# initialization to find the best feature with max ECR
feature_ECR_rank_file = "feature_ECR_rank.pkl"
if os.path.exists(feature_ECR_rank_file):
    print ">>> Load ECR for each individual feature (may take a while)..."
    with open(feature_ECR_rank_file, "rb") as fin:
        ECR_list_of_single_feature = pickle.load(fin)
    print "\tSuccessful! Continue ..."
else:
    print ">>> Compute ECR for each individual feature (may take a while)..."
    bar = pgb.ProgressBar()
    for ft in bar(feature_space):
        ECR_list_of_single_feature.append([ft, mf.compute_ECR(all_data_discretized, [ft])])
    with open(feature_ECR_rank_file, "wb") as fout:
        pickle.dump(ECR_list_of_single_feature, fout)

>>> Load ECR for each individual feature (may take a while)...
	Successful! Continue ...


In [7]:
# initialize the optimal feature set with feature of highest ECR
ECR_list_of_single_feature = sorted(ECR_list_of_single_feature, key=lambda x: x[1], reverse=True) # sort feature by ECR
ECR_dict_of_single_feature = dict(ECR_list_of_single_feature)
feature_space = map(lambda x: x[0], ECR_list_of_single_feature) # update feature space by ECR order
optimal_feature_set.extend(map(lambda x: x[0], ECR_list_of_single_feature[:initial_num_of_features])) # select top 7 ECR features
start_time = time.time() # record start time to measure searching time
print "* Initial optimal feature selection is "
print optimal_feature_set
print "* Initial ECR is "
max_total_ECR = mf.compute_ECR(all_data_discretized, optimal_feature_set)
print str(max_total_ECR)

* Initial optimal feature selection is 
['probIndexinLevel']
* Initial ECR is 
19.3657551551


In [8]:
# feature selection iterations
# Greedy search: on each pass, try adding one of the top-K remaining features to the
# current optimal set, then try pruning features from the enlarged set (leave-one-out)
# as long as the ECR reported by mf.compute_ECR does not drop.
# Two records are kept:
#   * optimal_feature_set / max_total_ECR             -- the working set (may exceed MAX_NUM_OF_FEATURES)
#   * valid_optimal_feature_set / valid_max_total_ECR -- best set with at most MAX_NUM_OF_FEATURES features
valid_optimal_feature_set = list() # record the 8 optimal features.
valid_max_total_ECR = max_total_ECR if (len(optimal_feature_set)<=MAX_NUM_OF_FEATURES) else 0
prev_optimal_feature_set = list()
# the working set may grow up to MAX_NUM_OF_FEATURES+extra_num_of_features features
while (len(optimal_feature_set) < MAX_NUM_OF_FEATURES+extra_num_of_features):
    # stop as soon as a full pass left the working set unchanged (compared as sets, order ignored)
    if (set(prev_optimal_feature_set) == set(optimal_feature_set)):
        break # if no change in optimal set, then end searching loop
    else:
        prev_optimal_feature_set = list(optimal_feature_set)
    print "\n********* Search next feature on level <"+str(len(optimal_feature_set))+"> *********"
    remain_feature_space = list([ft for ft in feature_space if ft not in optimal_feature_set])# features not in optimal feature set
    # feature selection heuristics
    print ">>> Select candidate feature set ..."
    # K = half of the remaining features plus a term growing exponentially with the current set size.
    # NOTE: "/" is Python 2 integer division here; under Python 3 topK would become a float
    # and the slice below would fail.
    topK = len(remain_feature_space)/2+int(0.01*np.exp(len(optimal_feature_set))) # number of candidate features to try on this level
    topK = min([topK, len(remain_feature_space)]) # check if topK is within capacity of candidate feature set
    # NOTE(review): count_features is never read in the visible code
    count_features = 0
    #rnd.shuffle(remain_feature_space)
    # candidates are simply the first topK remaining features, in feature_space order
    top_features = remain_feature_space[:topK]
    # select optimal feature from candidate set based on ECR value
    ECR_list = list() # ECR values of optimal feature set with new candidate feature
    for ft in top_features:
        selected_feature = list(optimal_feature_set)
        selected_feature.append(ft) # combine candidate feature to optimal feature set
        ECR_with_ft_added = mf.compute_ECR(all_data_discretized, selected_feature)
        print "* Candidate feature: "+ ft +" --> ECR value: "+ str(ECR_with_ft_added)
        # a candidate qualifies when adding it does not lower the ECR below the best valid ECR (ties qualify)
        if (ECR_with_ft_added >= valid_max_total_ECR):
            print "\tQualified candidate feature added +"
            ECR_list.append([ft, ECR_with_ft_added])
        else:
            print "\tUnqualified candidate feature skipped ~"
    # NOTE: with no qualified candidate the working set is unchanged, so after this
    # `continue` the equality check at the top of the loop breaks out -- this ends the search.
    if (not ECR_list): # no new qualified candidate feature
        continue
    else:
        print ">>> test subset ECR ... "
        best_candidate_ft_set = list()
        bar = pgb.ProgressBar()
        # NOTE(review): best_candidate_ft_set and max_total_ECR are overwritten by every
        # candidate that passes the check below, so after the loop they reflect the LAST
        # such candidate in ECR_list, not necessarily the one with the highest ECR --
        # confirm this is intended.
        for i in bar(range(len(ECR_list))):
            # re-check: valid_max_total_ECR may have been raised by an earlier candidate in this loop
            if (ECR_list[i][1] >= valid_max_total_ECR):
                candidate_ft_set = optimal_feature_set+[ECR_list[i][0]]
                max_total_ECR = ECR_list[i][1]
                is_subset_better = True
                best_candidate_ft_set = list(candidate_ft_set)
                subset_size = len(best_candidate_ft_set)
                # backward pruning: repeatedly drop the feature whose removal gives the highest ECR
                while (subset_size>1 and is_subset_better):
                    # indices 0 .. subset_size-2: the last element (the newly added candidate) is never dropped
                    choices = range(subset_size-1)
                    # ECR of each leave-one-out subset (relies on Python 2 map returning a list, see .index below)
                    ECR_sublist = map(lambda f_id: mf.compute_ECR(all_data_discretized, best_candidate_ft_set[:f_id]+best_candidate_ft_set[f_id+1:]), choices)
                    max_ECR_in_subset = max(ECR_sublist)
                    # case 1: the pruned set fits the size limit and is at least as good as the best valid ECR
                    if (subset_size-1<=MAX_NUM_OF_FEATURES and max_ECR_in_subset >= valid_max_total_ECR): # choose subset with ECR no less than highest overall ECR
                        print "!!!Better optimal feature subset is discovered!!!"
                        ft_index_of_max_subset_ECR = ECR_sublist.index(max_ECR_in_subset)
                        ft_removed = best_candidate_ft_set.pop(ft_index_of_max_subset_ECR)
                        max_total_ECR = max_ECR_in_subset
                        subset_size = len(best_candidate_ft_set)
                        # always true here: the branch condition already guaranteed the pruned size is within the limit
                        if (subset_size <= MAX_NUM_OF_FEATURES):
                            valid_max_total_ECR = max_total_ECR
                            valid_optimal_feature_set = list(best_candidate_ft_set)
                        print "\tRemove feature "+str(ft_removed) 
                    # case 2: the set stays over the size limit even after pruning; prune if the working ECR
                    # does not drop, without touching the valid record.
                    # NOTE(review): with extra_num_of_features = 1 the candidate set has at most
                    # MAX_NUM_OF_FEATURES+1 features, so this branch looks unreachable -- confirm.
                    elif (subset_size>MAX_NUM_OF_FEATURES+1 and max_ECR_in_subset >= max_total_ECR):
                        print "!!!Better optimal feature subset is discovered!!!"
                        ft_index_of_max_subset_ECR = ECR_sublist.index(max_ECR_in_subset)
                        ft_removed = best_candidate_ft_set.pop(ft_index_of_max_subset_ECR)
                        max_total_ECR = max_ECR_in_subset
                        subset_size = len(best_candidate_ft_set)
                        print "\tRemove feature "+str(ft_removed) 
                    else:
                        is_subset_better = False
        # adopt the (possibly pruned) candidate set as the new working set
        optimal_feature_set = list(best_candidate_ft_set)
        # and promote it to the valid record when it respects the size limit and is at least as good
        if (len(optimal_feature_set) <= MAX_NUM_OF_FEATURES) and (max_total_ECR >= valid_max_total_ECR):
            valid_optimal_feature_set = list(optimal_feature_set)
            valid_max_total_ECR = max_total_ECR
    # keep record of the highest ECR and its optimal feature set so far
    print "Max ECR touched so far: "+str(max_total_ECR)+" with "+str(len(optimal_feature_set))+" optimal features as:"
    print optimal_feature_set
    print "Highest valid ECR so far: "+str(valid_max_total_ECR)+" with "+str(len(valid_optimal_feature_set))+" optimal features as:"
    print valid_optimal_feature_set


********* Search next feature on level <1> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 19.3657551551
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeWE --> ECR value: 34.5608593787
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 31.0209214718
	Qualified candidate feature added +
* Candidate feature: difficultProblemCountSolved --> ECR value: 36.0716146758
	Qualified candidate feature added +
* Candidate feature: cumul_NonPSelements --> ECR value: 19.991206373
	Qualified candidate feature added +
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 20.0366719515
	Qualified candidate feature added +
* Candidate feature: difficultProblemCountWE --> ECR value: 18.8759002972
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_Interaction --> ECR value: 19.8173190459
	Qualified candidate feature added +
* Candidate fea




	Remove feature probIndexinLevel
Max ECR touched so far: 20.1393876648 with 2 optimal features as:
['probIndexinLevel', 'UseCount']
Highest valid ECR so far: 20.1393876648 with 2 optimal features as:
['probIndexinLevel', 'UseCount']

********* Search next feature on level <2> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 20.1393876648
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeWE --> ECR value: 36.2275492616
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 31.7359289423
	Qualified candidate feature added +
* Candidate feature: difficultProblemCountSolved --> ECR value: 35.7929557292
	Qualified candidate feature added +
* Candidate feature: cumul_NonPSelements --> ECR value: 21.2034131577
	Qualified candidate feature added +
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 21.1768036395
	Qualified candidate feature added +


100% |########################################################################|
  0% |                                                                        |


	Qualified candidate feature added +
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!
	Remove feature probIndexinLevel
!!!Better optimal feature subset is discovered!!!

  2% |#                                                                       |  4% |###                                                                     |


	Remove feature UseCount
!!!Better optimal feature subset is discovered!!!

  7% |#####                                                                   |  9% |#######                                                                 |


	Remove feature UseCount
Max ECR touched so far: 36.0716146758 with 2 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved']
Highest valid ECR so far: 36.0716146758 with 2 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved']

********* Search next feature on level <2> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 36.0716146758
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeWE --> ECR value: 64.4598094968
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 50.0209846235
	Qualified candidate feature added +
* Candidate feature: cumul_NonPSelements --> ECR value: 33.9340771807
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 35.5997985407
	Unqualified candidate feature skipped ~
* Candidate feature: difficultProblemCountWE --> ECR value: 30.6941821306
	Un

 12% |########                                                                | 14% |##########                                                              | 17% |############                                                            | 19% |##############                                                          | 21% |###############                                                         | 24% |#################                                                       | 26% |###################                                                     | 29% |#####################                                                   | 31% |######################                                                  | 34% |########################                                                | 36% |##########################                                              | 39% |############################                                            | 41% |#############################     


	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!
	Remove feature probIndexinLevel
!!!Better optimal feature subset is discovered!!!

  5% |###                                                                     | 10% |#######                                                                 |


	Remove feature probIndexinLevel
Max ECR touched so far: 50.0209846235 with 3 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeDeviationWE']
Highest valid ECR so far: 50.0209846235 with 3 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeDeviationWE']

********* Search next feature on level <3> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 50.0209846235
	Unqualified candidate feature skipped ~
* Candidate feature: CurrPro_avgProbTimeWE --> ECR value: 51.7026621442
	Qualified candidate feature added +
* Candidate feature: cumul_NonPSelements --> ECR value: 50.7475448015
	Qualified candidate feature added +
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 48.8919541983
	Unqualified candidate feature skipped ~
* Candidate feature: difficultProblemCountWE --> ECR value: 40.2957467205
	Unqualified candidate feature skipped ~
* Ca

 15% |###########                                                             | 21% |###############                                                         | 26% |##################                                                      | 31% |######################                                                  | 36% |##########################                                              | 42% |##############################                                          | 47% |##################################                                      | 52% |#####################################                                   | 57% |#########################################                               | 63% |#############################################                           | 68% |#################################################                       | 73% |#####################################################                   | 78% |##################################


	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!
	Remove feature CurrPro_avgProbTimeDeviationWE
Max ECR touched so far: 64.4598094968 with 3 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE']
Highest valid ECR so far: 64.4598094968 with 3 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE']

********* Search next feature on level <3> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 64.4598094968
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 51.7026621442
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_NonPSelements --> ECR value: 63.7704191981
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 61.2208484322
	Unqualified candidate feature skip

100% |########################################################################|



	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
Max ECR touched so far: 66.6792880559 with 4 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_F1Score']
Highest valid ECR so far: 66.6792880559 with 4 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_F1Score']

********* Search next feature on level <4> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 66.6792880559
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 53.4465321134
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_NonPSelements --> ECR value: 62.9319912062
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 62.5376658881
	Unqualified candidate feature skipped ~
* Candidate feature: difficultProblemCountWE --> ECR valu


  0% |                                                                        |


	Qualified candidate feature added +
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!




	Remove feature probIndexinLevel
Max ECR touched so far: 66.9452837723 with 5 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_F1Score', 'probAlternate']
Highest valid ECR so far: 66.9452837723 with 5 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_F1Score', 'probAlternate']

********* Search next feature on level <5> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 66.9452837723
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 53.6275343326
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_NonPSelements --> ECR value: 63.1595789454
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 63.0097019379
	Unqualified candidate feature skipped ~
* Candidate feature: difficultProblemCountWE --> ECR val

100% |########################################################################|
  0% |                                                                        |


	Qualified candidate feature added +
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!
	Remove feature probIndexinLevel
!!!Better optimal feature subset is discovered!!!

 14% |##########                                                              | 28% |####################                                                    |


	Remove feature probAlternate
!!!Better optimal feature subset is discovered!!!

 42% |##############################                                          | 57% |#########################################                               |


	Remove feature probAlternate
Max ECR touched so far: 73.423435666 with 6 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_F1Score', 'probAlternate', 'cumul_BlankRatio']
Highest valid ECR so far: 73.423435666 with 6 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_F1Score', 'probAlternate', 'cumul_BlankRatio']

********* Search next feature on level <6> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 73.423435666
	Unqualified candidate feature skipped ~
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 56.9865531622
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_NonPSelements --> ECR value: 56.4284406878
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 71.8788362488
	Unqualified candidate feature skipped ~
* Candidate featur

 71% |###################################################                     | 85% |#############################################################           |100% |########################################################################|
  0% |                                                                        |


	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!
	Remove feature probAlternate
!!!Better optimal feature subset is discovered!!!

  8% |######                                                                  | 16% |############                                                            |


	Remove feature cumul_F1Score
!!!Better optimal feature subset is discovered!!!

 25% |##################                                                      | 33% |########################                                                |


	Remove feature probAlternate
Max ECR touched so far: 90.1873888851 with 6 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_F1Score', 'cumul_BlankRatio', 'cumul_MorphCount']
Highest valid ECR so far: 90.1873888851 with 6 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_F1Score', 'cumul_BlankRatio', 'cumul_MorphCount']

********* Search next feature on level <6> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 90.1873888851
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 66.178669515
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_NonPSelements --> ECR value: 74.162845116
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 95.7325857267
	Qualified candidate feature added +
* Candidate feature

100% |########################################################################|



	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!

 15% |##########                                                              | 20% |##############                                                          |


	Remove feature cumul_F1Score
!!!Better optimal feature subset is discovered!!!

 25% |##################                                                      | 30% |#####################                                                   | 35% |#########################                                               | 40% |############################                                            | 45% |################################                                        | 50% |####################################                                    | 55% |#######################################                                 |


	Remove feature cumul_F1Score
Max ECR touched so far: 194.014950247 with 6 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS']
Highest valid ECR so far: 194.014950247 with 6 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS']

********* Search next feature on level <6> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 194.014950247
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 117.652159161
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_NonPSelements --> ECR value: 194.437514986
	Qualified candidate feature added +
* Candidate feature: cumul_DirectProofActionCount --> ECR value: 177.627364939
	Unqualified candidate f

 60% |###########################################                             | 65% |##############################################                          | 70% |##################################################                      | 75% |######################################################                  | 80% |#########################################################               | 85% |#############################################################           | 90% |################################################################        | 95% |####################################################################    |100% |########################################################################|
  0% |                                                                        |


	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!




	Remove feature probIndexinLevel
!!!Better optimal feature subset is discovered!!!

 53% |######################################                                  | 61% |############################################                            |


	Remove feature cumul_BlankRatio
Max ECR touched so far: 198.485472067 with 7 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'easyProblemCountSolved']
Highest valid ECR so far: 198.485472067 with 7 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'easyProblemCountSolved']

********* Search next feature on level <7> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 198.485472067
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 119.623504075
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_NonPSelements --> ECR value: 197.383164293
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_DirectProofAction

 69% |#################################################                       | 76% |#######################################################                 | 84% |############################################################            | 92% |##################################################################      |100% |########################################################################|
  0% |                                                                        |


	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!
	Remove feature probIndexinLevel
!!!Better optimal feature subset is discovered!!!

 20% |##############                                                          | 40% |############################                                            |


	Remove feature easyProblemCountSolved
!!!Better optimal feature subset is discovered!!!
	Remove feature easyProblemCountSolved
Max ECR touched so far: 216.437100284 with 7 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount']
Highest valid ECR so far: 216.437100284 with 7 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'CurrPro_avgProbTimeWE', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount']

********* Search next feature on level <7> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 216.437100284
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 112.315795146
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_NonPSelements --> ECR value: 216.084008

100% |########################################################################|



	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!




	Remove feature probIndexinLevel
!!!Better optimal feature subset is discovered!!!

 52% |######################################                                  | 58% |##########################################                              | 64% |##############################################                          | 70% |##################################################                      | 76% |#######################################################                 | 82% |###########################################################             | 88% |###############################################################         | 94% |###################################################################     |100% |########################################################################|


	Remove feature CurrPro_avgProbTimeWE
Max ECR touched so far: 263.090783472 with 7 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV']
Highest valid ECR so far: 263.090783472 with 7 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV']

********* Search next feature on level <7> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 263.090783472
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeWE --> ECR value: 263.970420243
	Qualified candidate feature added +
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 106.187052457
	Unqualified candidate feature skipped ~
* Candidate feature: cumul_NonPSelements --> ECR value: 264.4





	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!




	Remove feature cumul_BlankRatio
Max ECR touched so far: 267.268001055 with 8 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV', 'NonPSelements']
Highest valid ECR so far: 267.268001055 with 8 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV', 'NonPSelements']

********* Search next feature on level <8> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 267.268001055
	Unqualified candidate feature skipped ~
* Candidate feature: CurrPro_avgProbTimeWE --> ECR value: 256.44385551
	Unqualified candidate feature skipped ~
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 99.1398363862
	Unqualified candidate feature skipped ~
* Candidate feature: cumu


  0% |                                                                        |


	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!
	Remove feature NonPSelements
!!!Better optimal feature subset is discovered!!!

 10% |#######                                                                 | 20% |##############                                                          | 30% |#####################                                                   |


	Remove feature NonPSelements
!!!Better optimal feature subset is discovered!!!

 40% |############################                                            | 50% |####################################                                    | 60% |###########################################                             | 70% |##################################################                      | 80% |#########################################################               | 90% |################################################################        |100% |########################################################################|


	Remove feature NonPSelements
Max ECR touched so far: 347.628993864 with 8 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV', 'cumul_TotalWETime']
Highest valid ECR so far: 347.628993864 with 8 optimal features as:
['probIndexinLevel', 'difficultProblemCountSolved', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV', 'cumul_TotalWETime']

********* Search next feature on level <8> *********
>>> Select candidate feature set ...
* Candidate feature: cumul_probIndexinLevel --> ECR value: 347.628993864
	Unqualified candidate feature skipped ~
* Candidate feature: CurrPro_avgProbTimeWE --> ECR value: 347.499505693
	Unqualified candidate feature skipped ~
* Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value: 271.392740244
	Unqualified candidate feature skipped ~
* Candidate feature


  0% |                                                                        |


	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!
	Remove feature probIndexinLevel
Max ECR touched so far: 351.630883377 with 8 optimal features as:
['difficultProblemCountSolved', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV', 'cumul_TotalWETime', 'cumul_symbolicRepresentationCount']
Highest valid ECR so far: 351.630883377 with 8 optimal features as:
['difficultProblemCountSolved', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV', 'cumul_TotalWETime', 'cumul_symbolicRepresentationCount']

********* Search next feature on level <8> *********
>>> Select candidate feature set ...
* Candidate feature: probIndexinLevel --> ECR value: 356.323704852
	Qualified candidate feature added +
* Candidate feature: cumul_probIndexinLevel --> ECR value: 356.323704852
	Qualified candidate feature added +
*

100% |########################################################################|



	Unqualified candidate feature skipped ~
>>> test subset ECR ... 
!!!Better optimal feature subset is discovered!!!




	Remove feature cumul_BlankRatio
Max ECR touched so far: 353.46045925 with 9 optimal features as:
['difficultProblemCountSolved', 'cumul_BlankRatio', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV', 'cumul_TotalWETime', 'cumul_symbolicRepresentationCount', 'cumul_easyProbAlternate']
Highest valid ECR so far: 352.596993622 with 8 optimal features as:
['difficultProblemCountSolved', 'cumul_MorphCount', 'CurrPro_avgProbTimeDeviationPS', 'cumul_FDActionCount', 'ruleScoreEQUIV', 'cumul_TotalWETime', 'cumul_symbolicRepresentationCount', 'cumul_AppRatio']


100% |########################################################################|


In [9]:
# Switch the working result over to the "valid" best tracked by the search above:
# its ECR and its feature list. The list is rebound, not copied, so
# optimal_feature_set and valid_optimal_feature_set refer to the same object.
max_total_ECR, optimal_feature_set = valid_max_total_ECR, valid_optimal_feature_set
# Seconds elapsed since start_time was recorded (reported by a later cell).
time_cost = time.time() - start_time

In [10]:
# Induce the MDP policy over the discretized data restricted to the selected
# features. Per the output below, the call prints the policy as
# "state -> action, value-function" lines; its return value is displayed as the
# cell result because the call is the last expression in the cell.
# NOTE(review): the displayed value equals the "Highest valid ECR" logged by the
# search, so it is presumably the ECR of optimal_feature_set -- confirm in
# MDP_function.
mf.induce_policy_MDP2(all_data_discretized, optimal_feature_set)

Policy: 
state -> action, value-function
1:0:1:0:1:1:0:0 -> PS, 330.173602171
1:1:1:0:0:1:0:0 -> PS, 458.131904362
0:1:1:0:1:0:0:0 -> PS, 375.998449566
0:1:0:0:0:0:1:0 -> PS, 288.343481064
1:1:1:0:0:0:0:0 -> PS, 375.648620583
0:1:0:1:1:1:0:1 -> WE, 320.037996657
1:1:0:0:0:1:0:1 -> PS, 410.042397188
1:1:0:1:0:1:1:0 -> PS, 397.232434409
1:1:0:1:0:1:1:1 -> PS, 474.581761618
1:1:1:1:1:1:0:1 -> PS, 322.644915992
1:0:0:0:1:1:0:0 -> WE, 354.457572707
0:1:1:0:1:0:0:1 -> PS, 333.524363479
0:1:1:1:0:0:0:0 -> WE, 281.409430848
0:1:1:1:0:0:0:1 -> WE, 307.272125115
0:0:1:0:0:0:1:1 -> PS, 379.344989984
0:0:1:0:0:0:1:0 -> PS, 285.503629778
0:0:1:0:1:0:1:0 -> WE, 316.005661291
1:0:0:1:0:1:1:0 -> PS, 451.861994535
1:1:0:0:1:0:0:0 -> PS, 190.904478465
1:1:0:0:1:0:0:1 -> WE, 237.921945599
0:0:1:0:1:1:0:0 -> PS, 393.842377807
1:1:1:0:1:1:1:0 -> PS, 263.771794927
0:0:0:0:0:1:0:1 -> PS, 288.06947183
0:1:0:0:0:1:0:1 -> WE, 272.966878368
0:1:1:1:1:1:0:0 -> PS, 347.802891157
0:1:1:1:1:1:0:1 -> PS, 374.32445230

352.59699362154498

In [11]:
# Show the selected features as a bracketed, comma-separated list.
# print is called with one parenthesised argument so the cell produces the same
# output on a Python 2 kernel and also runs on Python 3, where the bare print
# statement is a SyntaxError.
print("* Optimal feature set:\n[" + ', '.join(optimal_feature_set) + ']')

* Optimal feature set:
[difficultProblemCountSolved, cumul_MorphCount, CurrPro_avgProbTimeDeviationPS, cumul_FDActionCount, ruleScoreEQUIV, cumul_TotalWETime, cumul_symbolicRepresentationCount, cumul_AppRatio]


In [12]:
# Report how long feature selection took, using time_cost computed earlier.
# print is called with one parenthesised argument so the cell produces the same
# output on a Python 2 kernel and also runs on Python 3, where the bare print
# statement is a SyntaxError.
print("* Time cost in feature selection: " + str(time_cost) + ' seconds')

* Time cost in feature selection: 12339.9450691 seconds
