In [1]:
import MDP_function as mf
import numpy as np 
import pandas as pd
import matplotlib.pylab as plt
import scipy.stats as stats
import random as rnd
%matplotlib inline

In [2]:
# read in all original data set
data_file = "MDP_Original_data.csv"
original_data = pd.read_csv(data_file)

In [3]:
# select all features from the original dataset and get features names as feature space indexes
feature_data = original_data.loc[:, 'Interaction':'CurrPro_medianProbTime']
feature_space = feature_data.columns.tolist()

In [4]:
# helper functions

def feature_discretization_by_median(feature_data, maxLevel=2): 
    # discretize continuous feature values into integers of no more than max levels
    isFloat = any(map(lambda x: isinstance(x, float), feature_data)) # check if it contain float type
    if not isFloat:
        isOverLevel = len(feature_data.unique())>maxLevel # check if it is within max levels
    if isFloat or isOverLevel: # discretize and reduce levels using median
        median = feature_data.median()
        feature_vals = map(lambda x: 0 if x<=median else 1, feature_data)
        feature_data = pd.Series(feature_vals, dtype=int)
    return feature_data

def compute_correlation(dataset, feature_set, feature):
    corr_sum = 0
    for ft in feature_set:
        corr, p_val = stats.pearsonr(dataset[ft], dataset[feature])
        corr_sum += corr
    return corr_sum

In [5]:
# initialize parameters and data structures for correlation-based feature selection algorithm
MAX_NUM_OF_FEATURES = 8
ECR_list = list()
optimal_feature_set = list()
max_total_ECR = 0

In [6]:
# discretization feature values by median
all_data_discretized = original_data.loc[:, "student":"reward"]
for i, ft in enumerate(feature_space):
    ft_data = original_data.loc[:, ft]
    all_data_discretized[ft] = feature_discretization_by_median(ft_data)

In [7]:
# initialization to find the best feature with max ECR
for ft in feature_space:
    selected_feature = [ft]
    ECR_list.append(mf.compute_ECR(all_data_discretized, selected_feature))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
# initialize the optimal feature set with feature of highest ECR
max_total_ECR = max(ECR_list)
optimal_feature_set.append(feature_space[ECR_list.index(max_total_ECR)])
print(optimal_feature_set)

['difficultProblemCountSolved']


In [9]:
# feature selection iterations
while (len(optimal_feature_set) < MAX_NUM_OF_FEATURES):
    print "####### Search next feature on iteration: "+str(len(optimal_feature_set))+" #######"
    remain_feature_space = list(set(feature_space) - set(optimal_feature_set)) # features not in optimal feature set
    # feature selection heuristics
    corr_list = list() # correlation between new feature and optimal feature set
    for i,ft in enumerate(remain_feature_space):
        corr_list.append([ft, compute_correlation(all_data_discretized, optimal_feature_set, ft)])
    topK = 2 # choose top-K candidate features based on feature similarity metrics
    top_features = map(lambda x: x[0], sorted(corr_list, key=lambda x: x[1], reverse=False)[:topK])
    # select optimal feature from candidate set based on ECR value
    ECR_list = list() # ECR values of optimal feature set with new candidate feature
    for ft in top_features:
        selected_feature = list(optimal_feature_set)
        selected_feature.append(ft) # combine candidate feature to optimal feature set
        ECR_with_ft_added = mf.compute_ECR(all_data_discretized, selected_feature)
        print "Candidate feature: "+ ft +" --> ECR value:"+ str(ECR_with_ft_added)
        if (ECR_with_ft_added > max_total_ECR):
            print "Qualified candidate feature added +"
            ECR_list.append([ft, ECR_with_ft_added])
        else:
            print "Unqualified candidate feature skipped -"
    if (not ECR_list): # if no new qualified candidate feature, keep searching
        continue
    else:
        best_next_feature, bestECR = sorted(ECR_list, key=lambda x: x[1], reverse=True)[0]
        optimal_feature_set.append(best_next_feature)
        max_total_ECR = bestECR
    # check potential for improving ECR over all subsets of optimal feature set
    size_of_optimal_feature_set = len(optimal_feature_set)
    if (size_of_optimal_feature_set >= MAX_NUM_OF_FEATURES): 
        ECR_list = map(lambda ft_index: mf.compute_ECR(all_data_discretized, 
                        optimal_feature_set[:ft_index]+optimal_feature_set[ft_index+1:]), 
                        range(size_of_optimal_feature_set-1)) # calculate ECR for optimal feature subsets
        max_ECR_in_subset = max(ECR_list)
        if (max_ECR_in_subset >= max_total_ECR): # choose subset with ECR no less than highest overall ECR
            print "Better optimal feature subset is discovered!"
            ft_index_of_max_subset_ECR = ECR_list.index(max_ECR_in_subset)
            optimal_feature_set = optimal_feature_set.pop(ft_index_of_max_subset_ECR)
            max_total_ECR = max_ECR_in_subset
    # keep record of the highest ECR and its optimal feature set so far
    print "Highest ECR so far: "+str(max_total_ECR)+" with optimal feature set as:"
    print optimal_feature_set

####### Search next feature on iteration: 1 #######
Candidate feature: BlankRatio --> ECR value:33.4743439469
Qualified candidate feature added +
Candidate feature: NextStepClickCountWE --> ECR value:39.7668341704
Qualified candidate feature added +
Highest ECR so far: 39.7668341704 with optimal feature set as 
[difficultProblemCountSolved NextStepClickCountWE]
####### Search next feature on iteration: 2 #######
Candidate feature: cumul_easyProblemCountSolved --> ECR value:56.5619449309
Qualified candidate feature added +
Candidate feature: easyProblemCountSolved --> ECR value:39.8183677068
Qualified candidate feature added +
Highest ECR so far: 56.5619449309 with optimal feature set as 
[difficultProblemCountSolved NextStepClickCountWE cumul_easyProblemCountSolved]
####### Search next feature on iteration: 3 #######
Candidate feature: cumul_SystemInfoHintCount --> ECR value:48.5516326293
Unqualified candidate feature skipped -
Candidate feature: cumul_englishSymbolicSwitchCount --> EC

  r = r_num / r_den


KeyboardInterrupt: 

In [10]:
print optimal_feature_set

['difficultProblemCountSolved', 'NextStepClickCountWE', 'cumul_easyProblemCountSolved']


In [11]:
mf.induce_policy_MDP2(all_data_discretized, optimal_feature_set)

Policy: 
state -> action, value-function
0:1:0 -> PS, 59.4199022203
0:0:0 -> WE, 53.5769673175
1:0:0 -> PS, 70.0346624956
0:0:1 -> WE, 46.185950439
1:1:0 -> PS, 69.4604801916
0:1:1 -> PS, 35.9431772767
1:0:1 -> WE, 0.0
ECR value: 56.5619449309


56.561944930883136