In [1]:
import MDP_function as mf
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
import scipy.stats as stats
import random as rnd
%matplotlib inline

In [2]:
# Load the complete original data set from a CSV file.
# The path is relative, so the file is looked up in the notebook's working directory.
data_file = "MDP_Original_data.csv"
original_data = pd.read_csv(data_file)

In [3]:
# get names of all columns in original dataset
#original_data.columns.tolist()

In [4]:
# Select every candidate feature column from the original data set. The .loc label
# slice is inclusive on both ends, so it covers 'Interaction' through
# 'CurrPro_medianProbTime'.
feature_data = original_data.loc[:, 'Interaction':'CurrPro_medianProbTime']
# Column names of that slice, in file order; later cells iterate over this list.
feature_space = feature_data.columns.tolist()
#print feature_space

In [5]:
# Settings and accumulators for the correlation-based feature selection.
MAX_NUM_OF_FEATURES = 8      # selection stops once this many features are chosen
ECR_list = []                # ECR scores collected while evaluating candidates
optimal_feature_set = []     # names of the features chosen so far

In [6]:
def feature_discretization(feature_data, maxLevel=2):
    """Binarize a feature column at its median when it needs discretizing.

    A column is discretized when it contains at least one float value, or when
    it has more than ``maxLevel`` distinct values. Values less than or equal to
    the median become 0, everything else (including NaN) becomes 1.

    Parameters
    ----------
    feature_data : pandas.Series
        Values of a single feature.
    maxLevel : int, optional
        Largest number of distinct values a non-float column may have and
        still be returned unchanged.

    Returns
    -------
    pandas.Series
        ``feature_data`` itself when no discretization is needed; otherwise a
        new integer Series of 0/1 that keeps the index of ``feature_data``, so
        it aligns correctly when assigned back into a DataFrame.
    """
    has_float = any(isinstance(value, float) for value in feature_data)
    needs_split = has_float or len(feature_data.unique()) > maxLevel
    if not needs_split:
        return feature_data

    median = feature_data.median()
    # Written as "not (x <= median)" instead of "x > median" so that NaN,
    # for which every comparison is False, keeps mapping to 1.
    return (~(feature_data <= median)).astype(int)

In [7]:
# Discretize every feature column at its median.
# The base frame is an explicit copy of the 'student'..'reward' columns: adding
# columns to a bare .loc slice of original_data is what makes pandas emit
# SettingWithCopyWarning, and the copy also guarantees original_data is untouched.
all_data_discretized = original_data.loc[:, "student":"reward"].copy()
for ft in feature_space:
    all_data_discretized[ft] = feature_discretization(original_data.loc[:, ft])

In [8]:
# Initialization: score every feature on its own so the single best one
# (largest ECR) can seed the optimal feature set.
# The list is rebuilt here so re-running this cell does not append a second
# copy of the scores; position i corresponds to feature_space[i].
ECR_list = list()
for ft in feature_space:
    selected_feature = [ft]
    try:
        ECR_list.append(mf.compute_ECR(all_data_discretized, selected_feature))
    except Exception as err:
        # Best effort: a feature whose ECR cannot be computed scores 0.0 so the
        # positions stay aligned with feature_space. The failure is reported
        # instead of being silently swallowed, and KeyboardInterrupt /
        # SystemExit are no longer caught.
        print("compute_ECR failed for feature " + str(ft) + ": " + repr(err))
        ECR_list.append(0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
# Seed the selection with the feature whose single-feature ECR is largest
# (first occurrence wins on ties).
best_initial_position = ECR_list.index(max(ECR_list))
optimal_feature_set.append(feature_space[best_initial_position])
print(optimal_feature_set)

['difficultProblemCountSolved']


In [10]:
# Display the largest value currently held in ECR_list.
max(ECR_list)

27.495904195466874

In [11]:
#mf.compute_ECR(all_data_discretized, optimal_feature_set)

In [12]:
def compute_correlation(dataset, feature_set, feature):
    """Sum the Pearson correlations between ``feature`` and each selected feature.

    Parameters
    ----------
    dataset : mapping of column name -> sequence (e.g. a pandas.DataFrame)
        Source of the columns that are compared.
    feature_set : iterable of str
        Names of the already-selected columns.
    feature : str
        Name of the candidate column.

    Returns
    -------
    number
        The signed sum of the Pearson r values (no absolute value is taken),
        or 0 when ``feature_set`` is empty.
    """
    coefficients = [
        stats.pearsonr(dataset[chosen], dataset[feature])[0]
        for chosen in feature_set
    ]
    return sum(coefficients)

In [13]:
# Feature selection iterations (greedy forward selection).
# Each round: rank the remaining features by their summed correlation with the
# features already chosen, evaluate the ECR of the topK lowest-ranked
# candidates, and keep the one that yields the highest ECR.
topK = 2  # number of lowest-correlation candidates evaluated per round
while len(optimal_feature_set) < MAX_NUM_OF_FEATURES:
    # Keep feature_space order (instead of going through a set) so that ties in
    # the stable sort below are broken the same way on every run.
    remain_feature_space = [ft for ft in feature_space if ft not in optimal_feature_set]
    if not remain_feature_space:
        # Fewer features available than MAX_NUM_OF_FEATURES: nothing left to add.
        break
    corr_list = [[ft, compute_correlation(all_data_discretized, optimal_feature_set, ft)]
                 for ft in remain_feature_space]
    # pearsonr yields NaN for a constant column; NaN is not orderable, so such
    # candidates are ranked after every candidate with a real correlation.
    ranked = sorted([entry for entry in corr_list if not np.isnan(entry[1])],
                    key=lambda x: x[1])
    unranked = [entry for entry in corr_list if np.isnan(entry[1])]
    top_features = [entry[0] for entry in (ranked + unranked)[:topK]]
    ECR_list = list()
    for ft in top_features:
        selected_feature = list(optimal_feature_set)
        selected_feature.append(ft)
        ECR_with_ft_added = mf.compute_ECR(all_data_discretized, selected_feature)
        # Single-argument print(...) behaves the same under Python 2 and 3.
        print("Candidate feature: "+ ft +" --> ECR value:"+ str(ECR_with_ft_added))
        ECR_list.append([ft, ECR_with_ft_added])
    # max() returns the first maximal entry, matching the previous
    # sorted(..., reverse=True)[0] on ties.
    best_next_feature_and_ECR = max(ECR_list, key=lambda x: x[1])
    best_next_feature, bestECR = best_next_feature_and_ECR[0], best_next_feature_and_ECR[1]
    optimal_feature_set.append(best_next_feature)

Candidate feature: BlankRatio --> ECR value:33.4743439469
Candidate feature: NextStepClickCountWE --> ECR value:39.7668341704
Candidate feature: cumul_easyProblemCountSolved --> ECR value:56.5619449309
Candidate feature: easyProblemCountSolved --> ECR value:39.8183677068
Candidate feature: cumul_SystemInfoHintCount --> ECR value:48.5516326293
Candidate feature: cumul_englishSymbolicSwitchCount --> ECR value:51.3223758798
Candidate feature: CurrPro_medianProbTime --> ECR value:45.830173095
Candidate feature: CurrPro_avgProbTime --> ECR value:55.4164742851
Candidate feature: Level --> ECR value:60.482117616
Candidate feature: difficultProblemCountWE --> ECR value:46.346391793
Candidate feature: CurrPro_avgProbTimeWE --> ECR value:40.1419223844
Candidate feature: CurrPro_avgProbTimeDeviationWE --> ECR value:48.4229954448
Candidate feature: cumul_symbolicRepresentationCount --> ECR value:46.4716826879
Candidate feature: probIndexinLevel --> ECR value:44.8321828239


  r = r_num / r_den


In [14]:
# Display the selected features in the order they were added.
optimal_feature_set

['difficultProblemCountSolved',
 'NextStepClickCountWE',
 'cumul_easyProblemCountSolved',
 'cumul_englishSymbolicSwitchCount',
 'CurrPro_avgProbTime',
 'Level',
 'CurrPro_avgProbTimeDeviationWE',
 'cumul_symbolicRepresentationCount']

In [15]:
# Run mf.induce_policy_MDP2 on the discretized data with the selected features.
# Per the recorded output it prints a "state -> action, value-function" table
# and the cell displays a single number (presumably the ECR of the feature
# set -- TODO confirm against MDP_function).
mf.induce_policy_MDP2(all_data_discretized, optimal_feature_set)

Policy: 
state -> action, value-function
0:1:0:0:1:0:1:0 -> PS, 52.4616738778
0:1:0:0:0:0:1:0 -> PS, 56.5044187532
0:1:0:0:1:0:1:1 -> PS, 26.4918464521
0:0:0:0:1:0:0:1 -> WE, 46.6360401318
1:0:0:0:1:0:0:1 -> PS, 54.8589414807
0:0:1:0:1:0:0:0 -> PS, 48.8754235743
0:0:1:0:0:0:0:1 -> WE, 54.3071011174
1:0:0:0:0:1:1:0 -> PS, 72.6326422387
0:1:0:0:1:1:1:0 -> PS, 64.7792934451
1:0:0:0:1:1:0:1 -> PS, 73.1838590003
1:0:0:0:0:1:0:0 -> PS, 87.6746485439
0:1:0:0:0:1:1:0 -> PS, 68.9349697427
0:1:0:0:0:1:0:0 -> PS, 61.9126419239
0:1:0:1:1:0:1:0 -> WE, 36.7823726553
0:1:0:1:0:0:1:0 -> PS, 40.8703921014
0:0:0:1:1:0:1:0 -> WE, 39.609617492
0:0:0:1:1:0:0:0 -> WE, 47.7371727013
1:0:0:0:1:1:1:0 -> PS, 103.099190796
0:0:0:0:1:0:1:1 -> WE, 47.214530969
1:0:0:0:1:0:1:0 -> PS, 63.6765967543
1:0:0:0:0:0:1:0 -> PS, 70.7528544752
0:0:0:0:1:1:1:0 -> PS, 52.1574427016
1:0:0:0:1:1:1:1 -> PS, 78.9062223644
0:0:0:0:0:0:1:0 -> PS, 47.2312192585
0:1:0:0:1:0:0:0 -> PS, 49.2841921128
0:0:0:0:0:1:1:0 -> WE, 57.8913413655

46.47168268789509

In [16]:
#mf.compute_ECR(all_data_discretized, optimal_feature_set)

In [17]:
#mf.compute_ECR(all_data_discretized, ['difficultProblemCountSolved', 'InterfaceErrorCount'])