In [1]:
import MDP_function as mf
import numpy as np 
import pandas as pd
import matplotlib.pylab as plt
import scipy.stats as stats
import random as rnd
%matplotlib inline

In [2]:
# read in all original data set
data_file = "MDP_Original_data.csv"
original_data = pd.read_csv(data_file)

In [3]:
# select all features from the original dataset and get features names as feature space indexes
feature_data = original_data.loc[:, 'Interaction':'CurrPro_medianProbTime']
feature_space = feature_data.columns.tolist()

In [4]:
# helper functions

def feature_discretization(feature_data, maxLevel=2): 
    # discretize continuous feature values into integers of no more than max levels
    isFloat = any(map(lambda x: isinstance(x, float), feature_data)) # check if it contain float type
    if not isFloat:
        isOverLevel = len(feature_data.unique())>maxLevel # check if it is within max levels
    if isFloat or isOverLevel: # discretize and reduce levels using median
        median = feature_data.median()
        feature_vals = map(lambda x: 0 if x<=median else 1, feature_data)
        feature_data = pd.Series(feature_vals, dtype=int)
    return feature_data

def compute_correlation(dataset, feature_set, feature):
#     corr_sum = 0
#     for ft in feature_set:
#         corr, p_val = stats.pearsonr(dataset[ft], dataset[feature])
#         corr_sum += corr
#     return corr_sum
    return rnd.random()

In [5]:
# initialize parameters and data structures for correlation-based feature selection algorithm
MAX_NUM_OF_FEATURES = 2
ECR_list = list()
optimal_feature_set = list()

In [6]:
# discretization feature values by median
all_data_discretized = original_data.loc[:, "student":"reward"]
for i, ft in enumerate(feature_space):
    ft_data = original_data.loc[:, ft]
    all_data_discretized[ft] = feature_discretization(ft_data)

In [7]:
# initialization to find the best feature with max ECR
for ft in feature_space:
    selected_features = [ft]
#     try:
#         ECR_list.append(mf.compute_ECR(all_data_discretized, selected_feature))
#     except:
    ECR_list.append(0.0)

In [8]:
# initialize the optimal feature set with feature of highest ECR
optimal_feature_set.append(feature_space[ECR_list.index(max(ECR_list))])
print(optimal_feature_set)

['Interaction']


In [9]:
# feature selection iterations
while (len(optimal_feature_set) < MAX_NUM_OF_FEATURES):
    #print len(optimal_feature_set)
    remain_feature_space = list(set(feature_space) - set(optimal_feature_set)) # features not in optimal feature set
    corr_list = list() # correlation between new feature and optimal feature set
    for i,ft in enumerate(remain_feature_space):
        corr_list.append([ft, compute_correlation(all_data_discretized, optimal_feature_set, ft)])
    topK = 2 # choose top-K candidate features based on feature similarity metrics
    top_features = map(lambda x: x[0], sorted(corr_list, key=lambda x: x[1], reverse=False)[:topK])
    ECR_list = list() # ECR values of optimal feature set with new candidate feature
    for ft in top_features:
        selected_feature = list(optimal_feature_set)
        selected_feature.append(ft) # combine candidate feature to optimal feature set
        ECR_with_ft_added = mf.compute_ECR(all_data_discretized, selected_feature)
        ECR_list.append([ft, ECR_with_ft_added])
        print "Candidate feature: "+ ft +" --> ECR value:"+ str(ECR_with_ft_added)
    best_next_feature, bestECR = sorted(ECR_list, key=lambda x: x[1], reverse=True)[0]
    optimal_feature_set.append(best_next_feature)

[['cumul_deletedApp', 4.0173507069031302]]
[['cumul_deletedApp', 4.0173507069031302], ['difficultProblemCountWE', 17.938717295288942]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
print optimal_feature_set

['Interaction', 'difficultProblemCountWE']


In [12]:
mf.induce_policy_MDP2(all_data_discretized, optimal_feature_set)

Policy: 
state -> action, value-function
1:0 -> WE, 18.1849003631
0:0 -> WE, 17.6456422146
0:1 -> PS, 21.176049136
1:1 -> WE, 18.9090058985
ECR value: 17.9387172953


17.938717295288942