In [1]:
import MDP_function as mf
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

In [2]:
# Load the complete, unmodified data set from the CSV file next to this notebook.
data_file = "MDP_Original_data.csv"
original_data = pd.read_csv(filepath_or_buffer=data_file)

In [3]:
# Show every column name of the original data set as a plain Python list.
list(original_data.columns)

['student',
 'currProb',
 'course',
 'session',
 'priorTutorAction',
 'reward',
 'Interaction',
 'hintCount',
 'TotalTime',
 'TotalPSTime',
 'TotalWETime',
 'avgstepTime',
 'avgstepTimePS',
 'stepTimeDeviation',
 'symbolicRepresentationCount',
 'englishSymbolicSwitchCount',
 'Level',
 'probDiff',
 'difficultProblemCountSolved',
 'difficultProblemCountWE',
 'easyProblemCountSolved',
 'easyProblemCountWE',
 'probAlternate',
 'easyProbAlternate',
 'RuleTypesCount',
 'UseCount',
 'PrepCount',
 'MorphCount',
 'OptionalCount',
 'NewLevel',
 'SolvedPSInLevel',
 'SeenWEinLevel',
 'probIndexinLevel',
 'probIndexPSinLevel',
 'InterfaceErrorCount',
 'RightApp',
 'WrongApp',
 'WrongSemanticsApp',
 'WrongSyntaxApp',
 'PrightAppRatio',
 'RrightAppRatio',
 'F1Score',
 'FDActionCount',
 'BDActionCount',
 'DirectProofActionCount',
 'InDirectProofActionCount',
 'actionCount',
 'UseWindowInfo',
 'NonPSelements',
 'AppCount',
 'AppRatio',
 'hintRatio',
 'BlankRatio',
 'HoverHintCount',
 'SystemInfoHintCou

In [4]:
# Take the contiguous block of columns from 'Interaction' through 'CurrPro_medianProbTime'
# (label slicing with .loc includes both end columns) as the candidate features; their names,
# in column order, form the feature space used by the rest of the notebook.
feature_data = original_data.loc[:, 'Interaction':'CurrPro_medianProbTime']
feature_space = feature_data.columns.tolist()
# Parenthesised single-argument print produces the same output on Python 2 and Python 3.
print(feature_space)

['Interaction', 'hintCount', 'TotalTime', 'TotalPSTime', 'TotalWETime', 'avgstepTime', 'avgstepTimePS', 'stepTimeDeviation', 'symbolicRepresentationCount', 'englishSymbolicSwitchCount', 'Level', 'probDiff', 'difficultProblemCountSolved', 'difficultProblemCountWE', 'easyProblemCountSolved', 'easyProblemCountWE', 'probAlternate', 'easyProbAlternate', 'RuleTypesCount', 'UseCount', 'PrepCount', 'MorphCount', 'OptionalCount', 'NewLevel', 'SolvedPSInLevel', 'SeenWEinLevel', 'probIndexinLevel', 'probIndexPSinLevel', 'InterfaceErrorCount', 'RightApp', 'WrongApp', 'WrongSemanticsApp', 'WrongSyntaxApp', 'PrightAppRatio', 'RrightAppRatio', 'F1Score', 'FDActionCount', 'BDActionCount', 'DirectProofActionCount', 'InDirectProofActionCount', 'actionCount', 'UseWindowInfo', 'NonPSelements', 'AppCount', 'AppRatio', 'hintRatio', 'BlankRatio', 'HoverHintCount', 'SystemInfoHintCount', 'NextStepClickCountWE', 'PreviousStepClickCountWE', 'deletedApp', 'ruleScoreMP', 'ruleScoreDS', 'ruleScoreSIMP', 'ruleScore

In [5]:
# Set up the state used by the correlation-based feature selection algorithm:
# the size limit for the selected set, plus empty containers for scores and chosen features.
MAX_NUM_OF_FEATURES = 8
ECR_list = []
optimal_feature_set = []

In [6]:
def feature_discretization(feature_data, maxLevel=2):
    """Binarize a feature column around its median when it is continuous or has too many levels.

    Parameters
    ----------
    feature_data : pandas.Series
        Values of a single feature.
    maxLevel : int, optional
        Largest number of distinct values a non-float feature may have and still be
        returned untouched (default 2).

    Returns
    -------
    pandas.Series
        ``feature_data`` itself when it holds no floats and has at most ``maxLevel``
        distinct values. Otherwise an integer Series with 0 where the value is
        ``<=`` the median and 1 everywhere else, carrying the same index as the
        input so that assigning it to a DataFrame column aligns row by row.
    """
    # Float detection: the dtype check covers every float width (e.g. float32, whose
    # elements are not instances of Python float); the element scan keeps object-dtype
    # columns that contain floats classified as float, as before.
    is_float = feature_data.dtype.kind == 'f' or any(
        isinstance(value, float) for value in feature_data
    )
    if not is_float and len(feature_data.unique()) <= maxLevel:
        return feature_data  # already discrete enough

    median = feature_data.median()
    # "not (x <= median)" rather than "x > median": a missing value compares False
    # against the median, so it lands in level 1 exactly as the element-wise rule
    # "0 if x <= median else 1" dictates.
    return (~(feature_data <= median)).astype(int)

In [None]:
# Build the discretized data set: start from the columns 'student' through 'reward'
# and add one median-discretized column per feature in feature_space.
# .copy() makes this an independent DataFrame; without it the frame is a slice of
# original_data and adding columns to it triggers pandas' SettingWithCopyWarning.
all_data_discretized = original_data.loc[:, "student":"reward"].copy()
for ft in feature_space:
    ft_data = original_data.loc[:, ft]
    all_data_discretized[ft] = feature_discretization(ft_data)
all_data_discretized.describe()

Unnamed: 0,session,reward,Interaction,hintCount,TotalTime,TotalPSTime,TotalWETime,avgstepTime,avgstepTimePS,stepTimeDeviation,...,cumul_NextStepClickCountWE,cumul_PreviousStepClickCountWE,cumul_deletedApp,CurrPro_NumProbRule,CurrPro_avgProbTime,CurrPro_avgProbTimePS,CurrPro_avgProbTimeDeviationPS,CurrPro_avgProbTimeWE,CurrPro_avgProbTimeDeviationWE,CurrPro_medianProbTime
count,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0,...,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0,4396.0
mean,2.872611,-0.357487,0.493631,0.199955,0.5,0.5,0.318926,0.5,0.203822,0.351911,...,0.421747,0.197452,0.453822,0.424704,0.476115,0.495905,0.494313,0.491811,0.468835,0.475205
std,2.514063,34.131956,0.500016,0.400011,0.500057,0.500057,0.466113,0.500057,0.402884,0.477621,...,0.493895,0.398122,0.49792,0.494354,0.499486,0.50004,0.500025,0.49999,0.499085,0.499442
min,1.0,-305.714286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.5,0.5,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,19.0,200.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Initialization step: score each feature on its own with mf.compute_ECR so the
# single best feature (max ECR) can be picked next.
# The list is rebuilt from scratch so that re-running this cell does not append a
# second copy of the scores; entry i always corresponds to feature_space[i].
ECR_list = []
for ft in feature_space:
    selected_feature = [ft]
    try:
        ecr = mf.compute_ECR(all_data_discretized, selected_feature)
    except Exception as err:
        # Best effort: a feature whose ECR cannot be computed scores 0.0, but the
        # failure is reported instead of being silently swallowed. Catching Exception
        # (not a bare except) lets KeyboardInterrupt still stop the loop.
        print("compute_ECR failed for feature %r (%s); using ECR 0.0" % (ft, err))
        ecr = 0.0
    ECR_list.append(ecr)
# Parenthesised single-argument print produces the same output on Python 2 and Python 3.
print(ECR_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
  span ) / _math.log(self.discount * k))


In [None]:
# Seed the selected set with the feature whose single-feature ECR is largest
# (first occurrence wins on ties, since list.index returns the earliest match).
best_single_ecr = max(ECR_list)
best_single_position = ECR_list.index(best_single_ecr)
optimal_feature_set.append(feature_space[best_single_position])

In [None]:
# Display the largest value currently stored in ECR_list.
max(ECR_list)