# About skills_detector.ipynb

This notebook detects skills in log files suchs as CVS.
This is a work in progress :)

In [1]:
%load_ext autoreload
%autoreload 1
%aimport utils_timeline_viz
from utils_timeline_viz import *
%matplotlib inline
matplotlib.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 25, 15
from matplotlib.backends.backend_pdf import PdfPages

## Quantitative CVS
### In table
First we extract all of their tables at all time points, analyze them for CVS and find the maximum number of values of a variable that was part of a control variable strategy instance.
For now, use of this skill relies on the following production rules (evaluated per variable):

* more than 2 successive records were done where only the variable was changed
* the outcome variable was different for both data points
* battery connected (caps)
* laser is on (beers)


In [37]:
def get_outcome_values(pts):
    outcomes = []
    for pt in pts:
        if "Charge" in pt.keys():
            outcomes.append(pt["Charge"])
        elif "Absorbance" in pt.keys():
            outcomes.append(pt["Absorbance"])
        else:
            print "OUTCOME MISSING"
    return outcomes

def detect_cvs_quant_in_table(table):
    cvs_variable = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}

    variable_changed = None
    new_var_changed = None
    number_of_changes = 1
    prev_point = None
    #data points in order of trial number, which is synonomous with 
    #order of capture since when you move data points around in the table,
    #they keep the same trial number
    for datapoint in sorted(table.iterkeys()): 
        curr_point = table[datapoint]
        if prev_point:
            values_of_2_points = get_values_per_variable([prev_point,curr_point])
            confounded = pts_are_confounded(values_of_2_points)
            outcomes = get_outcome_values([prev_point,curr_point])
            outcome1,outcome2 = outcomes[0],outcomes[1]
            #TO DO : ADD LASER ON AND BATTERY CONNECTED
            #we check that only one variable is changed,that the outcome values are different and that they are not null but floats
            if not confounded and isinstance(outcome1, float) and isinstance(outcome2, float):
                #find the variable being changed
                for v,vals in values_of_2_points.iteritems():
                    if len(vals) > 1:
                        new_var_changed = v
                if new_var_changed == None:
                    pass
                #update the number of changes for that variable
                if variable_changed == None or new_var_changed == variable_changed:
                    number_of_changes += 1
                else:
                    number_of_changes = 2
                variable_changed = new_var_changed
                if variable_changed in cvs_variable.keys(): #ignore cvs on detector, lightbulba nd connection
                    cvs_variable[variable_changed] = max(number_of_changes,cvs_variable[variable_changed])
            else:
                number_of_changes = 1
                variable_changed = None
        prev_point = curr_point.copy()
    return cvs_variable
            
            
            
def get_cvs_per_variable_in_table(df):
    tables = set(df['Table'])
    cvs_final = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}
    for i,t in enumerate(tables):
        table = read_table(t) #converts json string to dictionary
        #since many of their tables have CVS, we want to largest sample size for a variable used in CVS
        cvs_final = {k:max(v,detect_cvs_quant_in_table(table)[k]) for k,v in cvs_final.iteritems()}
    return cvs_final

### Test CVS detector for different scenarios

In [38]:
in_1_on = '''{"1": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 1, "Absorbance": 0.04, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 1}, 	    "2": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 2, "Absorbance": 0.02, "Wavelength": 582, "Concentration": 100.0, "Laser toggle": 1}}'''
out_1_on = {'Battery voltage': 0, 'Area': 0, 'Width': 0, 'Separation': 0, 'Wavelength': 0, 'Concentration': 2}
print out_1_on == detect_cvs_quant_in_table(read_table(in_1_on))

in_1_off = '''{"1": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 1, "Absorbance": 0.04, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 0}, 	    "2": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 2, "Absorbance": 0.02, "Wavelength": 582, "Concentration": 100.0, "Laser toggle": 0}}'''
out_1_off = {'Battery voltage': 0, 'Area': 0, 'Width': 0, 'Separation': 0, 'Wavelength': 0, 'Concentration': 2}
print out_1_off == detect_cvs_quant_in_table(read_table(in_1_off))

in_2_on = '''{"1": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 1, "Absorbance": 0.04, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 1}, 	    "2": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 2, "Absorbance": 0.02, "Wavelength": 582, "Concentration": 100.0, "Laser toggle": 1},
    "3": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1, "trialNumber": 3, "Absorbance": 0.04, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 1},
    "4": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 2, "trialNumber": 4, "Absorbance": 0.02, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 1}}'''
out_2_on = {'Battery voltage': 0, 'Area': 0, 'Width': 2, 'Separation': 0, 'Wavelength': 0, 'Concentration': 2}
print out_2_on == detect_cvs_quant_in_table(read_table(in_2_on))

in_1_on_off = '''{"1": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 1, "Absorbance": 0.04, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 0}, 	    "2": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 2, "Absorbance": 0.02, "Wavelength": 582, "Concentration": 100.0, "Laser toggle": 1}}'''
out_1_on_off = {'Battery voltage': 0, 'Area': 0, 'Width': 0, 'Separation': 0, 'Wavelength': 0, 'Concentration': 0}
print out_1_on_off == detect_cvs_quant_in_table(read_table(in_1_on_off))

True
True
True
True


In [27]:
from utils_read_parsing import *
for studentid in ["22340167"]:
    print studentid
    for sim in ['beers','capacitor']:
        parsed_file = find_student_log_file(sim,studentid)
        df = prep_parsing_data(parsed_file)
        cvs = get_cvs_per_variable_in_table(df)
        print cvs
        for var, n_samples in cvs.iteritems():
            if n_samples >0:
                print var, n_samples
    #     plot(df,to_plot_caps,family_name_to_code,function_to_use,colors)


22340167
{'Battery voltage': 0, 'Area': 0, 'Width': 23, 'Separation': 0, 'Wavelength': 11, 'Concentration': 20}
Width 23
Wavelength 11
Concentration 20
{'Battery voltage': 11, 'Area': 4, 'Width': 0, 'Separation': 7, 'Wavelength': 0, 'Concentration': 0}
Battery voltage 11
Area 4
Separation 7


### In graph
First we extract all of their tables at all time points, keep only points in the tables that are visible in the graph, analyze them for CVS and find the maximum number of values of a variable that was part of a control variable strategy instance.
For now, use of this skill relies on the following rules (evaluated per variable):
* more than records where only the variable was changed are in the graph
* battery connected (caps)
* laser is on (beers)


In [4]:
def detect_cvs_quant_in_graph(points):
    cvs_variable = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}

    values = get_values_per_variable(points)
    confounded = pts_are_confounded(values)
    #TO DO : ADD LASER ON AND BATTERY CONNECTED
    if not confounded:
        #find the variable being changed
        for v,vals in values.iteritems():
            if len(vals) > 1:
                cvs_variable[v] = len(vals)
    return cvs_variable
            
            
            
def get_cvs_per_variable_in_graph(df):
    tables = set(df['Table'])
    cvs_final = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}
    for i,t in enumerate(tables):
        table = read_table(t) #converts json string to dictionary
        pts = get_pts(table,in_graph=True) #grab all points
#         print pts
        cvs_final = {k:max(v,detect_cvs_quant_in_graph(pts)[k]) for k,v in cvs_final.iteritems()}
#         print cvs_final
#         print '\n'
    return cvs_final

## Let's run the detector on all students...
Since we don't yet have a metadata file to pull the right log data per student (and ingore student id that were test runs or unusable student data) were going to run the detector on all student ids have exactl one log file for each sims (~150).

In [6]:
from utils_read_parsing import *
def get_ids_with_exactly_one_of_each_logs():
    df_beers = get_latest_parsing_report('beers')
    df_caps = get_latest_parsing_report('capacitor')

    ids_beers = list(df_beers['studentid'].apply(str))
    ids_caps = list(df_caps['studentid'].apply(str))
    all_ids = ids_beers + ids_caps

    ids_by_count = {}
    for i in set(all_ids):
        count = all_ids.count(i)
        if count not in ids_by_count.keys():
            ids_by_count[count] = [i]
        else:
            ids_by_count[count].append(i)

    beers_only, caps_only, both_sims = [],[],[]
    for i in ids_by_count[2]:
        if i in ids_beers and i in ids_caps: both_sims.append(i)
    print "There are {0} ids with exactly one log for each sim".format(len(both_sims))
    return both_sims

In [7]:
ids = get_ids_with_exactly_one_of_each_logs()

Captured beers parsing report that was parsed on 2017-11-24_14.45.34
Captured capacitor parsing report that was parsed on 2017-11-08_14.55.27
There are 151 ids with exactly one log for each sim


In [8]:
table_cvs_df = pd.DataFrame({'student id':ids, 'Battery voltage':0, 'Area':0, 'Separation':0,'Wavelength':0,'Width':0, 'Concentration':0})
table_cvs_df =  table_cvs_df.set_index('student id')

for i,studentid in enumerate(ids):
    for sim in ['beers','capacitor']:
        print studentid,sim
        parsed_file = find_student_log_file(sim,studentid)
        df = prep_parsing_data(parsed_file)
        if not df.empty:
            try:
                cvs = get_cvs_per_variable_in_table(df)
                for var, n_samples in cvs.iteritems():
                    if n_samples >0:
                        table_cvs_df.loc[studentid][var] = n_samples
            except:
                print "FAILED"
                pass

14055131 beers


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


14055131 capacitor
15554169 beers
15554169 capacitor
14302168 beers
14302168 capacitor
19089138 beers
19089138 capacitor
13310139 beers
The parsed file has no user events, only model events. No dataframe prepared.
13310139 capacitor
15055169 beers
15055169 capacitor
15188167 beers
15188167 capacitor
41947147 beers
41947147 capacitor
19344143 beers
19344143 capacitor
61406163 beers
61406163 capacitor
12907149 beers
12907149 capacitor
18527162 beers
18527162 capacitor
18922151 beers
18922151 capacitor
11777163 beers
11777163 capacitor
19416160 beers
19416160 capacitor
19933165 beers
19933165 capacitor
13945160 beers
13945160 capacitor
15160164 beers
15160164 capacitor
14261165 beers
14261165 capacitor
19618321 beers
19618321 capacitor
10708152 beers
10708152 capacitor
11095146 beers
11095146 capacitor
17112137 beers
17112137 capacitor
60000168 beers
60000168 capacitor
17655165 beers
17655165 capacitor
19562150 beers
19562150 capacitor
12406161 beers
12406161 capacitor
19018165 beers
1901

In [None]:
table_cvs_df.head()

In [None]:
plt = table_cvs_df[['Wavelength','Width','Concentration']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,0.8))

In [None]:
plt = table_cvs_df[['Battery voltage','Area','Separation']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,1))

In [None]:
graph_cvs_df = pd.DataFrame({'student id':ids, 'Battery voltage':0, 'Area':0, 'Separation':0,'Wavelength':0,'Width':0, 'Concentration':0})
graph_cvs_df =  graph_cvs_df.set_index('student id')

for i,studentid in enumerate(ids):
    for sim in ['beers','capacitor']:
        parsed_file = find_student_log_file(sim,studentid)
        df = prep_parsing_data(parsed_file)
        if not df.empty:
            try:
                cvs = get_cvs_per_variable_in_graph(df)
                for var, n_samples in cvs.iteritems():
                    if n_samples >0:
                        graph_cvs_df.loc[studentid][var] = n_samples
            except:
                pass

In [None]:
graph_cvs_df.head()

In [None]:
plt = graph_cvs_df[['Wavelength','Width','Concentration']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,0.8))

In [None]:
plt = graph_cvs_df[['Battery voltage','Area','Separation']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,1))

In [None]:
graph_cvs_df.head()

In [None]:
def binarize(number):
    if number >0:
        return 1
    else:
        return 0

graph_cvs_df2 = graph_cvs_df
for c in graph_cvs_df.columns:
    graph_cvs_df2[c] = graph_cvs_df.apply(lambda row: binarize(row[c]), axis=1)
    
table_cvs_df2 = table_cvs_df
for c in table_cvs_df.columns:
    table_cvs_df2[c] = table_cvs_df.apply(lambda row: binarize(row[c]), axis=1)

In [None]:
graph_cvs_df2['sum'] = graph_cvs_df2.sum(axis=1)
table_cvs_df2['sum'] = table_cvs_df2.sum(axis=1)

In [None]:
graph_cvs_df2.head()