# About skills_detector.ipynb

This notebook detects skills in log files suchs as CVS.
This is a work in progress :)

In [1]:
%load_ext autoreload
%autoreload 1
%aimport utils_timeline_viz
from utils_timeline_viz import *
from utils_read_parsing import *
%matplotlib inline
matplotlib.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 25, 15
from matplotlib.backends.backend_pdf import PdfPages

## Quantitative CVS
### In table
First we extract all of their tables at all time points, analyze them for CVS and find the maximum number of values of a variable that was part of a control variable strategy instance.
For now, use of this skill relies on the following production rules (evaluated per variable):

* more than 2 successive records were done where only the variable was changed
* the outcome variable was different for both data points
* battery connected (caps)
* laser is on (beers)


In [2]:
def get_outcome_values(pts):
    outcomes = []
    for pt in pts:
        if "Charge" in pt.keys():
            outcomes.append(pt["Charge"])
        elif "Absorbance" in pt.keys():
            outcomes.append(pt["Absorbance"])
        else:
            print "OUTCOME MISSING"
    return outcomes

def detect_cvs_quant_in_table(table):
    cvs_variable = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}

    variable_changed = None
    new_var_changed = None
    number_of_changes = 1
    prev_point = None
    #data points in order of trial number, which is synonomous with 
    #order of capture since when you move data points around in the table,
    #they keep the same trial number
    for datapoint in sorted(table.iterkeys()): 
        curr_point = table[datapoint]
        if prev_point:
            values_of_2_points = get_values_per_variable([prev_point,curr_point])
            confounded = pts_are_confounded(values_of_2_points)
            outcomes = get_outcome_values([prev_point,curr_point])
            outcome1,outcome2 = outcomes[0],outcomes[1]
            #we check that only one variable is changed (confounded = False)
            # that the laser was not off, and that the connection wasn't to lightbulb
            # and that the outcome values are not null but floats
            if not confounded and 0 not in values_of_2_points["Laser toggle"] and "LIGHT_BULB_CONNECTED" not in values_of_2_points["Connection"] and isinstance(outcome1, float) and isinstance(outcome2, float):
                #find the variable being changed
                for v,vals in values_of_2_points.iteritems():
                    if len(vals) > 1:
                        new_var_changed = v
                if new_var_changed == None:
                    pass
                #update the number of changes for that variable
                if variable_changed == None or new_var_changed == variable_changed:
                    number_of_changes += 1
                else:
                    number_of_changes = 2
                variable_changed = new_var_changed
                if variable_changed in cvs_variable.keys(): #ignore cvs on detector, lightbulba nd connection
                    cvs_variable[variable_changed] = max(number_of_changes,cvs_variable[variable_changed])
            else:
                number_of_changes = 1
                variable_changed = None
        prev_point = curr_point.copy()
    return cvs_variable
            
            
            
def get_cvs_per_variable_in_table(df):
    tables = set(df['Table'])
    cvs_final = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}
    for i,t in enumerate(tables):
        table = read_table(t) #converts json string to dictionary
#         print table
        #since many of their tables have CVS, we want to largest sample size for a variable used in CVS
        cvs_final = {k:max(v,detect_cvs_quant_in_table(table)[k]) for k,v in cvs_final.iteritems()}
    return cvs_final

### Test CVS detector for different scenarios
If all return True, we are good to go

In [3]:
beers_in_1_on = '''{
    "1": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 1, "Absorbance": 0.04, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 1},     
    "2": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 2, "Absorbance": 0.02, "Wavelength": 582, "Concentration": 100.0, "Laser toggle": 1}}'''
beers_out_1_on = {'Battery voltage': 0, 'Area': 0, 'Width': 0, 'Separation': 0, 'Wavelength': 0, 'Concentration': 2}
# print beers_out_1_on
print beers_out_1_on == detect_cvs_quant_in_table(read_table(beers_in_1_on))

beers_in_1_off = '''{
    "1": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 1, "Absorbance": "nan", "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 0},     
    "2": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 2, "Absorbance": "nan", "Wavelength": 582, "Concentration": 100.0, "Laser toggle": 0}}'''
beers_out_1_off = {'Battery voltage': 0, 'Area': 0, 'Width': 0, 'Separation': 0, 'Wavelength': 0, 'Concentration': 0}
# print beers_out_1_off
print beers_out_1_off == detect_cvs_quant_in_table(read_table(beers_in_1_off))

beers_in_2_on = '''{
    "1": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 1, "Absorbance": 0.04, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 1},     
    "2": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 2, "Absorbance": 0.02, "Wavelength": 582, "Concentration": 100.0, "Laser toggle": 1},
    "3": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1, "trialNumber": 3, "Absorbance": 0.04, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 1},
    "4": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 2, "trialNumber": 4, "Absorbance": 0.02, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 1}}'''
beers_out_2_on = {'Battery voltage': 0, 'Area': 0, 'Width': 2, 'Separation': 0, 'Wavelength': 0, 'Concentration': 2}
# print beers_out_2_on
print beers_out_2_on == detect_cvs_quant_in_table(read_table(beers_in_2_on))

beers_in_1_on_off = '''{
    "1": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 1, "Absorbance": 0.04, "Wavelength": 582, "Concentration": 200.0, "Laser toggle": 0},     
    "2": {"Ruler location": {"y": 3.62, "x": 3.28}, "Detector location": {"y": 2.0, "x": 6.67}, "visible": false, "Width": 1.3, "trialNumber": 2, "Absorbance": 0.02, "Wavelength": 582, "Concentration": 100.0, "Laser toggle": 1}}'''
beers_out_1_on_off = {'Battery voltage': 0, 'Area': 0, 'Width': 0, 'Separation': 0, 'Wavelength': 0, 'Concentration': 0}
# print beers_out_1_on_off
print beers_out_1_on_off == detect_cvs_quant_in_table(read_table(beers_in_1_on_off))

caps_in_1_battery = '''{
    "1": {"Battery voltage": -1.5, "Area": 100.0, "Connection": "BATTERY_CONNECTED", "Capacitor voltage": -1.5, "visible": false, "Charge": -0.13, "trialNumber": 1, "Separation": 10.0}, 
    "2": {"Battery voltage": 0.3923, "Area": 100.0, "Connection": "BATTERY_CONNECTED", "Capacitor voltage": 0.3923, "visible": false, "Charge": 0.03, "trialNumber": 2, "Separation": 10.0}}'''
caps_out_1_battery = {'Battery voltage': 2, 'Area': 0, 'Width': 0, 'Separation': 0, 'Wavelength': 0, 'Concentration': 0}
# print caps_out_1_battery
print caps_out_1_battery == detect_cvs_quant_in_table(read_table(caps_in_1_battery))


caps_in_1_lightbulb = '''{
    "1": {"Battery voltage": -1.5, "Area": 100.0, "Connection": "LIGHT_BULB_CONNECTED", "Capacitor voltage": -1.5, "visible": false, "Charge": -0.13, "trialNumber": 1, "Separation": 10.0}, 
    "2": {"Battery voltage": 0.3923, "Area": 100.0, "Connection": "BATTERY_CONNECTED", "Capacitor voltage": 0.3923, "visible": false, "Charge": 0.03, "trialNumber": 2, "Separation": 10.0}}'''
caps_out_1_lightbulb = {'Battery voltage': 0, 'Area': 0, 'Width': 0, 'Separation': 0, 'Wavelength': 0, 'Concentration': 0}
# print caps_out_1_lightbulb
print caps_out_1_lightbulb == detect_cvs_quant_in_table(read_table(caps_in_1_lightbulb))

True
True
True
True
True
True


In [4]:
# from utils_read_parsing import *
# for studentid in ["22340167"]:
#     print studentid
#     for sim in ['beers','capacitor']:
#         parsed_file = find_student_log_file(sim,studentid)
#         df = prep_parsing_data(parsed_file)
#         cvs = get_cvs_per_variable_in_table(df)
#         print cvs
#         for var, n_samples in cvs.iteritems():
#             if n_samples >0:
#                 print var, n_samples
#     #     plot(df,to_plot_caps,family_name_to_code,function_to_use,colors)


### In graph
First we extract all of their tables at all time points, keep only points in the tables that are visible in the graph, analyze them for CVS and find the maximum number of values of a variable that was part of a control variable strategy instance.
For now, use of this skill relies on the following rules (evaluated per variable):
* more than records where only the variable was changed are in the graph
* battery connected (caps)
* laser is on (beers)


In [5]:
def detect_cvs_quant_in_graph(points):
    cvs_variable = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}

    values = get_values_per_variable(points)
    confounded = pts_are_confounded(values)
    #TO DO : ADD LASER ON AND BATTERY CONNECTED
    if not confounded:
        #find the variable being changed
        for v,vals in values.iteritems():
            if len(vals) > 1:
                cvs_variable[v] = len(vals)
    return cvs_variable
            
            
            
def get_cvs_per_variable_in_graph(df):
    tables = set(df['Table'])
    cvs_final = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}
    for i,t in enumerate(tables):
        table = read_table(t) #converts json string to dictionary
        pts = get_pts(table,in_graph=True) #grab all points
#         print pts
        cvs_final = {k:max(v,detect_cvs_quant_in_graph(pts)[k]) for k,v in cvs_final.iteritems()}
#         print cvs_final
#         print '\n'
    return cvs_final

## Let's run the detector on all students...
Since we don't yet have a metadata file to pull the right log data per student (and ingore student id that were test runs or unusable student data) were going to run the detector on all student ids have exactl one log file for each sims (~150).

In [6]:
ids = get_students_to_analyze()

In [8]:
table_cvs_df = pd.DataFrame({'student id':ids, 'Battery voltage':0, 'Area':0, 'Separation':0,'Wavelength':0,'Width':0, 'Concentration':0})
table_cvs_df =  table_cvs_df.set_index('student id')

for i,studentid in enumerate(ids):
    for sim in ['beers','capacitor']:
        print studentid,sim
        parsed_file = find_student_log_file(sim,studentid)
        if parsed_file:
            df = prep_parsing_data(parsed_file)
            if not df.empty:
                try:
                    cvs = get_cvs_per_variable_in_table(df)
                    for var, n_samples in cvs.iteritems():
                        if n_samples >0:
                            table_cvs_df.loc[studentid][var] = n_samples
                except:
                    print "FAILED"
                    pass
        else:
            print "No parsed file found for student {0} and sim {1}".format(studentid,sim)

11612162 beers
11612162 capacitor
13660166 beers
13660166 capacitor
41947147 beers
41947147 capacitor
64006159 beers
64006159 capacitor
15749160 beers
15749160 capacitor
The parsed file has no user events, only model events. No dataframe prepared.
16901162 beers
16901162 capacitor
51717164 beers
51717164 capacitor
10970160 beers
10970160 capacitor
17114169 beers
17114169 capacitor
17071167 beers
17071167 capacitor
17112137 beers
17112137 capacitor
19416160 beers
19416160 capacitor
11095146 beers
11095146 capacitor
11394167 beers
11394167 capacitor
89047164 beers
89047164 capacitor
13228164 beers
13228164 capacitor
11777163 beers
11777163 capacitor
13654167 beers
13654167 capacitor
86699164 beers
86699164 capacitor
15317162 beers
15317162 capacitor
14805169 beers
14805169 capacitor
10708152 beers
10708152 capacitor
15444164 beers
15444164 capacitor
13140165 beers
13140165 capacitor
15188167 beers
15188167 capacitor
10537160 beers
10537160 capacitor
18174154 beers
18174154 capacitor
1232

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


23784336 capacitor
23784336 beers
23784336 capacitor
36107164 beers
36107164 capacitor
No parsed file found for student 36107164 and sim capacitor
15072160 beers
15072160 capacitor
17931169 beers
17931169 capacitor
60000168 beers
60000168 capacitor
13578154 beers
13578154 capacitor
14261165 beers
14261165 capacitor
18527162 beers
18527162 capacitor
13407169 beers
13407169 capacitor
10420167 beers
10420167 capacitor
13193166 beers
13193166 capacitor
17502161 beers
17502161 capacitor
61406163 beers
61406163 capacitor
16606167 beers
16606167 capacitor
14302168 beers
14302168 capacitor
11384795 beers
11384795 capacitor
16136159 beers
16136159 capacitor
15496161 beers
15496161 capacitor
11997159 beers
11997159 capacitor
14088168 beers
14088168 capacitor
19933165 beers
19933165 capacitor
18866165 beers
18866165 capacitor
14002169 beers
14002169 capacitor
10375163 beers
10375163 capacitor
14293162 beers
14293162 capacitor


In [None]:
table_cvs_df.to_csv('table_cvs_results.txt', sep='\t')

In [None]:
table_cvs_df.head()

In [None]:
plt = table_cvs_df[['Wavelength','Width','Concentration']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,0.8))

In [None]:
plt = table_cvs_df[['Battery voltage','Area','Separation']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,1))

In [None]:
graph_cvs_df = pd.DataFrame({'student id':ids, 'Battery voltage':0, 'Area':0, 'Separation':0,'Wavelength':0,'Width':0, 'Concentration':0})
graph_cvs_df =  graph_cvs_df.set_index('student id')

for i,studentid in enumerate(ids):
    for sim in ['beers','capacitor']:
        parsed_file = find_student_log_file(sim,studentid)
        df = prep_parsing_data(parsed_file)
        if not df.empty:
            try:
                cvs = get_cvs_per_variable_in_graph(df)
                for var, n_samples in cvs.iteritems():
                    if n_samples >0:
                        graph_cvs_df.loc[studentid][var] = n_samples
            except:
                pass

In [None]:
graph_cvs_df.to_csv('graph_cvs_results.txt', sep='\t')

In [None]:
graph_cvs_df.head()

In [None]:
plt = graph_cvs_df[['Wavelength','Width','Concentration']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,0.8))

In [None]:
plt = graph_cvs_df[['Battery voltage','Area','Separation']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,1))

In [None]:
graph_cvs_df.head()

In [None]:
def binarize(number):
    if number >0:
        return 1
    else:
        return 0

graph_cvs_df2 = graph_cvs_df
for c in graph_cvs_df.columns:
    graph_cvs_df2[c] = graph_cvs_df.apply(lambda row: binarize(row[c]), axis=1)
    
table_cvs_df2 = table_cvs_df
for c in table_cvs_df.columns:
    table_cvs_df2[c] = table_cvs_df.apply(lambda row: binarize(row[c]), axis=1)

In [None]:
graph_cvs_df2['sum'] = graph_cvs_df2.sum(axis=1)
table_cvs_df2['sum'] = table_cvs_df2.sum(axis=1)

In [None]:
graph_cvs_df2.head()