# About skills_detector.ipynb

This notebook detects the use of skills, such as the control variable strategy (CVS), in student log files.
This is a work in progress :)

In [None]:
%load_ext autoreload
%autoreload 1
%aimport utils_timeline_viz
from utils_timeline_viz import *
%matplotlib inline
# Plot defaults for the whole notebook: ggplot style sheet and a 25 x 15 default figure size.
# `matplotlib` is not imported in this cell; it presumably comes from the star import above.
matplotlib.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 25, 15
from matplotlib.backends.backend_pdf import PdfPages

## Let's detect use of different skills in 2 students
We are going to load the activity of 2 students: one who clearly uses a quantitative version of CVS in Beers sim and one who does not do CVS in Caps sim. We analyze their tables to detect use of CVS.
### First we load their log data

In [None]:
# %aimport utils_timeline_viz
# studentid = '17931169'
# #main, cdw = 231
# parsed_file = find_student_log_file("capacitor",studentid)#,date='2017-03-21_17.28.42')
# df1 = prep_parsing_data(parsed_file)
# plot(df1,to_plot_caps,family_name_to_code,function_to_use,colors)

In [None]:
# studentid = '16317166'
# #main, cdw = 231
# parsed_file = find_student_log_file("beers",studentid,date='2017-03-21_17.28.42')
# df2 = prep_parsing_data(parsed_file)
# plot(df2,to_plot_beers,family_name_to_code,function_to_use,colors)

## Quantitative CVS
### In table
First we extract all of their tables at all time points, analyze them for CVS and find the maximum number of values of a variable that was part of a control variable strategy instance.
For now, use of this skill relies on the following production rules (evaluated per variable):
* at least 2 successive records were captured in which only that variable was changed
* the outcome values of the two data points in each successive pair were different (and both were non-null floats)


In [None]:
def get_outcome_values(pts):
    """Return the outcome measurement of every data point, position by position.

    A point's outcome is its "Charge" value if that key is present, otherwise
    its "Absorbance" value. A point that has neither key contributes ``None``
    so that the returned list always has the same length as ``pts`` and
    ``result[i]`` always belongs to ``pts[i]``.

    Args:
        pts: iterable of dict-like data points.

    Returns:
        list with one entry per point: the outcome value, or None when the
        point carries no outcome key.
    """
    outcomes = []
    for pt in pts:
        if "Charge" in pt:
            outcomes.append(pt["Charge"])
        elif "Absorbance" in pt:
            outcomes.append(pt["Absorbance"])
        else:
            # Keep the slot: callers index the result by point position.
            outcomes.append(None)
    return outcomes

def _trial_sort_key(key):
    """Sort key that orders numeric-looking trial keys by number, not as text."""
    try:
        return (0, int(key))
    except (TypeError, ValueError):
        # Non-numeric keys keep their natural ordering, after the numeric ones.
        return (1, key)


def detect_cvs_quant_in_table(table):
    """Find, per variable, the longest run of successive records that varies
    only that variable while the outcome changes.

    Records are visited in trial order. A pair of successive records extends
    the current run when `pts_are_confounded` reports the pair as not
    confounded and the two outcomes (see `get_outcome_values`) are different
    floats. The run restarts at 2 when the varied variable switches, and is
    reset by any pair that fails those checks or in which no variable varies.

    Args:
        table: dict mapping a trial identifier to a data point (a dict of
            variable name -> value). Keys that look like integers (including
            numeric strings) are ordered numerically.

    Returns:
        dict mapping each of the six tracked variables to the largest number
        of records seen in one run for that variable (0 when none was found).
        Runs on any other variable are not recorded.
    """
    cvs_variable = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}

    variable_changed = None
    number_of_changes = 1
    prev_point = None
    # Data points in order of trial number, which is synonymous with order of
    # capture: points moved around in the table keep the same trial number.
    # A plain sort would put the string key "10" before "2".
    for datapoint in sorted(table, key=_trial_sort_key):
        curr_point = table[datapoint]
        if prev_point:
            values_of_2_points = get_values_per_variable([prev_point,curr_point])
            confounded = pts_are_confounded(values_of_2_points)
            outcomes = get_outcome_values([prev_point,curr_point])
            if len(outcomes) == 2:
                outcome1, outcome2 = outcomes
            else:
                # At least one point has no outcome: the pair cannot count.
                outcome1 = outcome2 = None

            # Only one variable may change, and the outcomes must be two
            # different, non-null floats.
            valid_pair = (not confounded
                          and outcome1 != outcome2
                          and isinstance(outcome1, float)
                          and isinstance(outcome2, float))

            # Determined afresh for every pair so a variable found for an
            # earlier pair can never be carried over.
            new_var_changed = None
            if valid_pair:
                for v, vals in values_of_2_points.items():
                    if len(vals) > 1:
                        new_var_changed = v

            if new_var_changed is None:
                # Invalid pair, or nothing varied: the run is broken.
                number_of_changes = 1
                variable_changed = None
            else:
                if variable_changed is None or new_var_changed == variable_changed:
                    number_of_changes += 1
                else:
                    # A different variable starts a new run of two records.
                    number_of_changes = 2
                variable_changed = new_var_changed
                # Ignore CVS on detector, lightbulb and connection.
                if variable_changed in cvs_variable:
                    cvs_variable[variable_changed] = max(number_of_changes,
                                                         cvs_variable[variable_changed])
        prev_point = curr_point.copy()
    return cvs_variable
            
            
            
def get_cvs_per_variable_in_table(df):
    """Return, per variable, the best table-based CVS count over all tables in ``df``.

    Every distinct value of the 'Table' column is parsed with `read_table`,
    printed, and analysed with `detect_cvs_quant_in_table`; the per-variable
    maximum over all of those tables is kept.

    Args:
        df: DataFrame with a 'Table' column holding serialized tables.

    Returns:
        dict mapping each of the six tracked variables to its maximum count
        (0 when no table showed CVS for it).
    """
    cvs_final = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}
    for t in set(df['Table']):
        table = read_table(t) #converts json string to dictionary
        print("\n\n TABLE %s" % (table,))
        # Analyse each table once, instead of once per tracked variable.
        table_cvs = detect_cvs_quant_in_table(table)
        cvs_final = {k: max(v, table_cvs[k]) for k, v in cvs_final.items()}
    return cvs_final

In [None]:
# Quick check: run the table-based CVS detector on one student's "beers" log
# and print the per-variable result.
for studentid in ["18922151"]:
    print studentid
    parsed_file = find_student_log_file("beers",studentid)
    df = prep_parsing_data(parsed_file)
    cvs = get_cvs_per_variable_in_table(df)
    print cvs

In [None]:
# Table-based CVS counts for the two example students.
# NOTE(review): as far as this notebook shows, df1 and df2 are only assigned in
# the commented-out loading cells, so this cell raises NameError on a fresh
# kernel unless those assignments are restored -- confirm.
print get_cvs_per_variable_in_table(df1)
print get_cvs_per_variable_in_table(df2)

In [None]:
def detect_cvs_quant_in_graph(points):
    """Count the distinct values per variable among ``points`` when they are unconfounded.

    If `pts_are_confounded` flags the value sets as confounded, every tracked
    variable keeps a count of 0. Otherwise each variable with more than one
    value is given the number of values it takes.

    Args:
        points: data points passed on to `get_values_per_variable`.

    Returns:
        dict with the six tracked variables (default 0), plus an entry for any
        other variable that takes more than one value.
    """
    tracked_variables = ("Battery voltage", "Area", "Separation",
                         "Width", "Wavelength", "Concentration")
    counts = {}
    for name in tracked_variables:
        counts[name] = 0

    values_by_variable = get_values_per_variable(points)
    if pts_are_confounded(values_by_variable):
        return counts

    # Record how many values each varied variable takes.
    for name, observed in values_by_variable.items():
        n_values = len(observed)
        if n_values > 1:
            counts[name] = n_values
    return counts
            
            
            
def get_cvs_per_variable_in_graph(df):
    """Return, per variable, the best graph-based CVS count over all tables in ``df``.

    Every distinct value of the 'Table' column is parsed with `read_table`,
    its points are fetched with ``get_pts(table, in_graph=True)`` and analysed
    with `detect_cvs_quant_in_graph`; the per-variable maximum is kept.

    Args:
        df: DataFrame with a 'Table' column holding serialized tables.

    Returns:
        dict mapping each of the six tracked variables to its maximum count
        (0 when no table state showed CVS for it).
    """
    cvs_final = {"Battery voltage":0,
                        "Area":0,
                        "Separation":0,
                        "Width":0,
                        "Wavelength":0,
                        "Concentration":0}
    for t in set(df['Table']):
        table = read_table(t) #converts json string to dictionary
        # presumably only the points currently shown in the graph -- confirm in get_pts
        pts = get_pts(table,in_graph=True)
        # Analyse each set of points once, instead of once per tracked variable.
        graph_cvs = detect_cvs_quant_in_graph(pts)
        cvs_final = {k: max(v, graph_cvs[k]) for k, v in cvs_final.items()}
    return cvs_final

In [None]:
# Graph-based CVS counts for the two example students.
# NOTE(review): as far as this notebook shows, df1 and df2 are only assigned in
# the commented-out loading cells, so this cell raises NameError on a fresh
# kernel unless those assignments are restored -- confirm.
print get_cvs_per_variable_in_graph(df1)
print get_cvs_per_variable_in_graph(df2)

## Let's run the detector on all students...
Since we don't yet have a metadata file to pull the right log data per student (and to ignore student ids that were test runs or unusable student data), we're going to run the detector on all student ids that have exactly one log file for each sim (~150).

In [None]:
from utils_read_parsing import *
def get_ids_with_exactly_one_of_each_logs():
    """Return the student ids that have exactly one log in each of the two sims.

    The ids are read, as strings, from the 'studentid' column of the latest
    parsing reports for 'beers' and 'capacitor'. The number of matching ids
    is printed.

    Returns:
        list of id strings; empty when no id qualifies.
    """
    from collections import Counter

    df_beers = get_latest_parsing_report('beers')
    df_caps = get_latest_parsing_report('capacitor')

    ids_beers = list(df_beers['studentid'].apply(str))
    ids_caps = list(df_caps['studentid'].apply(str))

    # One counting pass per sim instead of list.count() for every id.
    beers_counts = Counter(ids_beers)
    caps_counts = Counter(ids_caps)

    # A Counter gives 0 for unseen ids, so ids missing from one sim drop out
    # and an empty result needs no special case.
    both_sims = [i for i in set(ids_beers + ids_caps)
                 if beers_counts[i] == 1 and caps_counts[i] == 1]
    print("There are {0} ids with exactly one log for each sim".format(len(both_sims)))
    return both_sims

In [None]:
# Student ids to analyse: those with exactly one log file per sim.
ids = get_ids_with_exactly_one_of_each_logs()

In [None]:
# One row per student, one column per tracked variable, holding the
# table-based CVS count found for that student (0 = none detected).
table_cvs_df = pd.DataFrame({'student id':ids, 'Battery voltage':0, 'Area':0, 'Separation':0,'Wavelength':0,'Width':0, 'Concentration':0})
table_cvs_df =  table_cvs_df.set_index('student id')

for studentid in ids:
    for sim in ['beers','capacitor']:
        print("%s %s" % (studentid, sim))
        parsed_file = find_student_log_file(sim,studentid)
        df = prep_parsing_data(parsed_file)
        if not df.empty:
            try:
                cvs = get_cvs_per_variable_in_table(df)
                for var, n_samples in cvs.items():
                    if n_samples >0:
                        # Single .loc call: chained .loc[row][col] assignment
                        # may write to a temporary copy and be lost.
                        table_cvs_df.loc[studentid, var] = n_samples
            except Exception as err:
                # Best effort: skip this log but report why it failed.
                # Catching Exception (not everything) keeps Ctrl-C working.
                print("FAILED %s %s: %r" % (studentid, sim, err))

In [None]:
# Preview the first rows of the table-based CVS counts.
table_cvs_df.head()

In [None]:
# Normalized histogram of the table-based counts for Wavelength, Width and Concentration.
# NOTE(review): rebinding `plt` may shadow a pyplot alias brought in by a star import -- confirm.
plt = table_cvs_df[['Wavelength','Width','Concentration']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,0.8))

In [None]:
# Normalized histogram of the table-based counts for Battery voltage, Area and Separation.
# NOTE(review): rebinding `plt` may shadow a pyplot alias brought in by a star import -- confirm.
plt = table_cvs_df[['Battery voltage','Area','Separation']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,1))

In [None]:
# One row per student, one column per tracked variable, holding the
# graph-based CVS count found for that student (0 = none detected).
graph_cvs_df = pd.DataFrame({'student id':ids, 'Battery voltage':0, 'Area':0, 'Separation':0,'Wavelength':0,'Width':0, 'Concentration':0})
graph_cvs_df =  graph_cvs_df.set_index('student id')

for studentid in ids:
    for sim in ['beers','capacitor']:
        parsed_file = find_student_log_file(sim,studentid)
        df = prep_parsing_data(parsed_file)
        if not df.empty:
            try:
                cvs = get_cvs_per_variable_in_graph(df)
                for var, n_samples in cvs.items():
                    if n_samples >0:
                        # Single .loc call: chained .loc[row][col] assignment
                        # may write to a temporary copy and be lost.
                        graph_cvs_df.loc[studentid, var] = n_samples
            except Exception as err:
                # Best effort: skip this log, but no longer silently.
                # Catching Exception (not everything) keeps Ctrl-C working.
                print("FAILED %s %s: %r" % (studentid, sim, err))

In [None]:
# Preview the first rows of the graph-based CVS counts.
graph_cvs_df.head()

In [None]:
# Normalized histogram of the graph-based counts for Wavelength, Width and Concentration.
# NOTE(review): rebinding `plt` may shadow a pyplot alias brought in by a star import -- confirm.
plt = graph_cvs_df[['Wavelength','Width','Concentration']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,0.8))

In [None]:
# Normalized histogram of the graph-based counts for Battery voltage, Area and Separation.
# NOTE(review): rebinding `plt` may shadow a pyplot alias brought in by a star import -- confirm.
plt = graph_cvs_df[['Battery voltage','Area','Separation']].plot.hist(alpha=0.7,bins=range(30),normed =True,ylim=(0,1))

In [None]:
# Show the first rows of graph_cvs_df again (same call as the earlier preview cell).
graph_cvs_df.head()

In [None]:
def binarize(number):
    """Turn a count into a 0/1 flag: 1 when strictly positive, otherwise 0."""
    return 1 if number > 0 else 0

# Binarized (0/1) versions of the count tables. They are built on copies:
# a plain assignment would only alias the frame, so the loop would overwrite
# the original counts in graph_cvs_df / table_cvs_df as well.
graph_cvs_df2 = graph_cvs_df.copy()
for c in graph_cvs_df.columns:
    graph_cvs_df2[c] = graph_cvs_df[c].apply(binarize)

table_cvs_df2 = table_cvs_df.copy()
for c in table_cvs_df.columns:
    table_cvs_df2[c] = table_cvs_df[c].apply(binarize)

In [None]:
# Row-wise total over the variable columns. An existing 'sum' column is left
# out of the total, so re-running this cell does not add the previous total
# into the new one.
graph_cvs_df2['sum'] = graph_cvs_df2.drop('sum', axis=1, errors='ignore').sum(axis=1)
table_cvs_df2['sum'] = table_cvs_df2.drop('sum', axis=1, errors='ignore').sum(axis=1)

In [None]:
# Preview the first rows of graph_cvs_df2, including its 'sum' column.
graph_cvs_df2.head()