# About activity_descriptors_detector.ipynb

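This notebook detects different aspects of students' activities (duration, number of actions, exploration time, use of specific components) from the simulation log files.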
This is a work in progress :)

In [19]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di

# This line will hide code by default when the notebook is exported as HTML:
# when the page has no `body.notebook_app` element, the injected script toggles
# the visibility of the code input areas and of the cell prompts.
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

In [20]:
# Autoreload mode 1: modules registered with %aimport are reloaded before each cell runs.
%load_ext autoreload
%autoreload 1
%aimport utils_timeline_viz
# NOTE(review): matplotlib, plt, np and pd are used in this notebook without an explicit
# import; presumably they are re-exported by these star imports - confirm.
from utils_timeline_viz import *
from utils_read_parsing import *
from tabulate import tabulate
%matplotlib inline
# Default style and size applied to the figures drawn below.
matplotlib.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 10,6

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Grabbing log data and building a generic skills detector

In [21]:
# Parsed log files of every student, keyed by simulation name.
log_files_per_sim = {
    sim: get_parsed_log_files_per_student_for_sim(sim)
    for sim in ['beers','capacitor']
}

The file beers_log_files_per_student.txt has been unpickled and loaded
The file capacitor_log_files_per_student.txt has been unpickled and loaded


In [22]:
# Students are the keys of the beers log files; `order` is looked up by student id further down.
beers_log_files = log_files_per_sim['beers']
students = beers_log_files.keys()
activity_order_column = get_student_metadata()['activity order']
order = activity_order_column.to_dict()

In [23]:
def detect_skill(log_files_per_sim,students,initial_value,update_function,get_use_function,ignore_model_events=True,sims=('beers','capacitor')):
    '''
    This function is a generic skills detector.

    For every sim and student, each log file is parsed with prep_parsing_data,
    get_use_function is applied to every non-empty resulting dataframe, and the
    results are combined one log file at a time, starting from initial_value,
    through update_function([use so far, use in this log file]).

    Args:
        log_files_per_sim (dict): dictionary by sim and student of log files,
            i.e. log_files_per_sim[sim][sid] is an iterable of log files
        students (iterable): all students to run the detector on
        initial_value (): False for binary usage, 0 for number of times students did something, etc...
            It is also the result for a student whose log files are all empty.
        update_function (function): takes a two-item list [use so far, use in the next log file]
            and combines them using max, sum, min, etc...
        get_use_function (function): takes a dataframe and returns what it detected
        ignore_model_events (bool): passed to prep_parsing_data as remove_model_events
        sims (iterable): names of the sims to run the detector on
            (defaults to the two sims that used to be hard-coded)

    Returns:
        skill_use (dict): a dictionary by sim by student of use of skill
    '''
    # students is walked once per sim, so materialise it in case it is a generator
    students = list(students)
    skill_use = {}

    for sim in sims:
        skill_use[sim] = {}
        for sid in students:
            use = initial_value

            for log in log_files_per_sim[sim][sid]:
                df = prep_parsing_data(log,remove_model_events=ignore_model_events)
                # empty dataframes are skipped so get_use_function never sees an empty frame
                if not df.empty:
                    use = update_function([use, get_use_function(df)])

            skill_use[sim][sid] = use

    return skill_use

# Investigation begins
## Activity duration and user actions

In [41]:
# Overwrites the `students` list so that the detectors called with it run on a single student.
students = [10866146] #for testing purposes

In [25]:
def get_activity_duration(df):
    '''Return the last value of the 'Time' column once df is sorted by 'Time'.'''
    time_sorted = df.sort_values(by='Time')
    timestamps = list(time_sorted['Time'])
    return timestamps[-1]
# Per student and sim: the last time stamp of each log file, summed over the student's log files.
duration = detect_skill(log_files_per_sim, students,
                        initial_value=0,
                        update_function=sum,
                        get_use_function=get_activity_duration,
                        ignore_model_events=False)
print(duration)

{'beers': {10866146: 1045.0640000000001}, 'capacitor': {10866146: 851.68599999999992}}


In [42]:
def get_number_actions(df):
    '''
    Count the rows of df whose 'Event' is neither 'dragged' nor 'dragEnded',
    so that a drag counts as one action (only its dragStarted event is kept).
    '''
    events = df['Event']
    drag_follow_up = (events=='dragged')|(events=='dragEnded')
    return len(df[~drag_follow_up])
# Per student and sim: number of actions, summed over the student's log files (model events removed).
nactions = detect_skill(log_files_per_sim, students,
                        initial_value=0,
                        update_function=sum,
                        get_use_function=get_number_actions,
                        ignore_model_events=True)
print(nactions)

{'beers': {10866146: 274}, 'capacitor': {10866146: 61}}


## Exploration measures

In [88]:
def get_time_second_record(df):
    '''
    Return the time of the second 'recording data' event (in time order).
    When df holds fewer than two such events, return its last time stamp instead.
    '''
    is_record = df['Event']=='recording data'
    record_times = sorted(df[is_record]['Time'])
    if len(record_times) < 2:
        # not enough records: fall back to the last time stamp of the log
        return list(df.sort_values(by='Time')['Time'])[-1]
    return record_times[1]
# Per student and sim: the largest value of get_time_second_record over the student's log files.
second_record = detect_skill(log_files_per_sim, students,
                             initial_value=0,
                             update_function=max,
                             get_use_function=get_time_second_record,
                             ignore_model_events=False)
print(second_record)

{'beers': {10866146: 551.63499999999999}, 'capacitor': {10866146: 778.72899999999993}}


## Uses of different components

In [91]:
def get_use_concentration_plus_minus(df):
    '''Return 1 when 'solutionControls' appears in the 'Item' column of df, 0 otherwise.'''
    distinct_items = set(df['Item'])
    return int('solutionControls' in distinct_items)
# Per student and sim: 1 if 'solutionControls' shows up in any of the student's log files, else 0.
concentration_weird_controls = detect_skill(log_files_per_sim, students,
                                            initial_value=0,
                                            update_function=max,
                                            get_use_function=get_use_concentration_plus_minus,
                                            ignore_model_events=True)
print(concentration_weird_controls)

{'beers': {10866146: 1}, 'capacitor': {10866146: 0}}


In [93]:
def get_use_restore(df):
    '''Return 1 when df holds a 'Restoring sim state from trial' event, 0 otherwise.'''
    events = df['Event'].tolist()
    restored = 'Restoring sim state from trial' in events
    return int(restored)
# Per student and sim: 1 if a restore event shows up in any of the student's log files, else 0.
use_restore = detect_skill(log_files_per_sim, students,
                           initial_value=0,
                           update_function=max,
                           get_use_function=get_use_restore,
                           ignore_model_events=True)
print(use_restore)

{'beers': {10866146: 1}, 'capacitor': {10866146: 0}}


# Storing findings in dataframe

In [89]:
# Sim codes used in the 'sim' column and the sim names used as keys by the detectors.
SIM_NAME_BY_CODE = {'B':'beers','C':'capacitor'}

def _measure_per_row(frame, measure):
    '''Return measure[sim name][sid] for every row of frame, in row order.'''
    return [measure[SIM_NAME_BY_CODE[code]][sid] for sid, code in zip(frame['sid'], frame['sim'])]

# One row per (student, sim). ignore_index gives every row its own index label;
# a plain concat would repeat the labels of the two per-sim frames.
df = pd.concat([pd.DataFrame({'sid':students,'sim':'C'}),pd.DataFrame({'sid':students,'sim':'B'})], ignore_index=True)
df['order'] = [order[sid] for sid in df['sid']]
df['duration'] = _measure_per_row(df, duration)
df['nactions'] = _measure_per_row(df, nactions)
df['time_second_record'] = _measure_per_row(df, second_record)
df

Unnamed: 0,sid,sim,order,duration,nactions,time_second_record
0,10866146,C,CL,851.686,61,778.729
0,10866146,B,CL,1045.064,274,551.635


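# Plotting findings (UPDATE ME)
**TODO:** the cells in this section still read `results_light` and `results_caps`, which are not defined in the cells above; they need to be updated before they can be run.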
## How long are activities?

In [None]:
fig, ax = plt.subplots()
# Bin the light durations first, then reuse the same bin edges for caps.
a_heights, a_bins = np.histogram(results_light['activity duration'],bins=15)
b_heights, b_bins = np.histogram(results_caps['activity duration'], bins=a_bins)

# Bars are a third of a bin wide so both sims sit side by side inside each bin.
width = (a_bins[1] - a_bins[0])/3

# The bars need labels, otherwise the legend has nothing to show.
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue', label='light')
ax.bar(b_bins[:-1]+width, b_heights, width=width, facecolor='violet', label='caps')
ax.set_xlabel('activity duration')
ax.set_ylabel('count')
ax.legend()
print(a_bins)

In [None]:
# Median, mean and standard deviation of the activity duration divided by 60, light first, then caps.
for results in (results_light, results_caps):
    durations = results['activity duration']
    print('%s %s %s' % (np.median(durations)/60, np.mean(durations)/60, np.std(durations)/60))

## How long do students explore in each sim?
### Overall

In [None]:
fig, ax = plt.subplots()
# Bin the light durations first, then reuse the same bin edges for caps.
a_heights, a_bins = np.histogram(results_light['exploration duration'],bins=15)
b_heights, b_bins = np.histogram(results_caps['exploration duration'], bins=a_bins)

# Bars are a third of a bin wide so both sims sit side by side inside each bin.
width = (a_bins[1] - a_bins[0])/3

# The bars need labels, otherwise the legend has nothing to show.
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue', label='light')
ax.bar(b_bins[:-1]+width, b_heights, width=width, facecolor='violet', label='caps')
ax.set_xlabel('exploration duration')
ax.set_ylabel('count')
ax.legend()
print(a_bins)

### By order

In [None]:
fig, ax = plt.subplots()
# Each frame is filtered with its OWN 'activity order' column; the caps frame used to be
# filtered with a mask built from the light frame.
# The bin edges come from the first histogram and are reused for the other three.
a_heights, a_bins = np.histogram(results_light[results_light['activity order']=='CL']['exploration duration'],bins=12)
b_heights, b_bins = np.histogram(results_light[results_light['activity order']=='LC']['exploration duration'], bins=a_bins)
c_heights, c_bins = np.histogram(results_caps[results_caps['activity order']=='CL']['exploration duration'], bins=a_bins)
d_heights, d_bins = np.histogram(results_caps[results_caps['activity order']=='LC']['exploration duration'], bins=a_bins)

# Bars are a sixth of a bin wide: two light bars, an empty slot, then two caps bars.
width = (a_bins[1] - a_bins[0])/6

ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue',hatch="//",label='light CL')
ax.bar(b_bins[:-1]+width, b_heights, width=width, facecolor='cornflowerblue',label='light LC')
ax.bar(c_bins[:-1]+width*3, c_heights, width=width, facecolor='violet',label='caps CL')
ax.bar(d_bins[:-1]+width*4, d_heights, width=width, facecolor='violet',hatch="//",label='caps LC')
ax.set_xlabel('exploration duration')
ax.set_ylabel('count')
ax.legend()
print(a_bins)

In [None]:
from scipy.stats import mannwhitneyu

def _exploration_for_order(results, activity_order):
    '''Return the 'exploration duration' values of the rows of results with the given 'activity order'.'''
    return results[results['activity order']==activity_order]['exploration duration'].values

def _mean_std(values):
    '''Format values as "<mean>+-<std>".'''
    return str(np.mean(values))+'+-'+str(np.std(values))

l_CL = _exploration_for_order(results_light, 'CL')
l_LC = _exploration_for_order(results_light, 'LC')
c_CL = _exploration_for_order(results_caps, 'CL')
c_LC = _exploration_for_order(results_caps, 'LC')

# One header row, then one row per sim comparing the two activity orders.
p = [['','CL','LC','mannwhitney U','p-value']]
m,pv = mannwhitneyu(l_CL,l_LC)
p.append(['light',_mean_std(l_CL),_mean_std(l_LC),m,pv])
m,pv = mannwhitneyu(c_CL,c_LC)
p.append(['caps',_mean_std(c_CL),_mean_std(c_LC),m,pv])
print(tabulate(p))