## Correlation between task properties

### Preparation
* Get all tasks with postQ annotation
* Check response types of annotations

In [3]:
from IPython.display import Latex
import pymongo
from pymongo import MongoClient
import numpy as np
import itertools
from scipy.stats import kendalltau, chi2_contingency

# DB connections: MongoClient() is called without arguments, so the driver's
# default connection settings apply.
client = MongoClient()
db = client['db_tasklog_clean']
# Collections used by this notebook
User = db['user']
Log = db['log_chrome']

In [4]:
# Get all tasks with postQ annotation
A = {}
bloom_map = {
    'remember': 1,
    'understadn': 2,
    'apply': 3,
    'analyse': 4,
    'evaluate': 5,
    'create': 6
}
for u in User.find({}):
    Q = u['postQ']['questionnaire']
    for q in Q:
        for question in Q[q].keys():
            answers = A.get(question, [])
            a = Q[q][question]
            # process bloom's taxonomy
            if question == 'task_complexity_objective':
                a = bloom_map[a]
            answers.append(a)
            A[question] = answers
# Check for 
for q in A:
    print q, set(A[q])
        

task_stage set([1, 2, 3, 4, 5])
task_collaboration set([1, 2, 3, 4, 5])
task_difficulty_subjective set([1, 2, 3, 4, 5])
task_sailence_subjective set([1, 2, 3, 4, 5])
task_urgency_subjective set([1, 2, 3, 4, 5])
task_goals_quantity set([u'single', u'multiple'])
task_frequency set([1, 2, 3, 4, 5])
task_satisfaction set([2, 3, 4, 5])
task_knowledge_topic set([1, 2, 3, 4, 5])
task_complexity_objective set([1, 2, 3, 4, 5, 6])
task_knowledge_procedure set([1, 2, 3, 4, 5])
task_length set([1, 2, 3, 4, 5])
task_complexity_subjective set([1, 2, 3, 4, 5])


## Correlation analysis
### 5-point scales
* task_stage
* task_collaboration
* task_difficulty_subjective
* task_sailence_subjective
* task_urgency_subjective
* task_frequency
* task_satisfaction
* task_knowledge_topic
* task_knowledge_procedure
* task_complexity_subjective

### Nominal 
* task_goals_quantity (single, multiple)
* task_complexity_objective (remember, understand, apply, analyse, evaluate, create; note that "understand" is spelled `understadn` in the code's `bloom_map`)


### Correlation between 5-point scale properties
* Data points: 91
* Method: Kendall's tau rank correlation

In [5]:
keys = []
for k in A:
    if not k in ['task_goals_quantity']:
        keys.append(k)

corr = []
# latex output
print '\\begin{table}'
print'\\begin{tabular}{',
for i in range(len(keys)+1):
    print 'l',
print '}'
print '&' + ' & '.join([k.replace('task_', '').replace('_', ' ') for k in keys]) + '\\\\'
for k1 in range(len(keys)):
    print keys[k1].replace('task_', '').replace('_', ' '),
    for k2 in range(len(keys)):
        if k1 < k2:
            t, p = kendalltau(A[keys[k1]], A[keys[k2]])
            corr.append((keys[k1], keys[k2], t, p))
            print '&', 
            sign = ''
            if p < 0.05:
                print '\\textbf{', 
                sign = '*'
            if p < 0.01:
                sign = '**'
            print '%.2f'%t + sign, 
            #'(%.2f'%p + ')',
            if p < 0.05:
                print '}',
        else:
            print '& --',
    print '\\\\'        
print '\\end{tabular}'
print '\\end{table}'


\begin{table}
\begin{tabular}{ l l l l l l l l l l l l l }
&stage & collaboration & difficulty subjective & sailence subjective & urgency subjective & frequency & satisfaction & knowledge topic & complexity objective & knowledge procedure & length & complexity subjective\\
stage & -- & \textbf{ -0.14* } & -0.14 & 0.05 & \textbf{ 0.30** } & 0.01 & 0.09 & 0.04 & \textbf{ -0.15* } & 0.08 & \textbf{ -0.43** } & -0.14 \\
collaboration & -- & -- & 0.11 & \textbf{ -0.16* } & -0.12 & -0.02 & 0.01 & -0.06 & 0.05 & \textbf{ -0.33** } & \textbf{ 0.24** } & \textbf{ 0.21** } \\
difficulty subjective & -- & -- & -- & \textbf{ 0.26** } & 0.02 & -0.01 & \textbf{ -0.28** } & \textbf{ -0.20** } & \textbf{ 0.54** } & \textbf{ -0.31** } & \textbf{ 0.45** } & \textbf{ 0.66** } \\
sailence subjective & -- & -- & -- & -- & \textbf{ 0.49** } & 0.14 & 0.06 & 0.04 & \textbf{ 0.32** } & -0.05 & 0.14 & \textbf{ 0.36** } \\
urgency subjective & -- & -- & -- & -- & -- & 0.06 & 0.10 & 0.00 & 0.08 & -0.00 & -0.01 & 

### Significant correlations found between ordinal variables

In [6]:
print 'Negative correlations:'
print
for c in itertools.ifilter(lambda x: x[3]<0.05 and x[2]<0, corr):
    print c[0], '--', c[1], '\t', '%.2f'%c[2], '%.3f'%c[3]
print 
print 'Positive correlations:'
print
for c in itertools.ifilter(lambda x: x[3]<0.05 and x[2]>0, corr):
    print c[0], '--', c[1], '\t', '%.2f'%c[2], '%.3f'%c[3]

Negative correlations:

task_stage -- task_collaboration 	-0.14 0.049
task_stage -- task_complexity_objective 	-0.15 0.038
task_stage -- task_length 	-0.43 0.000
task_collaboration -- task_sailence_subjective 	-0.16 0.026
task_collaboration -- task_knowledge_procedure 	-0.33 0.000
task_difficulty_subjective -- task_satisfaction 	-0.28 0.000
task_difficulty_subjective -- task_knowledge_topic 	-0.20 0.006
task_difficulty_subjective -- task_knowledge_procedure 	-0.31 0.000
task_satisfaction -- task_complexity_objective 	-0.22 0.002
task_satisfaction -- task_complexity_subjective 	-0.30 0.000
task_knowledge_topic -- task_length 	-0.16 0.021
task_knowledge_topic -- task_complexity_subjective 	-0.22 0.002
task_complexity_objective -- task_knowledge_procedure 	-0.17 0.018
task_knowledge_procedure -- task_length 	-0.20 0.004
task_knowledge_procedure -- task_complexity_subjective 	-0.36 0.000

Positive correlations:

task_stage -- task_urgency_subjective 	0.30 0.000
task_collaboration -- task_l

### Chi-square test between nominal and ordinal variables
* Group 1-2, 3, 4-5 as different levels for the ordinal variables
* Check if each cell has 5 or more instances

In [97]:
# Answer lists for the Bloom's taxonomy question and for the
# single/multiple sub-goal question
bloom, subgoals = A['task_complexity_objective'], A['task_goals_quantity']

# group points scales into 3 levels
# 1-2, 3, 4-5
def group_points_3(points):
    """Map point-scale answers onto three levels.

    Values below 3 become 'low', exactly 3 becomes 'mid', and values above 3
    become 'high'. A value that satisfies none of the three comparisons
    (for example NaN) contributes no entry to the result.

    points -- iterable of numeric answers
    Returns a list of level names in input order.
    """
    def _level(value):
        if value < 3:
            return 'low'
        if value == 3:
            return 'mid'
        if value > 3:
            return 'high'
        return None

    return [level for level in map(_level, points) if level is not None]
    
# group points scales into 2 levels
# 1-3, 4-5
def group_points_2(points):
    """Map point-scale answers onto two levels.

    Values up to and including 3 become 'low'; values above 3 become 'high'.
    A value that satisfies neither comparison (for example NaN) contributes
    no entry to the result.

    points -- iterable of numeric answers
    Returns a list of level names in input order.
    """
    levels = []
    for value in points:
        if value <= 3:
            label = 'low'
        elif value > 3:
            label = 'high'
        else:
            # neither comparison holds: skip the value
            continue
        levels.append(label)
    return levels

# Check cell size for each variable
for k in keys:
    answers_3level = group_points_3(A[k])
    answers_2level = group_points_2(A[k])
    cells = {}
    print k
    for i in range(len(answers)):
        cells[(subgoals[i], answers_3level[i])] = cells.get((subgoals[i], answers_3level[i]), 0) + 1
    for c in sorted(cells.keys()):
        print c, cells[c]
    print
        



task_stage
(u'multiple', 'high') 25
(u'multiple', 'low') 10
(u'multiple', 'mid') 12
(u'single', 'high') 31
(u'single', 'low') 10
(u'single', 'mid') 3

task_collaboration
(u'multiple', 'high') 16
(u'multiple', 'low') 23
(u'multiple', 'mid') 8
(u'single', 'high') 7
(u'single', 'low') 37

task_difficulty_subjective
(u'multiple', 'high') 21
(u'multiple', 'low') 14
(u'multiple', 'mid') 12
(u'single', 'high') 10
(u'single', 'low') 27
(u'single', 'mid') 7

task_sailence_subjective
(u'multiple', 'high') 31
(u'multiple', 'low') 6
(u'multiple', 'mid') 10
(u'single', 'high') 26
(u'single', 'low') 11
(u'single', 'mid') 7

task_urgency_subjective
(u'multiple', 'high') 26
(u'multiple', 'low') 11
(u'multiple', 'mid') 10
(u'single', 'high') 26
(u'single', 'low') 12
(u'single', 'mid') 6

task_frequency
(u'multiple', 'high') 19
(u'multiple', 'low') 23
(u'multiple', 'mid') 5
(u'single', 'high') 15
(u'single', 'low') 18
(u'single', 'mid') 11

task_satisfaction
(u'multiple', 'high') 27
(u'multiple', 'low')

### Bloom's taxonomy as an ordinal variable

In [80]:
bloom_map = {
    'remember': 1,
    'understadn': 2,
    'apply': 3,
    'analyse': 4,
    'evaluate': 5,
    'create': 6
}
print 'cognitive complexity vs. '
for k in keys:
    a = [bloom_map[x] for x in A['task_complexity_objective']]
    t, p = kendalltau(A[k], a)
    if p < 0.05:
        print '\t', k, '%.2f'%t, '%.2f'%p

cognitive complexity vs. 
	task_stage -0.15 0.04
	task_difficulty_subjective 0.54 0.00
	task_sailence_subjective 0.32 0.00
	task_satisfaction -0.22 0.00
	task_knowledge_procedure -0.17 0.02
	task_length 0.48 0.00
	task_complexity_subjective 0.49 0.00


### Chi-square for subgoal quantity


In [95]:
# 2x2 contingency table
# single, multiple x low, high
# Loop over ordinal variables
answer_subgoal = A['task_goals_quantity']
row_map = {'single': 0, 'multiple': 1}
col_map = {'low': 0, 'high': 1}

print 'Significant effects found:'
print 'With or without sub-tasks vs. '
for k in keys:
    data = [[0, 0], [0, 0]]
    answers_2level = group_points_2(A[k])
    for i in range(len(answer_subgoal)):
        idx_row = row_map[answer_subgoal[i]]
        idx_col = col_map[answers_2level[i]]
        data[idx_row][idx_col] += 1
    chi2, p, dof, ex = chi2_contingency(np.array(data))
    
    if p < 0.05:
        print '\t', k, '%.2f'%chi2, '%.2f'%p
   

Significant effects found:
With or without sub-tasks vs. 
	task_collaboration 10.99 0.00
	task_difficulty_subjective 7.92 0.00
	task_length 24.27 0.00
	task_complexity_subjective 5.73 0.02


## Given the correlation between task properties, check if tasks can be described with fewer features (properties)
* PCA