In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt 
from sklearn.svm import SVC
from scipy.stats import mannwhitneyu
from scipy.stats import spearmanr
import seaborn as sns

from supervised_sentiment_analysis import *
from constants import *
from utilities import *

KeyboardInterrupt: 

In [None]:
# Load the pre-computed survey responses (one row per response, including its
# skip-thought embedding and, where available, a hand-assigned sentiment tag).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load 'merged_results.P' if it was produced by a trusted run of this project.
with open('merged_results.P', 'rb') as results_file:
    merged_results = pickle.load(results_file)

# A response is usable only if its embedding contains no NaN entries.
merged_results['Valid Vector'] = merged_results['Skip Thought Vector'].apply(lambda x: ~np.isnan(x).any())
removed_results = merged_results[~merged_results['Valid Vector']]
merged_results = merged_results[merged_results['Valid Vector']]

# Rows tagged 'no tag' carry no label and are excluded from the training set.
labeled_results = merged_results[merged_results['Categorical Tag'] != 'no tag']

# .copy(): later cells add columns to q1_results; without an explicit copy
# those writes target a slice of merged_results (SettingWithCopyWarning).
q1_results = merged_results[merged_results['Question'] == Q1].copy()
q1_labeled_results = labeled_results[labeled_results['Question'] == Q1]

# Training matrix (one embedding per row) and the matching label vector.
q1_features = np.array(q1_labeled_results['Skip Thought Vector'].tolist())
q1_labels = np.array(q1_labeled_results['Categorical Tag'])

In [None]:
# Re-load the raw per-problem survey data and flatten it into one frame.
unprocessed_results = get_problem_data(DATA)
complete_results = merge_problem_data(unprocessed_results)

# Keep only the answers to the "which resources did you use" question.
# .copy(): a later cell writes a helper column into resource_results; copying
# keeps that write off a slice of complete_results (SettingWithCopyWarning).
resource_results = complete_results[complete_results['Question'] == resource_question].copy()

- Attach to each question 1 response the list of resources that the same student reported using on the same problem
- The resource list comes from the student's response to question 6 (the resource question)

In [None]:
def get_resources(row):
    """Return the resources reported for the student/problem of ``row``.

    Looks up the row in the module-level ``resource_results`` whose
    'username' and 'Problem' match ``row`` and splits its 'Answer' on ', '.

    Returns:
        list of str; empty when no matching resource answer exists.

    Raises:
        AssertionError: if more than one matching answer is found.
    """
    same_student = resource_results['username'] == row['username']
    same_problem = resource_results['Problem'] == row['Problem']
    matches = resource_results.loc[same_student & same_problem, 'Answer']
    # At most one resource answer is expected per (student, problem) pair.
    assert len(matches) < 2
    return matches.iloc[0].split(', ') if len(matches) else []

q1_results['resources'] = q1_results.apply(get_resources, axis=1)

- get the number of resources used by each student

In [None]:
def real_response_count(resource_list, placeholder = 'please answer the next question.'):
    """Count the resources in ``resource_list``, ignoring the placeholder entry.

    Args:
        resource_list: list of strings produced by splitting a resource
            answer on ', '.
        placeholder: entry that does not represent a real resource.
            NOTE(review): presumably the tail of an answer option whose text
            itself contains ', ' -- confirm against the survey wording.

    Returns:
        ``len(resource_list)``, minus one if ``placeholder`` is present.
    """
    if placeholder in resource_list:
        return len(resource_list) - 1
    return len(resource_list)
    
# Number of real resources per response (placeholder entries are not counted).
q1_results['resource count'] = q1_results['resources'].map(real_response_count)

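- Predict each student's sentiment: train an SVM on the labeled question 1 responses, then classify every question 1 response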

In [None]:
# Train on the hand-labeled question 1 embeddings (fit returns the estimator),
# then predict a sentiment for every question 1 response, labeled or not.
clf = SVC().fit(q1_features, q1_labels)
predictions = clf.predict(q1_results['Skip Thought Vector'].tolist())
q1_results['sentiment'] = predictions

In [None]:
# Numeric encoding of the predicted sentiment classes.
sentiment_dict = dict(positive=1, neutral=0, negative=-1)

# Direct dict indexing on purpose: an unexpected label raises KeyError
# instead of silently producing a missing score.
q1_results['sentiment score'] = [
    sentiment_dict[sentiment] for sentiment in q1_results['sentiment']
]

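- Compute Table 5 of the paper: for each resource, compare the sentiment scores of students who used it with those of students who did not (two-sided Mann-Whitney U test)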

In [None]:
def mannwhitney_resources(results, resource):
    """Compare sentiment scores of users vs. non-users of one resource.

    Args:
        results: DataFrame with a 'resources' column (list of str per row)
            and a numeric 'sentiment score' column. It is not modified.
        resource: resource name to look for in each row's 'resources' list.

    Returns:
        list ``[n_used, mean_used, n_unused, mean_unused, p_value]`` where the
        means are rounded to 3 decimals and ``p_value`` is the two-sided
        Mann-Whitney U p-value. Means and the p-value are NaN for an empty
        group, since the test is undefined there.
    """
    # Local mask instead of a helper column, so the caller's frame is untouched.
    used_mask = results['resources'].apply(lambda used_list: resource in used_list).astype(bool)
    scores = results['sentiment score']
    used = np.array(scores[used_mask])
    unused = np.array(scores[~used_mask])

    used_mean = np.round(np.mean(used), 3) if len(used) else np.nan
    unused_mean = np.round(np.mean(unused), 3) if len(unused) else np.nan

    if len(used) and len(unused):
        p_value = mannwhitneyu(used, unused, alternative = 'two-sided')[1]
    else:
        p_value = np.nan
    return [len(used), used_mean, len(unused), unused_mean, p_value]

# One row of statistics per resource option.
mwu_stats = [
    mannwhitney_resources(q1_results, resource_answer)
    for resource_answer in resource_answers
]

# Display the per-resource table (rows follow the order of resource_answers).
pd.DataFrame(
    np.array(mwu_stats),
    columns=[
        'used count',
        'used sentiment',
        'unused count',
        'unused sentiment',
        'Mann-Whitney U p-value',
    ],
    index=resource_answers,
)

- Distribution of the number of resources used: first split by predicted sentiment, then over all students

In [None]:
# Only these two sentiment classes are plotted, side by side.
classes = ['positive', 'negative']
fig, ax = plt.subplots(figsize=(8, 8))

# One list of resource counts per plotted class, in the order of `classes`.
sentiment_resource_counts = [
    q1_results.loc[q1_results['sentiment'] == sentiment, 'resource count'].tolist()
    for sentiment in classes
]

# Bin edges 0..9, i.e. nine unit-wide bins.
bins = range(10)
arr = ax.hist(
    sentiment_resource_counts,
    bins,
    stacked=False,
    density=False,
    label=classes,
)
ax.legend(prop={'size': 16})
ax.set_xlabel('Number of resources used', size=16)
ax.set_xticks(range(10))
ax.set_ylabel('Number of students', size=16)

In [None]:
# Histogram of resource counts over all question 1 responses (bin edges 0..9).
plt.figure(figsize=(8, 8))
plt.hist(q1_results['resource count'].tolist(), bins=range(10))

In [None]:
# Whether the problem/resource table below shows fractions (True) or raw counts.
n = True

def normalize_array(a, axis=-1, order=2):
    """Scale ``a`` to unit norm along ``axis``.

    Args:
        a: array to normalize.
        axis: axis along which the norm is taken.
        order: norm order, passed to ``np.linalg.norm`` as ``ord``.

    Returns:
        ``a`` divided by its norm along ``axis``; slices whose norm is zero
        are divided by 1 and therefore returned unchanged. A 1-D input comes
        back with an extra leading axis of length 1.
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    # Avoid division by zero for all-zero slices.
    zero_norm = norms == 0
    norms[zero_norm] = 1
    divisor = np.expand_dims(norms, axis)
    return a / divisor

def get_problem_resource_counts(results, normalize = False, problems = None, resources = None):
    """Tabulate how often each resource was reported for each problem.

    Args:
        results: DataFrame with a 'Problem' column and an 'Answer' column
            holding ', '-separated resource names. It is not modified.
        normalize: if True, each count is divided by the number of rows of
            ``results`` for that problem.
        problems: row labels / problems to tabulate; defaults to the
            module-level ``PROBLEMS``.
        resources: column labels / resources to tabulate; defaults to the
            module-level ``resource_answers``.

    Returns:
        DataFrame indexed by problem with one column per resource. With
        ``normalize`` a problem that has no rows yields NaN rather than
        raising ZeroDivisionError.
    """
    if problems is None:
        problems = PROBLEMS
    if resources is None:
        resources = resource_answers

    # Split every answer once, then build one membership mask per resource;
    # all masks come from `results` itself, never from another frame.
    answer_lists = results['Answer'].apply(lambda answer: answer.split(', '))
    resource_masks = [
        answer_lists.apply(lambda answers, wanted=resource: wanted in answers).astype(bool)
        for resource in resources
    ]

    problem_resource_counts = []
    for problem in problems:
        in_problem = results['Problem'] == problem
        problem_total = int(in_problem.sum())
        problem_count = []
        for uses_resource in resource_masks:
            count = int((in_problem & uses_resource).sum())
            if normalize:
                count = count / problem_total if problem_total else np.nan
            problem_count.append(count)
        problem_resource_counts.append(problem_count)
    return pd.DataFrame(np.array(problem_resource_counts), columns = resources, index = problems)

# Build the problem-by-resource table from the resource answers; `n` (set at
# the top of this cell) is passed as the `normalize` flag.
problem_resource_df = get_problem_resource_counts(resource_results, normalize = n)
# Bare last expression so the notebook renders the table.
problem_resource_df

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
# Annotation format: integer 'd' unless `n` is truthy, then two significant digits.
fmt = 'd' if not n else '.2g'
sns.heatmap(
    problem_resource_df,
    annot=True,
    fmt=fmt,
    square=True,
    cmap=plt.cm.Blues,
    ax=ax,
)

- Analyses relating how challenging students found the course to the number of resources they used

In [None]:
# Answers to the "how challenging was this" question, for all students.
challenge_results = complete_results[complete_results['Question'] == challenge_question]

def get_challenge(row):
    """Return the challenge answer for the student/problem of ``row``.

    Looks up the row of the module-level ``challenge_results`` whose
    'username' and 'Problem' match ``row``.

    Returns:
        The matching 'Answer' value, or the string 'none' when there is no
        matching row.

    Raises:
        AssertionError: if more than one matching answer is found.
    """
    same_student = challenge_results['username'] == row['username']
    same_problem = challenge_results['Problem'] == row['Problem']
    matches = challenge_results.loc[same_student & same_problem, 'Answer']
    # At most one challenge answer is expected per (student, problem) pair.
    assert len(matches) < 2
    return matches.iloc[0] if len(matches) else 'none'

q1_results['challenge'] = q1_results.apply(get_challenge, axis=1)

# Score = position of the answer within challenge_responses.
# NOTE(review): list.index raises ValueError for any value missing from
# challenge_responses, including the 'none' fallback -- confirm it is listed.
q1_results['challenge score'] = [
    challenge_responses.index(answer) for answer in q1_results['challenge']
]

In [None]:
def spearmanr_challenge_resource_count(results):
    """Spearman rank correlation between challenge score and resource count.

    Args:
        results: DataFrame with 'challenge score' and 'resource count' columns.

    Returns:
        The result object of ``scipy.stats.spearmanr`` (correlation, p-value).
    """
    challenge_scores = results['challenge score'].tolist()
    resource_counts = results['resource count'].tolist()
    return spearmanr(challenge_scores, resource_counts)
# Bare last expression so the notebook displays the correlation and p-value
# computed over all question 1 responses.
spearmanr_challenge_resource_count(q1_results)

In [None]:
# Whether the challenge/resource table below shows fractions (True) or raw counts.
n = True

def get_challenge_resource_counts(results, normalize = False, responses = None, counts = range(1, 9)):
    """Tabulate resource counts against challenge responses.

    Args:
        results: DataFrame with 'challenge' and 'resource count' columns.
            It is not modified.
        normalize: if True, each cell is divided by the number of rows of
            ``results`` with that challenge response.
        responses: row labels / challenge responses to tabulate; defaults to
            the module-level ``challenge_responses``.
        counts: column labels / resource-count values to tabulate
            (default 1 through 8).

    Returns:
        DataFrame indexed by challenge response with one column per resource
        count. With ``normalize`` a response that no row has yields NaN
        rather than raising ZeroDivisionError.
    """
    if responses is None:
        responses = challenge_responses
    counts = list(counts)

    challenge_resource_counts = []
    for challenge_response in responses:
        # Boolean masks instead of a helper 'good' column written into `results`.
        with_response = results['challenge'] == challenge_response
        response_total = int(with_response.sum())
        challenge_count = []
        for k in counts:
            count = int((with_response & (results['resource count'] == k)).sum())
            if normalize:
                count = count / response_total if response_total else np.nan
            challenge_count.append(count)
        challenge_resource_counts.append(challenge_count)
    return pd.DataFrame(np.array(challenge_resource_counts), columns = counts, index = responses)

# Build the challenge-by-resource-count table for the question 1 responses;
# `n` (set at the top of this cell) is passed as the `normalize` flag.
challenge_resource_df = get_challenge_resource_counts(q1_results, normalize = n)
# Bare last expression so the notebook renders the table.
challenge_resource_df

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
# Annotation format: integer 'd' unless `n` is truthy, then two significant digits.
fmt = 'd' if not n else '.2g'
sns.heatmap(
    challenge_resource_df,
    annot=True,
    fmt=fmt,
    square=True,
    cmap=plt.cm.Blues,
    ax=ax,
)