In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn import svm

from supervised_sentiment_analysis import *
from constants import *
from utilities import *

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'merged_results.P' if it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('merged_results.P', 'rb') as results_file:
    merged_results = pickle.load(results_file)
# A vector is valid only when none of its components is NaN.
merged_results['Valid Vector'] = merged_results['Skip Thought Vector'].apply(lambda x: ~np.isnan(x).any())
# Keep the rejected rows around for inspection, then continue with valid rows only.
removed_results = merged_results[~merged_results['Valid Vector']]
merged_results = merged_results[merged_results['Valid Vector']]

In [None]:
# Rows that carry a human-assigned tag (anything other than the 'no tag' placeholder).
labeled_results = merged_results[merged_results['Categorical Tag'] != 'no tag']
# Explicit copy: later cells add columns to q1_results, and writing into a
# boolean-mask slice of merged_results triggers pandas' chained-assignment warning.
q1_results = merged_results[merged_results['Question'] == Q1].copy()
q1_labeled_results = labeled_results[labeled_results['Question'] == Q1]
# Stack the per-row vectors into one 2-D feature array and pair it with the tags.
q1_features = np.array(q1_labeled_results['Skip Thought Vector'].tolist())
q1_labels = np.array(q1_labeled_results['Categorical Tag'])

In [None]:
# Train an SVM (library defaults) on the tagged Q1 responses.
clf = svm.SVC()
clf.fit(q1_features, q1_labels)
# Predict a label for every Q1 response; this includes the tagged rows used for training.
# The input is stacked into a 2-D array the same way q1_features was built.
predictions = clf.predict(np.array(q1_results['Skip Thought Vector'].tolist()))
# assign() returns a new frame instead of writing a column into a filtered slice,
# which avoids pandas' chained-assignment warning.
q1_results = q1_results.assign(prediction=predictions)

In [None]:
def get_average_sentiment(results):
    """Summarise predicted sentiment labels as shares of all responses.

    Parameters
    ----------
    results : DataFrame or mapping
        Any object for which ``results['prediction']`` yields an iterable of
        labels. Labels other than 'positive' and 'negative' count as neutral.

    Returns
    -------
    list of float
        ``[positive, negative, neutral, net]`` where ``neutral`` is
        ``1 - positive - negative`` and ``net`` is ``positive - negative``.
        An empty input returns all zeros; it is not reported as 100% neutral.
    """
    y_pred = list(results['prediction'])
    total = len(y_pred)
    if total == 0:
        # No responses at all: every share is zero, including the neutral one.
        return [0.0, 0.0, 0.0, 0.0]

    positive_score = y_pred.count('positive') / total
    negative_score = y_pred.count('negative') / total

    return [positive_score, negative_score, 1-positive_score-negative_score, positive_score-negative_score]
# Overall [positive, negative, neutral, positive - negative] shares across all predicted Q1 responses.
get_average_sentiment(q1_results)

In [None]:
def get_sentiment_scores(results):
    """Tabulate sentiment shares per problem and per group of problems.

    Parameters
    ----------
    results : pandas.DataFrame
        Must have a 'Problem' column whose values are all one of
        'fex1', 'fex2', 'fex4', 'ps1', 'ps2', 'ps4', and the 'prediction'
        column read by ``get_average_sentiment``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per problem, then per section number ('1', '2', '4'), per
        problem family ('fex', 'ps') and 'total'. The columns hold the four
        values returned by ``get_average_sentiment``, rounded to 2 decimals.

    Raises
    ------
    KeyError
        If 'Problem' contains a value outside the six known problems.
    """
    problem_keys = {
        'fex1': 0,
        'fex2': 1,
        'fex4': 2,
        'ps1': 3,
        'ps2': 4,
        'ps4': 5,
    }
    # Keep the numeric keys in a local Series rather than writing a
    # 'problem key' column into the caller's frame. Callers often pass
    # filtered slices, where such a write triggers a chained-assignment warning.
    keys = results['Problem'].apply(lambda x: problem_keys[x])
    # Single problems, same-numbered pairs, each family, then everything.
    subsets = [[k] for k in range(6)]+[[0,3],[1,4],[2,5],[0,1,2],[3,4,5], list(range(6))]
    sentiment_scores = [
        get_average_sentiment(results[keys.isin(subset)])
        for subset in subsets
    ]

    arr = np.array(sentiment_scores)

    index = ['fex1', 'fex2', 'fex4', 'ps1', 'ps2', 'ps4', '1', '2', '4', 'fex', 'ps', 'total']
    columns = ['Positive', 'Negative', 'Neutral', 'Sentiment']
    rounded = np.round(arr, 2)
    return pd.DataFrame(rounded, columns=columns, index=index)

In [None]:
# Sentiment table (per problem and per problem group) for all Q1 responses.
get_sentiment_scores(q1_results)

In [None]:
# Word count (whitespace-separated tokens) of each answer, split by predicted label.
positive_responses, negative_responses, neutral_responses = (
    q1_results.loc[q1_results['prediction'] == label, 'Answer'].str.split().str.len()
    for label in ('positive', 'negative', 'neutral')
)

In [None]:
def plot_lengths(results, stacked = False, cutoff = 20, labels = ('positive', 'negative', 'neutral')):
    """Plot histograms of word counts, one series per sentiment class.

    Parameters
    ----------
    results : sequence of array-like
        One collection of word counts per entry in ``labels``, in the same order.
    stacked : bool, optional
        Passed to ``plt.hist``; stack the series instead of drawing them side by side.
    cutoff : int, optional
        Bin edges are ``0, 1, ..., cutoff - 1``; x ticks run up to ``cutoff``.
    labels : sequence of str, optional
        Legend labels, one per series in ``results``.

    Returns
    -------
    None
        The figure is drawn on the current pyplot state.
    """
    # NOTE(review): `plt` is not imported in this notebook's import cell;
    # presumably it arrives through one of the star imports - confirm.
    plt.figure(figsize=(10,10))
    bins = np.arange(0, cutoff, 1)
    plt.hist(results,
             stacked = stacked,
             bins = bins,
             alpha = 0.8,
             label = list(labels))
    plt.legend(prop={'size': 16})
    plt.grid(axis= 'y', alpha= 0.75)
    plt.xlabel('Number of Words', size = 18)
    plt.xticks(np.arange(0, cutoff+1, 2), size = 16)
    plt.yticks(size = 16)
    plt.ylabel('Number of Responses', size = 18)
    plt.title('Distribution of Word Count per Response', size = 18)
    # TODO: per-bar count annotations were started here but never implemented;
    # the unused bin-width computation has been removed.

# Histogram of answer word counts, one series per predicted sentiment.
plot_lengths([positive_responses, negative_responses, neutral_responses])

In [None]:
def answer_count_profile(username):
    """Profile one user's predicted labels in the global ``q1_results`` frame.

    Returns ``[number of rows, number of distinct predictions,
    number of 'negative' predictions]`` for the rows whose 'username'
    equals ``username``.
    """
    user_predictions = q1_results.loc[q1_results['username'] == username, 'prediction']
    n_answers = len(user_predictions)
    n_distinct = len(user_predictions.unique())
    n_negative = int((user_predictions == 'negative').sum())
    return [n_answers, n_distinct, n_negative]

# Attach each user's [answer count, unique count, negative count] profile to every row,
# then unpack the three slots into their own columns.
q1_results['answer count profile'] = q1_results['username'].apply(answer_count_profile)
(
    q1_results['answer count'],
    q1_results['unique count'],
    q1_results['negative count'],
) = (
    q1_results['answer count profile'].apply(lambda profile, slot=slot: profile[slot])
    for slot in range(3)
)

In [None]:
# Rows of users whose Q1 answers received more than one distinct predicted label.
swing_voter_results = q1_results.loc[q1_results['unique count'].gt(1)]
# Number of such rows (responses, not users).
swing_voter_results.shape[0]

In [None]:
# Number of users who answered more than once and always received the same predicted label.
len(
    q1_results.loc[
        q1_results['unique count'].eq(1) & q1_results['answer count'].gt(1),
        'username',
    ].unique()
)

In [None]:
# Number of users with exactly one Q1 answer.
len(q1_results.loc[q1_results['answer count'].eq(1), 'username'].unique())

In [None]:
# Sentiment table restricted to users whose answers received more than one distinct predicted label.
get_sentiment_scores(swing_voter_results)

In [None]:
# Rows of users who answered more than once and have at least one 'negative' prediction.
negative_results = q1_results.loc[
    q1_results['negative count'].gt(0) & q1_results['answer count'].gt(1)
]
# (number of rows, number of distinct users)
(len(negative_results.index), len(negative_results['username'].unique()))

In [None]:
# Sentiment table restricted to multi-answer users with at least one 'negative' prediction.
get_sentiment_scores(negative_results)

In [None]:
# Build the `results` table from DATA via the project helpers
# (presumably provided by the star imports at the top - confirm).
unprocessed_results = get_problem_data(DATA)
results = merge_problem_data(unprocessed_results)
# Quick look: shape of the distinct-username array, the column index, and the distinct 'Question' values.
results['username'].unique().shape, results.columns, results['Question'].unique()

In [None]:
# Rows of `results` grouped by section number; each section pools the
# 'fex<N>' and 'ps<N>' problems.
sections = {
    number: results[results['Problem'].isin(['fex{}'.format(number), 'ps{}'.format(number)])]
    for number in (1, 2, 4)
}
def completed_x(username, section = 4):
    """Return True if ``username`` has at least one row in ``sections[section]``.

    ``section`` must be a key of the global ``sections`` dict (default 4).
    """
    section_usernames = sections[section]['username']
    return bool((section_usernames == username).any())

In [None]:
# One row per distinct username, flagged by whether the user has any section-4 row.
# The frame is built directly so `usernames` is never bound to a bare array first.
usernames = pd.DataFrame({'username': results['username'].unique()})
usernames['completed_4'] = usernames['username'].apply(completed_x)
# Shape of the distinct-username array for section-4 problems. This expression
# sits last in the cell so its value is displayed; mid-cell it was discarded.
results[((results['Problem'] == 'fex4') | (results['Problem'] == 'ps4'))]['username'].unique().shape

In [None]:
# Flag every Q1 row by whether its user has any row in section 4, 1 and 2 respectively.
(
    q1_results['completed_4'],
    q1_results['completed_1'],
    q1_results['completed_2'],
) = (
    q1_results['username'].apply(lambda name, number=number: completed_x(name, section = number))
    for number in (4, 1, 2)
)

- Want to find some way to classify students based on whether or not they completed the course
- expectation is students who dropped out are more negative
- Students who responded to a question related to fex4 or ps4 probably did
- Students who did not may or may not have
- table represents difference in sentiment between two groups
- students who did respond to a section 4 problem (more likely to complete course) are surprisingly more negative

In [None]:
# Sentiment table of users with a section-4 row minus that of users without one.
score_differences = get_sentiment_scores(q1_results[q1_results['completed_4']]).sub(
    get_sentiment_scores(q1_results[~q1_results['completed_4']])
)
# Only the section 1 and 2 problems are shown.
score_differences.loc[['fex1', 'fex2', 'ps1', 'ps2']]

- This shows that everyone is asked at least one question in response to question 1, so I think it means everyone is asked at least one question in response to every problem?
- Anyway, this gives a rough metric for dropout rates: 48 of the 1937 students drop out between section 1 and section 2
- A lot more drop out on the way to section 4

In [None]:
# Shapes of the Q1 rows with and without the completed_4 flag.
tuple(
    q1_results[mask].shape
    for mask in (q1_results['completed_4'], ~q1_results['completed_4'])
)

In [None]:
# Shapes of the Q1 rows with and without the completed_1 flag.
tuple(
    q1_results[mask].shape
    for mask in (q1_results['completed_1'], ~q1_results['completed_1'])
)

In [None]:
# Shapes of the Q1 rows with and without the completed_2 flag.
tuple(
    q1_results[mask].shape
    for mask in (q1_results['completed_2'], ~q1_results['completed_2'])
)