In [1]:
import pickle
import nltk
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from nltk.collocations import *
from nltk.metrics import TrigramAssocMeasures 
from nltk.corpus import stopwords
from sklearn.svm import SVC

from graphing import *
from constants import *
from utilities import *
from preprocess import *

ImportError: No module named 'nltk'

- Preliminaries: load the data and calculate sentiments

In [None]:
# Load the merged survey results. The context manager closes the file handle
# even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open('merged_results.pickle', 'rb') as results_file:
    merged_results = pickle.load(results_file)

# A vector is valid only when none of its components is NaN.
merged_results['Valid Vector'] = merged_results['Skip Thought Vector'].apply(lambda x: ~np.isnan(x).any())
# Keep the rejected rows around for inspection, then continue with valid rows only.
removed_results = merged_results[~merged_results['Valid Vector']]
merged_results = merged_results[merged_results['Valid Vector']]
labeled_results = merged_results[merged_results['Categorical Tag'] != 'no tag']
# A later cell adds a column to q1_results, so take an independent copy instead
# of a slice of merged_results.
q1_results = merged_results[merged_results['Question'] == Q1].copy()
q1_labeled_results = labeled_results[labeled_results['Question'] == Q1]

In [None]:
# Fit an SVM on the labeled Q1 responses (skip-thought vector -> categorical tag)
# and use it to predict a tag for every Q1 response.
q1_features = np.array(q1_labeled_results['Skip Thought Vector'].tolist())
q1_labels = np.array(q1_labeled_results['Categorical Tag'])
clf = SVC()
clf.fit(q1_features, q1_labels)
predictions = clf.predict(list(q1_results['Skip Thought Vector']))
# assign() builds a new frame rather than writing into a filtered slice, which
# avoids SettingWithCopyWarning and keeps this cell safe to re-run.
q1_results = q1_results.assign(sentiment = predictions)
q2_results = merged_results[merged_results['Question'] == Q2]

- Find common phrases in the responses
    - results: data to be used; should be a pandas DataFrame with an 'Answer' column
    - use_trigrams: if true, find trigram, if false, find bigrams
    - freq_filter: only includes phrases that occur at least this many times in the text and appear in at least this many responses
    - result_count: the number of phrases to return
    - ignore_stops: if true, filter out any n-grams containing fewer than 2 words that are non-stop words longer than 2 characters
    - window_size: how close together the words of an n-gram must be; values of 2 or less fall back to the finder's default, so the phrases are consecutive words

In [None]:
def get_common_phrases(results, use_trigrams = True, freq_filter = 3, result_count = 10, ignore_stops = False, window_size = 0):
    """Return the top n-grams, ranked by PMI, found in the 'Answer' column of ``results``.

    Parameters
    ----------
    results : DataFrame with an 'Answer' column of strings.
    use_trigrams : if True look for trigrams, otherwise bigrams.
    freq_filter : minimum number of occurrences of an n-gram in the text, and
        minimum number of responses that must contain it as a literal substring.
    result_count : maximum number of n-grams to return.
    ignore_stops : if True, drop n-grams with fewer than 2 words that are
        neither English stop words nor shorter than 3 characters.
    window_size : passed to the collocation finder only when greater than 2;
        otherwise the finder's default window is used.

    Returns
    -------
    list of tuples of words, best PMI first.
    """
    answers = results['Answer']
    # A sentinel token after every response lets us discard n-grams that
    # straddle two different responses.
    words = [w for answer in answers for w in answer.split() + ['BREAK']]
    measures = TrigramAssocMeasures() if use_trigrams else nltk.collocations.BigramAssocMeasures()
    finder_cls = TrigramCollocationFinder if use_trigrams else BigramCollocationFinder
    finder = finder_cls.from_words(words, window_size = window_size) if window_size > 2 else finder_cls.from_words(words)

    # Run the cheap filters first so the per-n-gram pandas scan below only sees
    # the candidates that are left. Each filter only removes n-grams, so the
    # order does not affect which ones survive.
    finder.apply_word_filter(lambda w: w == 'BREAK')
    finder.apply_freq_filter(freq_filter)
    if ignore_stops:
        # Set membership is O(1); the raw stop-word list would be scanned per word.
        ignored_words = set(nltk.corpus.stopwords.words('english'))
        finder.apply_ngram_filter(
            lambda *ngram: sum(1 for w in ngram if w not in ignored_words and len(w) > 2) < 2
        )

    def in_too_few_responses(*ngram):
        # regex = False: the phrase is matched literally, so tokens containing
        # characters such as '(', '?' or '.' neither raise nor mis-match.
        return answers.str.contains(' '.join(ngram), regex = False).sum() < freq_filter

    finder.apply_ngram_filter(in_too_few_responses)
    return finder.nbest(measures.pmi, result_count)


- as written, finds the highest-PMI bigrams in the question two responses and prints them in a LaTeX-friendly way

In [None]:
# Mine the bigrams from the Q2 responses and print one LaTeX table row per
# phrase: "<phrase>& <number of responses containing it>\\".
q2_responses = merged_results[merged_results['Question'] == Q2]
phrases = get_common_phrases(q2_responses, use_trigrams = False, ignore_stops = True)
phrase_strings = [' '.join(phrase) for phrase in phrases]
for phrase_string in phrase_strings:
    # Count within the same Q2 responses the phrases were mined from, and match
    # the phrase literally (regex = False) so punctuation in a token cannot
    # raise or change what is matched.
    response_count = q2_responses['Answer'].str.contains(phrase_string, regex = False).sum()
    print(f"{phrase_string}& {response_count}\\\\")

- finds the top 3 trigrams by PMI in each exercise's responses and prints them in a LaTeX-friendly way

In [None]:
def print_latex_table(count):
    """Print a LaTeX table body with the top ``count`` trigrams of each problem.

    The first row holds the problem names; each following row holds the PMI
    rank and, for every problem in PROBLEMS, the trigram at that rank. A cell is
    left empty when a problem has fewer than ``count`` trigrams.
    """
    print('PMI Rank&', end = '')
    for problem in PROBLEMS:
        # NOTE(review): the row terminator is tied to 'ps4'; this assumes 'ps4'
        # is the last entry of PROBLEMS -- confirm.
        end_char = "\\\\\n" if problem == 'ps4' else '&'
        print(problem, end =end_char)

    # The phrases of a problem do not depend on the rank being printed, so mine
    # them once per problem instead of once per table cell.
    phrases_by_problem = {
        problem: get_common_phrases(q2_results[q2_results['Problem'] == problem], use_trigrams = True, ignore_stops = True, freq_filter = 3, window_size = 3, result_count = count)
        for problem in PROBLEMS
    }

    for k in range(count):
        print(k+1, end = "&")
        for problem in PROBLEMS:
            phrases = phrases_by_problem[problem]
            end_char = "\\\\\n" if problem == 'ps4' else '&'
            # Fewer phrases than requested ranks: emit an empty cell rather
            # than raising IndexError.
            cell = ' '.join(phrases[k]) if k < len(phrases) else ''
            print(cell, end = end_char)

print_latex_table(3)

- same as above but prints in a less LaTeX-friendly way

In [None]:
# For every problem, list its top three trigrams as a numbered plain-text list.
for problem in PROBLEMS:
    problem_responses = q2_results[q2_results['Problem'] == problem]
    phrases = get_common_phrases(
        problem_responses,
        use_trigrams = True,
        ignore_stops = True,
        freq_filter = 3,
        window_size = 3,
        result_count = 3,
    )
    print(problem)
    for rank, phrase in enumerate(phrases, start = 1):
        print(f"{rank}. {' '.join(phrase)}")
    print('\n')

In [None]:
def preprocess_results(results):
    """Clean a frame of responses and return the rows worth analysing.

    Steps, printing the number of rows before the first step and after each filter:
      1. strip whitespace from 'Answer' and drop rows whose answer is in
         EXCLUDED_ANSWERS;
      2. keep only rows for which is_english(answer) is true;
      3. store the answer in 'Original' and replace 'Answer' with
         normalize(answer);
      4. drop rows for which is_nonsense(answer) is true (the count of nonsense
         rows, and of those whose original text has fewer than 2 words, is
         printed);
      5. set 'Manual Tag' to 'no tag' on the remaining rows.

    The input frame is not modified; a new DataFrame is returned.
    """
    # Work on copies throughout: the caller usually passes a filtered slice, and
    # writing columns into a slice mutates shared data or triggers
    # SettingWithCopyWarning.
    results = results.copy()
    print(results.shape[0])
    results['Answer'] = results['Answer'].str.strip()
    results = results[~results['Answer'].isin(EXCLUDED_ANSWERS)].copy()
    print(results.shape[0])
    # astype(bool) keeps the mask boolean even when the frame is empty (apply on
    # an empty Series yields object dtype, which is not a valid row mask).
    results['english'] = results['Answer'].apply(is_english).astype(bool)
    results = results[results['english']].copy()
    print(results.shape[0])
    results['Original'] = results['Answer']
    results['Answer'] = results['Answer'].apply(normalize)

    results['nonsense'] = results['Answer'].apply(is_nonsense).astype(bool)
    nonsense = results[results['nonsense']]
    print(nonsense.shape[0])
    print(nonsense[nonsense['Original'].str.split().str.len()<2].shape[0])
    results = results[~results['nonsense']].copy()
    results['Manual Tag'] = 'no tag'
    print(results.shape[0])
    return results


# Fetch the problem data for DATA and pass it through merge_problem_data
# (both helpers are defined outside this cell).
unprocessed_results = get_problem_data(DATA)
complete_results = merge_problem_data(unprocessed_results)
# Keep only the rows answering `extra_question`, then clean them with
# preprocess_results, which prints row counts as it filters.
extra_results = complete_results[complete_results['Question'] == extra_question]
extra_results = preprocess_results(extra_results)
# Disabled: would additionally drop rows whose answer is exactly 'Unanswered'.
# extra_results = extra_results[~(extra_results['Answer'] == 'Unanswered')]



- prints the highest-ranking bigrams in the responses to question 7

In [None]:
# Mine up to 20 bigrams from the extra-question responses and print them twice:
# as a numbered plain-text list, then as LaTeX table rows for the top 11.
phrases = get_common_phrases(extra_results, use_trigrams = False, ignore_stops = True, freq_filter = 3, window_size = 0, result_count = 20)
phrase_strings = [' '.join(phrase) for phrase in phrases]
for rank, phrase_string in enumerate(phrase_strings, start = 1):
    print(f"{rank}. {phrase_string}")
# Slicing instead of range(11) avoids an IndexError when fewer than 11 phrases
# pass the filters.
for rank, phrase_string in enumerate(phrase_strings[:11], start = 1):
    print(f"{rank}&{phrase_string}\\\\")
print('\n')