In [1]:
import pickle
import nltk
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from nltk.collocations import *
from nltk.metrics import TrigramAssocMeasures 
from nltk.corpus import stopwords
from sklearn.svm import SVC

from graphing import *
from constants import *
from utilities import *
from preprocess import *

[nltk_data] Downloading package wordnet to
[nltk_data]     /yw_data/robert_gold/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
merged_results = pickle.load(open('merged_results.pickle', 'rb'))
merged_results['Valid Vector'] = merged_results['Skip Thought Vector'].apply(lambda x: ~np.isnan(x).any())
removed_results = merged_results[~merged_results['Valid Vector']]
merged_results = merged_results[merged_results['Valid Vector']]
labeled_results = merged_results[merged_results['Categorical Tag'] != 'no tag']
q1_results = merged_results[merged_results['Question'] == Q1]
q1_labeled_results = labeled_results[labeled_results['Question'] == Q1]

In [3]:
q1_features = np.array(q1_labeled_results['Skip Thought Vector'].tolist())
q1_labels = np.array(q1_labeled_results['Categorical Tag'])
clf = SVC()
clf.fit(q1_features, q1_labels)
predictions = clf.predict(list(q1_results['Skip Thought Vector']))
q1_results['sentiment'] = predictions
q2_results = merged_results[merged_results['Question'] == Q2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [4]:
def get_common_phrases(results, use_trigrams = True, freq_filter = 3, result_count = 10, ignore_stops = False, window_size = 0):
    responses = list(results['Answer'].apply(lambda x: x.split()))
    words = [w for tokens in responses for w in tokens + ['BREAK']]
    measures = TrigramAssocMeasures() if use_trigrams else nltk.collocations.BigramAssocMeasures()
    f = TrigramCollocationFinder if use_trigrams else BigramCollocationFinder
    finder = f.from_words(words, window_size = window_size) if window_size > 2 else f.from_words(words)
    #finder = TrigramCollocationFinder.from_words(words) if use_trigrams else BigramCollocationFinder.from_words(words)
    if ignore_stops:
        ignored_words = nltk.corpus.stopwords.words('english')
        filter_stops = lambda w: len(w) < 2 or w in ignored_words 
        filter_ngram_stops = lambda *ngram: len([w for w in ngram if w not in ignored_words and len(w) > 2]) < 2
        # finder.apply_word_filter(filter_stops)
        finder.apply_ngram_filter(filter_ngram_stops) 
    response_count_filter = lambda *ngram: results[results['Answer'].str.contains(' '.join(ngram))]['Answer'].shape[0] < freq_filter
    print_filter = lambda *ngram: print(results[results['Answer'].str.contains(' '.join(ngram))]['Answer'].shape[0])
    finder.apply_ngram_filter(response_count_filter)
    finder.apply_word_filter(lambda w: w == 'BREAK')
    finder.apply_freq_filter(freq_filter)
    return finder.nbest(measures.pmi, result_count)


In [5]:
phrases = get_common_phrases(merged_results[merged_results['Question'] == Q2], use_trigrams = False, ignore_stops = True)
phrase_strings = [' '.join(phrase) for phrase in phrases]
for phrase_string in phrase_strings:
    print(f"{phrase_string}& {merged_results[merged_results['Answer'].str.contains(phrase_string)]['Answer'].shape[0]}\\\\")

anal zed& 3\\
eric sir& 3\\
square root& 6\\
stack overflow& 5\\
truth table& 6\\
glass box& 10\\
pay attention& 6\\
critical thinking& 3\\
straight forward& 14\\
close enough& 4\\


In [6]:
def print_latex_table(count):
    print('PMI Rank&', end = '')
    for problem in PROBLEMS:
        end_char = "\\\\\n" if problem == 'ps4' else '&'
        print(problem, end =end_char)

    for k in range(count):
        print(k+1, end = "&")
        for problem in PROBLEMS:
            phrases = get_common_phrases(q2_results[q2_results['Problem'] == problem], use_trigrams = True, ignore_stops = True, freq_filter = 3, window_size = 3, result_count = count)
            end_char = "\\\\\n" if problem == 'ps4' else '&'
            print(' '.join(phrases[k]), end = end_char)

print_latex_table(3)

PMI Rank&fex1&fex2&fex4&ps1&ps2&ps4\\
1&leave to right&step by step&glass box test&line by line&into small piece&piece by piece\\
2&evaluate each part&follow the instruction&use python tutor&problem into small&an infinite loop&play game function\\
3&trial and error&from previous video&the discussion thread&problem solve process&use python tutor&this problem set\\


In [7]:
for problem in PROBLEMS:
    phrases = get_common_phrases(q2_results[q2_results['Problem'] == problem], use_trigrams = True, ignore_stops = True, freq_filter = 3, window_size = 3, result_count = 3)
    print(problem)
    [print(f"{k+1}. {' '.join(phrases[k])}") for k in range(len(phrases))]
    print('\n')

fex1
1. leave to right
2. evaluate each part
3. trial and error


fex2
1. step by step
2. follow the instruction
3. from previous video


fex4
1. glass box test
2. use python tutor
3. the discussion thread


ps1
1. line by line
2. problem into small
3. problem solve process


ps2
1. into small piece
2. an infinite loop
3. use python tutor


ps4
1. piece by piece
2. play game function
3. this problem set




In [8]:
for sentiment in SENTIMENTS:
    phrases = get_common_phrases(q1_results[q1_results['sentiment'] == sentiment], use_trigrams = False, ignore_stops = True, freq_filter = 3, window_size = 0)
    print(sentiment)
    [print(f"{k+1}. {' '.join(phrases[k])}") for k in range(len(phrases))]
    print('\n')

positive
1. look forward
2. little bit
3. well prepared
4. problem set
5. feel like
6. pretty good
7. nice exercise
8. good job
9. good work
10. interesting exercise


neutral
1. right answer
2. additional feedback


negative
1. discussion thread
2. similarity checker
3. secret number
4. even though
5. click submit
6. prior knowledge
7. helper function
8. discussion section
9. print statement
10. test case




In [9]:
def preprocess_results(results):
    print(results.shape[0])
    results['Answer'] = results['Answer'].str.strip()
    results = results[~results['Answer'].isin(EXCLUDED_ANSWERS)]
    print(results.shape[0])
    results['english'] = results['Answer'].apply(is_english)
    results = results[results['english']]
    print(results.shape[0])
    results['Original'] = results['Answer']
    results['Answer'] = results['Answer'].apply(normalize)
    
    results['nonsense'] = results['Answer'].apply(is_nonsense)
    nonsense = results[results['nonsense']]
    print(nonsense.shape[0])
    print(nonsense[nonsense['Original'].str.split().str.len()<2].shape[0])
    results = results[~results['nonsense']]
    results['Manual Tag'] = 'no tag'
    print(results.shape[0])
    return results


unprocessed_results = get_problem_data(DATA)
complete_results = merge_problem_data(unprocessed_results)
extra_results = complete_results[complete_results['Question'] == extra_question]
extra_results = preprocess_results(extra_results)
# extra_results = extra_results[~(extra_results['Answer'] == 'Unanswered')]



3614
542
540


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


107
107
433


In [10]:
phrases = get_common_phrases(extra_results, use_trigrams = False, ignore_stops = True, freq_filter = 3, window_size = 0, result_count = 20)
phrase_strings = [' '.join(phrase) for phrase in phrases]
[print(f"{k+1}. {phrase_strings[k]}") for k in range(len(phrases))]
[print(f"{k+1}&{phrase_strings[k]}\\\\") for k in range(11)]
print('\n')

1. git hub
2. stack overflow
3. function call
4. pythontutor com
5. bisection search
6. problem set
7. recommend textbook
8. text book
9. google search
10. program use
11. python tutor
12. learn python
13. previous python
14. python knowledge
15. use python
16. course textbook
17. course book
18. python book
19. python course
1&git hub\\
2&stack overflow\\
3&function call\\
4&pythontutor com\\
5&bisection search\\
6&problem set\\
7&recommend textbook\\
8&text book\\
9&google search\\
10&program use\\
11&python tutor\\




In [11]:
for phrase_string in phrase_strings:
    print(f"{phrase_string}: {extra_results[extra_results['Answer'].str.contains(phrase_string)]['Answer'].shape[0]}")

git hub: 3
stack overflow: 8
function call: 3
pythontutor com: 6
bisection search: 4
problem set: 3
recommend textbook: 3
text book: 3
google search: 5
program use: 4
python tutor: 13
learn python: 3
previous python: 4
python knowledge: 4
use python: 6
course textbook: 3
course book: 3
python book: 3
python course: 3


In [12]:
n_clusters = 10
reduced_q2_encodings = reduce_dimensions(list(q2_results['Skip Thought Vector']))
agg_labels, model = get_agg_clusters(list(q2_results['Skip Thought Vector']), n_clusters = n_clusters)


In [13]:
q2_results['Agg Label'] = agg_labels
for k in range (n_clusters):
    phrases = get_common_phrases(q2_results[q2_results['Agg Label'] == k], use_trigrams = True, ignore_stops = True, freq_filter = 3, window_size = 3, result_count = 3)
    [print(f"{k+1}. {' '.join(phrases[k])}") for k in range(len(phrases))]
    print('\n')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


1. pretty straight forward
2. glass box test
3. into small piece


1. step by step
2. follow the instruction
3. read the instruction


1. divide and conquer
2. trial and error
3. line by line




1. use python tutor
2. piece by piece
3. step by step


1. step by step
2. line of code
3. type the code




1. trail and error
2. trial and error
3. try and error






