In [2]:
import numpy as np
import pandas as pd
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import re
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

import gensim
from gensim import corpora
from gensim.summarization.summarizer import summarize
import pickle


In [3]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

### Load training data

In [4]:
# load file
training_feedback = pd.read_csv('training_data/feedback_data_for_training.csv')

In [5]:
training_feedback

Unnamed: 0,source,feedback_text,topic,sub_topic,Unnamed: 4
0,team_generated,i have no idea how i’m doing so far. we’ve tur...,assignments,slow turnaround,
1,team_generated,"i’d ask for more time from the TAs, or try to ...",assignments,slow turnaround,
2,team_generated,the way that they encourage lots of class disc...,live session,good discussion,
3,team_generated,"async is going pretty well, but not super appl...",async,disconnected from live session,
4,team_generated,abe lincoln is the best professor i’ve ever had,instructors,,
5,team_generated,tech is bad,technology,negative,
6,team_generated,this is a great bot,technology,positive,
7,team_generated,"the breakout sessions were very engaging, and ...",live session,good discussion,
8,team_generated,one person dominated the full-class discussion...,live session,unbalanced,
9,team_generated,"they’re fine. problem sets are really helpful,...",assignments,slow turnaround,


In [6]:
topics = set(training_feedback.topic)
topics

{'assignments',
 'async',
 'instructors',
 'live session',
 'miscellaneous',
 'technology'}

### Create topic models


In [7]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

training_subset = training_feedback[(training_feedback['topic'] != 'misc')]

training_text = training_subset.feedback_text
training_targets = np.array(training_subset.topic)
training_text_clean = []
for text in training_text:
    training_text_clean.append(" ".join(prepare_text_for_lda(text)))  
training_text_clean = np.reshape(training_text_clean,(-1,))

In [8]:
training_targets

array(['assignments', 'assignments', 'live session', 'async',
       'instructors', 'technology', 'technology', 'live session',
       'live session', 'assignments', 'async', 'assignments',
       'live session', 'async', 'assignments', 'assignments',
       'live session', 'live session', 'async', 'assignments',
       'live session', 'live session', 'async', 'assignments', 'async',
       'assignments', 'live session', 'live session', 'async',
       'live session', 'async', 'technology', 'async', 'technology',
       'technology', 'technology', 'async', 'async', 'technology',
       'async', 'async', 'async', 'async', 'instructors', 'instructors',
       'assignments', 'assignments', 'technology', 'miscellaneous',
       'miscellaneous', 'async', 'miscellaneous', 'async', 'live session',
       'assignments', 'assignments', 'assignments', 'assignments',
       'assignments', 'assignments', 'assignments', 'assignments',
       'miscellaneous', 'miscellaneous', 'assignments', 'assignm

In [9]:
#train_x, test_x, train_y, test_y = model_selection.train_test_split(training_feedback['feedback_text'], 
train_x, test_x, train_y, test_y = model_selection.train_test_split(training_text_clean, 
                                                                      training_targets)

In [10]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((177,), (177,), (60,), (60,))

In [11]:
#train_x = train_x.reshape(-1, 1)
#test_x = test_x.reshape(-1, 1)
#train_y = train_y.reshape(-1, 1)
#test_y = test_y.reshape(-1, 1)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

train_x_counts = count_vect.fit_transform(train_x)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
train_x_tfidf = tfidf_transformer.fit_transform(train_x_counts)

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train_x_tfidf, train_y)

predicted = clf.predict(tfidf_transformer.transform(count_vect.transform(test_x)))
np.mean(predicted == test_y)


0.4166666666666667

In [13]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)  
             }

clf = MultinomialNB()
gs_clf = GridSearchCV(estimator=clf, param_grid=parameters, n_jobs=-1, iid=True)

gs_clf = gs_clf.fit(train_x_counts, train_y)

gs_clf.best_score_
gs_clf.best_params_

print("Optimal alpha = {}; best MNB score = {:.3}".format(gs_clf.best_params_, gs_clf.best_score_))




Optimal alpha = {'alpha': 1}; best MNB score = 0.503


In [14]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)  
             }

clf = MultinomialNB()
gs_clf = GridSearchCV(estimator=clf, param_grid=parameters, n_jobs=-1, iid=True)

gs_clf = gs_clf.fit(train_x_tfidf, train_y)

gs_clf.best_score_
gs_clf.best_params_

print("Optimal alpha = {}; best MNB score = {:.3}".format(gs_clf.best_params_, gs_clf.best_score_))

Optimal alpha = {'alpha': 0.1}; best MNB score = 0.525




In [15]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

# text_clf.fit(train_x, train_y)  
# predicted = text_clf.predict(test_x)
# np.mean(predicted == test_y) 

parameters = {
     'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
     'tfidf__use_idf': (True, False),
     'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001) ,
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)
gs_clf = gs_clf.fit(train_x, train_y)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.5309516238928004
{'clf__alpha': 0.1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


In [16]:
model_filename = 'topic_models/full_topic_model.sav'
pickle.dump(gs_clf, open(model_filename, 'wb'))

### Create subtopic models, by topic

In [50]:
def generate_dictionary_corpus(data_id, full_text):
    # going to take 5-10 minutes
    
    feedback_tokenized = []
    
    for t in full_text:
        tokens = prepare_text_for_lda(str(t))
        feedback_tokenized.append(tokens)
        
    dictionary = corpora.Dictionary(feedback_tokenized)
    dictionary_filename = "topic_dictionaries/%s_dictionary_feedback.gensim" % (data_id)
    corpus = [dictionary.doc2bow(text) for text in feedback_tokenized]
    corpus_filename = "topic_corpi/%s_corpus_feedback.pkl" % (data_id)
    
    pickle.dump(corpus, open(corpus_filename, 'wb'))
    dictionary.save(dictionary_filename)
    return dictionary, corpus

def load_dictionary_corpus(data_id):
    dictionary_filename = "topic_dictionaries/%s_dictionary_feedback.gensim" % (data_id)
    dictionary = corpora.Dictionary.load(dictionary_filename)
    corpus_filename = "topic_corpi/%s_corpus_feedback.pkl" % (data_id)
    with open(corpus_filename, 'rb') as f:
        corpus = pickle.load(f)
    if (len(dictionary) == 0):
        return
    return dictionary, corpus
    
def generate_lda(data_id, corpus, dictionary, num_topics, num_passes, num_words):
    # runtime will depend on number of passes
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_topics, id2word=dictionary, passes=num_passes)
    ldamodel_filename = 'topic_models/%s_model%st_%sp.gensim' % (data_id, str(num_topics), str(num_passes))
    ldamodel.save(ldamodel_filename)
    
    sub_topics = ldamodel.print_topics(num_words=num_words)
    for sub_topic in sub_topics:
        print(sub_topic)
        
    return ldamodel

In [49]:
for topic in topics: 
    print ("Processing", topic)
    topic_subset = training_feedback[training_feedback.topic==topic]
    topic_text = topic_subset.feedback_text
    
    dictionary, corpus = generate_dictionary_corpus(topic, topic_text)

Processing miscellaneous
Processing assignments
Processing technology
Processing async
Processing instructors
Processing live session


In [18]:
for topic in topics:
    print ("Processing", topic)
    topic_subset = training_feedback[training_feedback.topic==topic]
    topic_text = topic_subset.feedback_text
    # print(text)
    
    chats_tokenized = []
    
    for t in topic_text:
        tokens = prepare_text_for_lda(t)
        chats_tokenized.append(tokens)
        
    dictionary = corpora.Dictionary(chats_tokenized)
    dictionary_filename = "topic_dictionaries/%s_dictionary_feedback.gensim" % (str(topic))
    corpus = [dictionary.doc2bow(text) for text in chats_tokenized]
    corpus_filename = "topic_corpi/%s_corpus_feedback.pkl" % (str(topic))
    

    pickle.dump(corpus, open(corpus_filename, 'wb'))
    dictionary.save(dictionary_filename)

Processing miscellaneous
Processing assignments
Processing technology
Processing async
Processing instructors
Processing live session


In [51]:
NUM_TOPICS = 3
NUM_PASSES = 15
NUM_WORDS = 5

for topic in topics:
    print(topic)
    dictionary, corpus = load_dictionary_corpus(topic)
    generate_lda(topic, corpus, dictionary, NUM_TOPICS, NUM_PASSES, NUM_WORDS)

miscellaneous
(0, '0.050*"reading" + 0.035*"definitely" + 0.020*"class" + 0.020*"longer" + 0.020*"portion"')
(1, '0.027*"assignment" + 0.027*"material" + 0.027*"still" + 0.027*"really" + 0.027*"system"')
(2, '0.041*"light" + 0.029*"assume" + 0.029*"workload" + 0.029*"pretty" + 0.029*"anything"')
assignments
(0, '0.027*"assignment" + 0.020*"feedback" + 0.019*"project" + 0.016*"reading" + 0.013*"class"')
(1, '0.036*"feedback" + 0.035*"assignment" + 0.029*"grade" + 0.018*"instructor" + 0.015*"semester"')
(2, '0.048*"assignment" + 0.031*"feedback" + 0.030*"grade" + 0.016*"really" + 0.014*"get"')
technology
(0, '0.016*"question" + 0.016*"great" + 0.011*"reason" + 0.011*"better" + 0.011*"success"')
(1, '0.023*"mobile" + 0.017*"quality" + 0.017*"stuff" + 0.017*"except" + 0.012*"laptop"')
(2, '0.024*"class" + 0.020*"issue" + 0.017*"video" + 0.017*"would" + 0.017*"could"')
async
(0, '0.028*"async" + 0.020*"video" + 0.020*"class" + 0.020*"found" + 0.020*"design"')
(1, '0.085*"async" + 0.021*"pre

In [52]:
NUM_TOPICS = 3
NUM_PASSES = 15
NUM_WORDS = 5

for topic in topics:
    print(topic)
    dictionary, corpus = load_dictionary_corpus(topic)
    generate_lda(topic, corpus, dictionary, NUM_TOPICS, NUM_PASSES, NUM_WORDS)

miscellaneous
(0, '0.032*"light" + 0.032*"really" + 0.022*"could" + 0.022*"anything" + 0.022*"assume"')
(1, '0.048*"class" + 0.034*"system" + 0.034*"impossible" + 0.019*"still" + 0.019*"depend"')
(2, '0.040*"reading" + 0.039*"definitely" + 0.022*"longer" + 0.022*"portion" + 0.022*"series"')
assignments
(0, '0.032*"feedback" + 0.023*"grade" + 0.023*"project" + 0.022*"might" + 0.021*"instructor"')
(1, '0.025*"assignment" + 0.025*"really" + 0.022*"problem" + 0.019*"material" + 0.017*"grade"')
(2, '0.061*"assignment" + 0.043*"feedback" + 0.017*"grade" + 0.012*"class" + 0.012*"spend"')
technology
(0, '0.020*"mobile" + 0.015*"laptop" + 0.015*"except" + 0.011*"great" + 0.010*"question"')
(1, '0.026*"video" + 0.016*"specific" + 0.016*"quality" + 0.016*"great" + 0.011*"mobile"')
(2, '0.023*"class" + 0.020*"issue" + 0.015*"would" + 0.015*"instructor" + 0.015*"session"')
async
(0, '0.045*"async" + 0.039*"class" + 0.034*"session" + 0.027*"discussion" + 0.021*"really"')
(1, '0.053*"async" + 0.028*"

In [53]:
NUM_TOPICS = 3
NUM_PASSES = 25
NUM_WORDS = 3

for topic in topics:
    print(topic)
    dictionary, corpus = load_dictionary_corpus(topic)
    generate_lda(topic, corpus, dictionary, NUM_TOPICS, NUM_PASSES, NUM_WORDS)

miscellaneous
(0, '0.054*"class" + 0.036*"reading" + 0.036*"assignment"')
(1, '0.041*"light" + 0.041*"really" + 0.028*"speed"')
(2, '0.027*"system" + 0.027*"speaker" + 0.016*"around"')
assignments
(0, '0.022*"grade" + 0.022*"quite" + 0.021*"problem"')
(1, '0.048*"feedback" + 0.029*"assignment" + 0.023*"grade"')
(2, '0.051*"assignment" + 0.026*"feedback" + 0.022*"project"')
technology
(0, '0.023*"issue" + 0.014*"student" + 0.014*"record"')
(1, '0.030*"class" + 0.018*"watch" + 0.013*"video"')
(2, '0.018*"mobile" + 0.018*"great" + 0.014*"would"')
async
(0, '0.037*"class" + 0.029*"reading" + 0.029*"discussion"')
(1, '0.023*"async" + 0.021*"video" + 0.021*"watch"')
(2, '0.091*"async" + 0.030*"session" + 0.027*"really"')
instructors
(0, '0.017*"slack" + 0.013*"instructor" + 0.013*"student"')
(1, '0.018*"class" + 0.014*"sessions" + 0.014*"focus"')
(2, '0.042*"class" + 0.020*"really" + 0.015*"office"')
live session
(0, '0.035*"sessions" + 0.034*"sync" + 0.024*"material"')
(1, '0.041*"discussio

In [54]:
NUM_TOPICS = 3
NUM_PASSES = 25
NUM_WORDS = 2

for topic in topics:
    print(topic)
    dictionary, corpus = load_dictionary_corpus(topic)
    generate_lda(topic, corpus, dictionary, NUM_TOPICS, NUM_PASSES, NUM_WORDS)

miscellaneous
(0, '0.053*"really" + 0.037*"reading"')
(1, '0.034*"system" + 0.019*"still"')
(2, '0.043*"class" + 0.023*"material"')
assignments
(0, '0.044*"assignment" + 0.032*"grade"')
(1, '0.030*"assignment" + 0.026*"reading"')
(2, '0.043*"feedback" + 0.033*"assignment"')
technology
(0, '0.022*"class" + 0.022*"video"')
(1, '0.036*"issue" + 0.013*"quality"')
(2, '0.018*"except" + 0.018*"laptop"')
async
(0, '0.056*"async" + 0.042*"class"')
(1, '0.086*"async" + 0.036*"really"')
(2, '0.032*"found" + 0.032*"actually"')
instructors
(0, '0.025*"class" + 0.013*"people"')
(1, '0.021*"instructor" + 0.021*"office"')
(2, '0.041*"class" + 0.016*"explain"')
live session
(0, '0.040*"discussion" + 0.033*"session"')
(1, '0.040*"sessions" + 0.034*"class"')
(2, '0.032*"discussion" + 0.025*"really"')


### Create subtopic models, separate from topic


In [None]:
topic_subset = training_feedback[training_feedback.topic==topic]
topic_text = topic_subset.feedback_text
    
dictionary, corpus = generate_dictionary_corpus(topic, topic_text)

In [55]:
topic_text = training_feedback.feedback_text

mids_dictionary, mids_corpus = generate_dictionary_corpus("mids", topic_text)

In [56]:
NUM_TOPICS = 10
NUM_PASSES = 15
NUM_WORDS = 2

# runtime will depend on number of passes
dictionary, corpus = load_dictionary_corpus(topic)
generate_lda("mids", corpus, dictionary, NUM_TOPICS, NUM_PASSES, NUM_WORDS)

(0, '0.051*"sometimes" + 0.027*"session"')
(1, '0.061*"hands" + 0.061*"sessions"')
(2, '0.052*"sleep" + 0.052*"means"')
(3, '0.006*"concept" + 0.006*"lead"')
(4, '0.040*"schedule" + 0.040*"apply"')
(5, '0.048*"topic" + 0.032*"though"')
(6, '0.055*"sync" + 0.055*"sessions"')
(7, '0.047*"tough" + 0.047*"discussion"')
(8, '0.071*"discussion" + 0.036*"enjoy"')
(9, '0.117*"class" + 0.060*"discussion"')


<gensim.models.ldamodel.LdaModel at 0x1a29939048>

In [57]:
NUM_TOPICS = 10
NUM_PASSES = 10
NUM_WORDS = 2

# runtime will depend on number of passes
dictionary, corpus = load_dictionary_corpus(topic)
generate_lda("mids", corpus, dictionary, NUM_TOPICS, NUM_PASSES, NUM_WORDS)

(0, '0.048*"always" + 0.025*"sometimes"')
(1, '0.091*"class" + 0.047*"really"')
(2, '0.068*"discussion" + 0.068*"session"')
(3, '0.076*"sessions" + 0.076*"sync"')
(4, '0.058*"apply" + 0.031*"enjoy"')
(5, '0.085*"discussion" + 0.044*"feel"')
(6, '0.006*"class" + 0.006*"question"')
(7, '0.073*"sessions" + 0.038*"discussion"')
(8, '0.040*"concept" + 0.022*"topic"')
(9, '0.055*"session" + 0.037*"interest"')


<gensim.models.ldamodel.LdaModel at 0x1a299397b8>

In [39]:
from operator import itemgetter

def pred_sub_topic(text):
    
    # predict subtopic
    tokens = prepare_text_for_lda(str(text))
    # print(tokens)
    bow_tokens = dictionary.doc2bow(tokens)
    vector = ldamodel[bow_tokens]
    # print(vector)
    most_likely = max(vector,key=itemgetter(1))

    if (most_likely[1] < .11):
        # print("No best predictor")
        return None
    return most_likely[0]

In [43]:
for i in range(len(test_x)):
    print(pred_sub_topic(test_x[i]), ",", test_x[i])

4 , engage
5 , going homework answer session help sometimes unclear technical clear provide confirmation specific formatting expression explanation rationale write assignment
7 , really struggle weekly assignment assignment twice instructor worry understand material
None , need muuuuch slow
3 , teacher practical apply focus rather theory session active discussion format class
1 , making effective know stuff
6 , assignment nasty worthwhile instructions reading hint actual coding
2 , implementation problem project
0 , found decade article worthless personally found reading worthwhile still graduate level class something
None , skeptical concern attitude towards student
6 , right pretty light workload point assume anything could grade
None , subtitle image small laptop mobile except native browser slide
3 , instruction preparation session could otherwise spend asking interest challenge question topic
8 , reading realistically skimming
0 , expectation somewhat stifle student propensity que

In [44]:
# kmeans clustering

NUM_TOPICS = 10
NUM_PASSES = 10
NUM_WORDS = 2


from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

train_x_counts = count_vect.fit_transform(train_x)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
train_x_tfidf = tfidf_transformer.fit_transform(train_x_counts)

from sklearn.cluster import KMeans
km_model = KMeans(n_clusters=NUM_TOPICS, init='k-means++', max_iter=200, n_init=100)
km_model.fit(train_x_tfidf)

predicted = km_model.predict(tfidf_transformer.transform(count_vect.transform(test_x)))
#print(predicted)
#print(test_x)

for i in range(len(predicted)):
    print(predicted[i], ",", test_x[i])

0 , engage
0 , going homework answer session help sometimes unclear technical clear provide confirmation specific formatting expression explanation rationale write assignment
0 , really struggle weekly assignment assignment twice instructor worry understand material
0 , need muuuuch slow
8 , teacher practical apply focus rather theory session active discussion format class
0 , making effective know stuff
1 , assignment nasty worthwhile instructions reading hint actual coding
3 , implementation problem project
1 , found decade article worthless personally found reading worthwhile still graduate level class something
2 , skeptical concern attitude towards student
3 , right pretty light workload point assume anything could grade
0 , subtitle image small laptop mobile except native browser slide
9 , instruction preparation session could otherwise spend asking interest challenge question topic
1 , reading realistically skimming
4 , expectation somewhat stifle student propensity question cla

### Generate topic summaries

In [45]:
for topic in topics:
    print ("Processing", topic)
    topic_subset = training_feedback[training_feedback.topic==topic]
    topic_text = topic_subset.feedback_text
    
    s = '. '
    text = s.join(topic_text)
    text = re.sub("\.+", ".", text)
    text = re.sub("\n", "", text)
    # print(text)
    print(summarize(text))
    print('\n')

Processing miscellaneous
The week off for fall immersion seemed unexpected to both of my professors when it came to planning class material/projects/assignments.
I’m finding w203 challenging re: speed/number of concepts to consume (I really want to internalize/grasp them but feel we’re moving at light speed).
Good luck, you're gonna need it.
There's a huge amount of reading just to understand what's going on


Processing assignments
we’ve turned in four assignments so far, but i haven’t received feedback on ANY of them.
i’d ask for more time from the TAs, or try to bring  on some additional TAs so they can turn around assignments faster.
problem sets are really helpful, but we haven’t gotten any feedback on them so far so i’m not sure how well i’m actually doing on these.
i feel like we could use an extra TA - it would really help with the problem set feedback turnaround.
still haven’t gotten any feedback on assignments so far, which is frustrating.
they’re pretty straightforward, thou

lots of lively discussion, which made class go by really fast, which was nice.
it was good - it ran half an hour over, which was tough, but the discussion was fun so it didn’t seem like it took as long as it did.
it was good - it ran half an hour over, which was tough, but the discussion was fun so it didn’t seem like it took as long as it did.
live sessions have been running late recently, which has been really tough on my schedule (i’m in a different time zone, so going late means cutting into my sleep).
The problem with this, is that the instruction team has to do a TON of preparation work, and lost live session time which could otherwise be spent on asking more interesting or challenging questions about the topic.
the instruction team has to do a TON of preparation work, and lost live session time which could otherwise be spent on asking more interesting or challenging questions about the topic.
The materials are VERY synced up between different live sessions up to the second last 