In [2]:
import numpy as np
import pandas as pd
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import re
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

import spacy
# Do not call `spacy.load('en')` here: its return value was never used, it
# loads a full model for nothing, and the 'en' shortcut name cannot be loaded
# on spaCy v3+. The statistical pipeline is loaded explicitly from the
# `en_core_web_sm` package below instead.
from spacy.lang.en import English
parser = English()  # English pipeline object used by tokenize() to split text into tokens
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

import gensim
from gensim import corpora
import pickle

In [3]:
def tokenize(text):
    """Split ``text`` into normalised tokens for LDA.

    The text is run through the module-level spaCy ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every other
    token is replaced by its lower-cased form.

    Returns:
        list[str]: the normalised tokens, in document order.
    """
    def _normalise(tok):
        # URL check takes precedence over the '@' check, as in an if/elif chain.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

def get_lemma(word):
    """Return the WordNet ``morphy`` base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

# English stop words from the NLTK corpus, held as a set for fast membership
# tests in prepare_text_for_lda().
en_stop = set(stopwords.words('english'))

def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of lemmas fed to the LDA dictionary.

    Tokens come from ``tokenize``. Only tokens longer than four characters
    that are not in ``en_stop`` are kept, and each survivor is mapped through
    ``get_lemma``.

    Returns:
        list[str]: lemmatised tokens, in their original order.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]

In [4]:
# Read the feedback examples (text plus topic / sub_topic labels) used to
# build the per-topic dictionaries and corpora below.
training_feedback = pd.read_csv(
    filepath_or_buffer='training_data/feedback_data_for_training.csv',
)

In [5]:
# Preview only the first rows instead of rendering the whole frame.
training_feedback.head(10)

Unnamed: 0,source,feedback_text,topic,sub_topic
0,team_generated,i have no idea how i’m doing so far. we’ve tur...,feedback,slow turnaround
1,team_generated,"i’d ask for more time from the TAs, or try to ...",feedback,slow turnaround
2,team_generated,the way that they encourage lots of class disc...,live session,good discussion
3,team_generated,"async is going pretty well, but not super appl...",async,disconnected from live session
4,team_generated,abe lincoln is the best professor i’ve ever had,instructor,
5,team_generated,tech is bad,tech,negative
6,team_generated,this is a great bot,tech,positive
7,team_generated,"pretty well, actually! i think i’m finally get...",course material,understandable
8,team_generated,"the breakout sessions were very engaging, and ...",live session,good discussion
9,team_generated,one person dominated the full-class discussion...,live session,unbalanced


In [9]:
# Distinct values of the `topic` column. Rows with a missing topic show up
# as a NaN member of the set.
topics = set(training_feedback['topic'])
topics

{'assignments',
 'async',
 'course material',
 'feedback',
 'instructor',
 'instructors',
 'live session',
 'misc',
 nan,
 'office hours',
 'tech'}

In [37]:
# For every topic label: tokenize its feedback texts, build a gensim
# Dictionary and bag-of-words corpus, and persist both under a
# topic-prefixed file name.
for topic in topics:
    print ("Processing", topic)
    # NOTE: a NaN topic never compares equal to itself, so its subset is
    # empty and an empty dictionary/corpus pair is written for it.
    topic_subset = training_feedback[training_feedback.topic==topic]
    topic_text = topic_subset.feedback_text

    chats_tokenized = []

    for t in topic_text:
        tokens = prepare_text_for_lda(t)
        chats_tokenized.append(tokens)

    dictionary = corpora.Dictionary(chats_tokenized)
    dictionary_filename = "topic_dictionaries/%s_dictionary_feedback.gensim" % (str(topic))
    corpus = [dictionary.doc2bow(text) for text in chats_tokenized]
    corpus_filename = "topic_corpi/%s_corpus_feedback.pkl" % (str(topic))

    # Use a context manager so the pickle file is flushed and closed on
    # every iteration instead of leaking one open handle per topic.
    with open(corpus_filename, 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save(dictionary_filename)

Processing nan
Processing misc
Processing instructors
Processing feedback
Processing tech
Processing async
Processing assignments
Processing instructor
Processing office hours
Processing live session
Processing course material


In [44]:
NUM_TOPICS = 3   # number of LDA sub-topics fitted within each feedback topic
NUM_PASSES = 15  # training passes over each topic's corpus

# Fit one LDA model per feedback topic from the dictionary/corpus files
# written by the previous cell, save it, and print its sub-topics.
for topic in topics:
    print(topic)
    dictionary_filename = "topic_dictionaries/%s_dictionary_feedback.gensim" % (str(topic))
    dictionary = corpora.Dictionary.load(dictionary_filename)
    corpus_filename = "topic_corpi/%s_corpus_feedback.pkl" % (str(topic))
    # The pickle is produced by this notebook; do not point this at untrusted files.
    with open(corpus_filename, 'rb') as f:
        corpus = pickle.load(f)
    # Topics with no usable tokens (e.g. the NaN topic) have nothing to model.
    if (len(dictionary) == 0):
        continue

    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=NUM_PASSES)
    # Prefix the file name with the topic (same convention as the dictionary
    # and corpus files). Without it every topic's model is written to one
    # shared path and only the last one survives.
    ldamodel_filename = 'topic_models/%s_feedback_model%st_%sp.gensim' % (str(topic), str(NUM_TOPICS), str(NUM_PASSES))
    ldamodel.save(ldamodel_filename)

    sub_topics = ldamodel.print_topics(num_words=5)
    for sub_topic in sub_topics:
        print(sub_topic)
    
    


nan
misc
(0, '0.045*"upcoming" + 0.045*"appear" + 0.045*"understaffed" + 0.045*"badly" + 0.045*"program"')
(1, '0.049*"reading" + 0.028*"right" + 0.028*"grade" + 0.028*"light" + 0.028*"pretty"')
(2, '0.029*"system" + 0.029*"speaker" + 0.029*"still" + 0.029*"material" + 0.029*"assignment"')
instructors
(0, '0.042*"class" + 0.030*"background" + 0.030*"engage" + 0.017*"instructor" + 0.017*"great"')
(1, '0.032*"class" + 0.032*"really" + 0.022*"material" + 0.022*"intuition" + 0.022*"focus"')
(2, '0.029*"responsive" + 0.029*"people" + 0.029*"sessions" + 0.028*"instructor" + 0.017*"prefer"')
feedback
(0, '0.058*"grade" + 0.042*"feedback" + 0.037*"assignment" + 0.032*"instructor" + 0.022*"though"')
(1, '0.057*"feedback" + 0.026*"grade" + 0.021*"get" + 0.021*"assignment" + 0.020*"might"')
(2, '0.065*"feedback" + 0.034*"semester" + 0.026*"receive" + 0.026*"presentation" + 0.018*"frustrate"')
tech
(0, '0.046*"video" + 0.046*"teacher" + 0.046*"watch" + 0.026*"mobile" + 0.026*"async"')
(1, '0.046*"

In [78]:
# Fit a single LDA model and print its topics.
# NOTE(review): `load_from_text` is not defined anywhere in the visible
# notebook; confirm where it comes from (gensim exposes
# `corpora.Dictionary.load_from_text` / `corpora.Dictionary.load`).
# NOTE(review): `corpus` is not rebuilt in this cell, so it is whatever the
# kernel last assigned; confirm it matches this dictionary.
dictionary = load_from_text('dictionary_feedback.gensim')

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=NUM_PASSES)
ldamodel_filename = 'topic_models/feedback_model%st_%sp.gensim' % (str(NUM_TOPICS), str(NUM_PASSES))
ldamodel.save(ldamodel_filename)
# Use dedicated names so the global `topics` set of feedback topic labels
# (iterated by the per-topic cells) is not overwritten by the LDA output.
lda_topics = ldamodel.print_topics(num_words=4)
for lda_topic in lda_topics:
    print(lda_topic)

(0, '0.028*"async" + 0.026*"really" + 0.018*"discussion" + 0.017*"class"')
(1, '0.026*"really" + 0.022*"assignment" + 0.021*"async" + 0.020*"class"')
(2, '0.033*"async" + 0.016*"class" + 0.013*"material" + 0.012*"session"')
(3, '0.038*"class" + 0.026*"really" + 0.014*"question" + 0.012*"great"')
(4, '0.036*"assignment" + 0.028*"feedback" + 0.021*"might" + 0.016*"grade"')


In [79]:
# Experiment: 5 topics, 10 passes, 4 words shown per topic.
# Reuses the `corpus` and `dictionary` currently held by the kernel.
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=10)
ldamodel.save('feedback_model5t_10p_4w.gensim')
# Use dedicated names so the global `topics` set of feedback topic labels
# is not overwritten by the LDA output.
lda_topics = ldamodel.print_topics(num_words=4)
for lda_topic in lda_topics:
    print(lda_topic)

(0, '0.016*"material" + 0.016*"question" + 0.014*"slack" + 0.013*"might"')
(1, '0.044*"class" + 0.025*"async" + 0.018*"discussion" + 0.016*"really"')
(2, '0.030*"async" + 0.027*"really" + 0.026*"assignment" + 0.020*"class"')
(3, '0.027*"really" + 0.017*"async" + 0.014*"class" + 0.013*"great"')
(4, '0.045*"feedback" + 0.030*"semester" + 0.024*"grade" + 0.016*"assignment"')


In [80]:
# Experiment: 3 topics, 20 passes, 4 words shown per topic.
# Reuses the `corpus` and `dictionary` currently held by the kernel.
NUM_TOPICS = 3
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
ldamodel.save('feedback_model3t_20p_4w.gensim')
# Use dedicated names so the global `topics` set of feedback topic labels
# is not overwritten by the LDA output.
lda_topics = ldamodel.print_topics(num_words=4)
for lda_topic in lda_topics:
    print(lda_topic)

(0, '0.032*"class" + 0.029*"async" + 0.022*"session" + 0.013*"feedback"')
(1, '0.016*"async" + 0.015*"assignment" + 0.014*"material" + 0.014*"really"')
(2, '0.026*"really" + 0.020*"assignment" + 0.018*"feedback" + 0.015*"grade"')


In [85]:
# Experiment: 5 topics, 20 passes, 5 words shown per topic.
# Reuses the `corpus` and `dictionary` currently held by the kernel.
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
# Saving is disabled for this run. The file name below (4t / 6w) does not
# match the settings used here (5 topics, 5 words), so fix it before re-enabling.
#ldamodel.save('feedback_model4t_20p_6w.gensim')
# Use dedicated names so the global `topics` set of feedback topic labels
# is not overwritten by the LDA output.
lda_topics = ldamodel.print_topics(num_words=5)
for lda_topic in lda_topics:
    print(lda_topic)

(0, '0.017*"really" + 0.015*"would" + 0.012*"question" + 0.012*"interest" + 0.009*"might"')
(1, '0.049*"class" + 0.032*"async" + 0.027*"really" + 0.017*"session" + 0.014*"material"')
(2, '0.019*"async" + 0.015*"question" + 0.013*"quality" + 0.013*"really" + 0.010*"pretty"')
(3, '0.027*"class" + 0.026*"assignment" + 0.018*"video" + 0.012*"async" + 0.012*"really"')
(4, '0.041*"feedback" + 0.033*"assignment" + 0.024*"grade" + 0.018*"semester" + 0.016*"instructor"')
