In [59]:
import numpy as np
import pandas as pd
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import re
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

import gensim
from gensim import corpora
import pickle

In [2]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [42]:
# load file
chats = pd.read_csv('student_feedback.csv')

In [43]:
chats

Unnamed: 0,feedback_text
0,i have no idea how i’m doing so far. we’ve tur...
1,"i’d ask for more time from the TAs, or try to ..."
2,the way that they encourage lots of class disc...
3,"async is going pretty well, but not super appl..."
4,abe lincoln is the best professor i’ve ever had
5,tech is bad
6,this is a great bot
7,"pretty well, actually! i think i’m finally get..."
8,"the breakout sessions were very engaging, and ..."
9,one person dominated the full-class discussion...


In [54]:
chats=chats.feedback_text

In [55]:
chats[1]

'i’d ask for more time from the TAs, or try to bring  on some additional TAs so they can turn around assignments faster'

In [56]:
chats_tokenized = []
# preprocessing - WILL TAKE 30+ MINS

for i in range(len(chats)):
    tokens = prepare_text_for_lda(str(chats[i]))
    chats_tokenized.append(tokens)
    
chats_tokenized

[['turn',
  'assignment',
  'receive',
  'feedback',
  'super',
  'frustrate',
  'might',
  'great',
  'might',
  'failing'],
 ['bring', 'additional', 'around', 'assignment', 'fast'],
 ['encourage', 'class', 'discussion', 'really'],
 ['async', 'going', 'pretty', 'super', 'applicable', 'session', 'discussion'],
 ['lincoln', 'professor'],
 [],
 ['great'],
 ['pretty', 'actually', 'think', 'finally', 'getting', 'things'],
 ['breakout',
  'sessions',
  'engage',
  'really',
  'enjoy',
  'discussion',
  'question'],
 ['person', 'dominate', 'class', 'discussion', 'annoying', 'overall', 'class'],
 ['problem', 'really', 'helpful', 'get', 'feedback', 'actually'],
 ['async', 'superfluous', 'though', 'really', 'anything', 'talk', 'session'],
 ['could',
  'extra',
  'would',
  'really',
  'problem',
  'feedback',
  'turnaround',
  'things',
  'going',
  'though'],
 ['class', 'going', 'feeling', 'little', 'material'],
 ['session', 'lively', 'discussion', 'class', 'really'],
 ['async',
  'pretty',
  

In [57]:
s = " "
clean_text = s.join(chats)
# clean_text

'i have no idea how i’m doing so far. we’ve turned in four assignments so far, but i haven’t received feedback on ANY of them. it’s super frustrating. i might be doing great, but i might be failing. i’d ask for more time from the TAs, or try to bring  on some additional TAs so they can turn around assignments faster the way that they encourage lots of class discussion is really nice async is going pretty well, but not super applicable to the live session discussion abe lincoln is the best professor i’ve ever had tech is bad this is a great bot pretty well, actually! i think i’m finally getting the hang of things. the breakout sessions were very engaging, and i really enjoyed the discussion questions one person dominated the full-class discussion, which was annoying. but overall it was a good class they’re fine. problem sets are really helpful, but we haven’t gotten any feedback on them so far so i’m not sure how well i’m actually doing on these. async was okay. it felt superfluous thou

In [60]:
text = nlp(clean_text)
items = [x.text for x in text.ents]
Counter(items).most_common(10)

[('one', 7),
 (' ', 5),
 ('first', 4),
 ('241', 3),
 ('this week', 2),
 ('MIDS', 2),
 ('TA', 2),
 ('CNN', 2),
 ('201', 2),
 ('205', 2)]

In [61]:
# 5-10 mins
dictionary = corpora.Dictionary(chats_tokenized)
corpus = [dictionary.doc2bow(text) for text in chats_tokenized]

pickle.dump(corpus, open('corpus_feedback.pkl', 'wb'))
dictionary.save('dictionary_feedback.gensim')

In [62]:
# dictionary = load_from_text('dictionary_feedback.gensim')

In [63]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=5)
ldamodel.save('feedback_model5t_5p_4w.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.029*"session" + 0.025*"really" + 0.025*"async" + 0.016*"discussion"')
(1, '0.024*"really" + 0.024*"would" + 0.024*"extra" + 0.016*"going"')
(2, '0.030*"instructor" + 0.030*"assignment" + 0.026*"feedback" + 0.022*"grade"')
(3, '0.034*"really" + 0.029*"async" + 0.023*"feedback" + 0.020*"assignment"')
(4, '0.027*"class" + 0.022*"async" + 0.022*"grade" + 0.017*"though"')


In [64]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=10)
ldamodel.save('feedback_model5t_10p_4w.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.029*"discussion" + 0.019*"class" + 0.019*"really" + 0.015*"specific"')
(1, '0.025*"session" + 0.025*"async" + 0.020*"though" + 0.020*"actually"')
(2, '0.035*"assignment" + 0.024*"async" + 0.023*"feedback" + 0.020*"really"')
(3, '0.029*"feedback" + 0.027*"class" + 0.025*"really" + 0.023*"might"')
(4, '0.012*"super" + 0.012*"material" + 0.012*"things" + 0.012*"understanding"')


In [65]:
NUM_TOPICS = 3
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
ldamodel.save('feedback_model3t_20p_4w.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.030*"async" + 0.022*"class" + 0.020*"session" + 0.017*"really"')
(1, '0.030*"feedback" + 0.021*"grade" + 0.018*"actually" + 0.018*"really"')
(2, '0.025*"discussion" + 0.023*"really" + 0.023*"async" + 0.021*"class"')


In [67]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
#ldamodel.save('feedback_model4t_20p_6w.gensim')
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.031*"feedback" + 0.031*"assignment" + 0.027*"going" + 0.024*"grade" + 0.021*"semester" + 0.017*"instructor"')
(1, '0.042*"really" + 0.027*"though" + 0.024*"discussion" + 0.019*"question" + 0.019*"mobile" + 0.015*"grade"')
(2, '0.054*"session" + 0.053*"async" + 0.028*"really" + 0.025*"anything" + 0.024*"actually" + 0.019*"talk"')
(3, '0.023*"class" + 0.015*"async" + 0.015*"frustrate" + 0.015*"would" + 0.015*"could" + 0.015*"video"')
(4, '0.022*"issue" + 0.022*"might" + 0.017*"instructor" + 0.011*"student" + 0.011*"learn" + 0.011*"quality"')
