In [59]:
import numpy as np
import pandas as pd
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import re
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

import gensim
from gensim import corpora
import pickle

In [2]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [69]:
# load file
chats = pd.read_csv('student_feedback.csv')

In [70]:
chats

Unnamed: 0,feedback_text
0,i have no idea how i’m doing so far. we’ve tur...
1,"i’d ask for more time from the TAs, or try to ..."
2,the way that they encourage lots of class disc...
3,"async is going pretty well, but not super appl..."
4,abe lincoln is the best professor i’ve ever had
5,tech is bad
6,this is a great bot
7,"pretty well, actually! i think i’m finally get..."
8,"the breakout sessions were very engaging, and ..."
9,one person dominated the full-class discussion...


In [71]:
chats=chats.feedback_text

In [72]:
chats[1]

'i’d ask for more time from the TAs, or try to bring  on some additional TAs so they can turn around assignments faster'

In [73]:
chats_tokenized = []
# preprocessing - WILL TAKE 30+ MINS

for i in range(len(chats)):
    tokens = prepare_text_for_lda(str(chats[i]))
    chats_tokenized.append(tokens)
    
chats_tokenized

[['turn',
  'assignment',
  'receive',
  'feedback',
  'super',
  'frustrate',
  'might',
  'great',
  'might',
  'failing'],
 ['bring', 'additional', 'around', 'assignment', 'fast'],
 ['encourage', 'class', 'discussion', 'really'],
 ['async', 'going', 'pretty', 'super', 'applicable', 'session', 'discussion'],
 ['lincoln', 'professor'],
 [],
 ['great'],
 ['pretty', 'actually', 'think', 'finally', 'getting', 'things'],
 ['breakout',
  'sessions',
  'engage',
  'really',
  'enjoy',
  'discussion',
  'question'],
 ['person', 'dominate', 'class', 'discussion', 'annoying', 'overall', 'class'],
 ['problem', 'really', 'helpful', 'get', 'feedback', 'actually'],
 ['async', 'superfluous', 'though', 'really', 'anything', 'talk', 'session'],
 ['could',
  'extra',
  'would',
  'really',
  'problem',
  'feedback',
  'turnaround',
  'things',
  'going',
  'though'],
 ['class', 'going', 'feeling', 'little', 'material'],
 ['session', 'lively', 'discussion', 'class', 'really'],
 ['async',
  'pretty',
  

In [74]:
s = " "
clean_text = s.join(chats)
# clean_text

In [75]:
text = nlp(clean_text)
items = [x.text for x in text.ents]
Counter(items).most_common(10)

[('one', 9),
 (' ', 8),
 ('first', 6),
 ('MIDS', 5),
 ('241', 3),
 ('two', 3),
 ('ML', 3),
 ('this week', 2),
 ('TA', 2),
 ('CNN', 2)]

In [76]:
# 5-10 mins
dictionary = corpora.Dictionary(chats_tokenized)
corpus = [dictionary.doc2bow(text) for text in chats_tokenized]

pickle.dump(corpus, open('corpus_feedback.pkl', 'wb'))
dictionary.save('dictionary_feedback.gensim')

In [77]:
# dictionary = load_from_text('dictionary_feedback.gensim')

In [78]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=5)
ldamodel.save('feedback_model5t_5p_4w.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.028*"async" + 0.026*"really" + 0.018*"discussion" + 0.017*"class"')
(1, '0.026*"really" + 0.022*"assignment" + 0.021*"async" + 0.020*"class"')
(2, '0.033*"async" + 0.016*"class" + 0.013*"material" + 0.012*"session"')
(3, '0.038*"class" + 0.026*"really" + 0.014*"question" + 0.012*"great"')
(4, '0.036*"assignment" + 0.028*"feedback" + 0.021*"might" + 0.016*"grade"')


In [79]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=10)
ldamodel.save('feedback_model5t_10p_4w.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.016*"material" + 0.016*"question" + 0.014*"slack" + 0.013*"might"')
(1, '0.044*"class" + 0.025*"async" + 0.018*"discussion" + 0.016*"really"')
(2, '0.030*"async" + 0.027*"really" + 0.026*"assignment" + 0.020*"class"')
(3, '0.027*"really" + 0.017*"async" + 0.014*"class" + 0.013*"great"')
(4, '0.045*"feedback" + 0.030*"semester" + 0.024*"grade" + 0.016*"assignment"')


In [80]:
NUM_TOPICS = 3
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
ldamodel.save('feedback_model3t_20p_4w.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.032*"class" + 0.029*"async" + 0.022*"session" + 0.013*"feedback"')
(1, '0.016*"async" + 0.015*"assignment" + 0.014*"material" + 0.014*"really"')
(2, '0.026*"really" + 0.020*"assignment" + 0.018*"feedback" + 0.015*"grade"')


In [85]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
#ldamodel.save('feedback_model4t_20p_6w.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.017*"really" + 0.015*"would" + 0.012*"question" + 0.012*"interest" + 0.009*"might"')
(1, '0.049*"class" + 0.032*"async" + 0.027*"really" + 0.017*"session" + 0.014*"material"')
(2, '0.019*"async" + 0.015*"question" + 0.013*"quality" + 0.013*"really" + 0.010*"pretty"')
(3, '0.027*"class" + 0.026*"assignment" + 0.018*"video" + 0.012*"async" + 0.012*"really"')
(4, '0.041*"feedback" + 0.033*"assignment" + 0.024*"grade" + 0.018*"semester" + 0.016*"instructor"')
