In [6]:
import numpy as np
import pandas as pd
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import re
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

import gensim
from gensim import corpora
from gensim.summarization.summarizer import summarize
import pickle


In [7]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

### Load training data

In [8]:
# load file
training_feedback = pd.read_csv('training_data/feedback_data_for_training.csv')

In [9]:
training_feedback

Unnamed: 0,source,feedback_text,topic,sub_topic,Unnamed: 4
0,team_generated,i have no idea how i’m doing so far. we’ve tur...,assignments,slow turnaround,
1,team_generated,"i’d ask for more time from the TAs, or try to ...",assignments,slow turnaround,
2,team_generated,the way that they encourage lots of class disc...,live session,good discussion,
3,team_generated,"async is going pretty well, but not super appl...",async,disconnected from live session,
4,team_generated,abe lincoln is the best professor i’ve ever had,instructors,,
5,team_generated,tech is bad,technology,negative,
6,team_generated,this is a great bot,technology,positive,
7,team_generated,"the breakout sessions were very engaging, and ...",live session,good discussion,
8,team_generated,one person dominated the full-class discussion...,live session,unbalanced,
9,team_generated,"they’re fine. problem sets are really helpful,...",assignments,slow turnaround,


In [10]:
topics = set(training_feedback.topic)
topics

{'assignments',
 'async',
 'instructors',
 'live session',
 'miscellaneous',
 'technology'}

### Create topic models


In [11]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

training_subset = training_feedback[(training_feedback['topic'] != 'misc')]

training_text = training_subset.feedback_text
training_targets = np.array(training_subset.topic)
training_text_clean = []
for text in training_text:
    training_text_clean.append(" ".join(prepare_text_for_lda(text)))  
training_text_clean = np.reshape(training_text_clean,(-1,))

In [12]:
training_targets

array(['assignments', 'assignments', 'live session', 'async',
       'instructors', 'technology', 'technology', 'live session',
       'live session', 'assignments', 'async', 'assignments',
       'live session', 'async', 'assignments', 'assignments',
       'live session', 'live session', 'async', 'assignments',
       'live session', 'live session', 'async', 'assignments', 'async',
       'assignments', 'live session', 'live session', 'async',
       'live session', 'async', 'technology', 'async', 'technology',
       'technology', 'technology', 'async', 'async', 'technology',
       'async', 'async', 'async', 'async', 'instructors', 'instructors',
       'assignments', 'assignments', 'technology', 'miscellaneous',
       'miscellaneous', 'async', 'miscellaneous', 'async', 'live session',
       'assignments', 'assignments', 'assignments', 'assignments',
       'assignments', 'assignments', 'assignments', 'assignments',
       'miscellaneous', 'miscellaneous', 'assignments', 'assignm

In [13]:
#train_x, test_x, train_y, test_y = model_selection.train_test_split(training_feedback['feedback_text'], 
train_x, test_x, train_y, test_y = model_selection.train_test_split(training_text_clean, 
                                                                      training_targets)

In [14]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((177,), (177,), (60,), (60,))

In [15]:
#train_x = train_x.reshape(-1, 1)
#test_x = test_x.reshape(-1, 1)
#train_y = train_y.reshape(-1, 1)
#test_y = test_y.reshape(-1, 1)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

train_x_counts = count_vect.fit_transform(train_x)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
train_x_tfidf = tfidf_transformer.fit_transform(train_x_counts)

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train_x_tfidf, train_y)

predicted = clf.predict(tfidf_transformer.transform(count_vect.transform(test_x)))
np.mean(predicted == test_y)


0.38333333333333336

In [19]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)  
             }

clf = MultinomialNB()
gs_clf = GridSearchCV(estimator=clf, param_grid=parameters, n_jobs=-1, iid=True)

gs_clf = gs_clf.fit(train_x_counts, train_y)

gs_clf.best_score_
gs_clf.best_params_

print("Optimal alpha = {}; best MNB score = {:.3}".format(gs_clf.best_params_, gs_clf.best_score_))


Optimal alpha = {'alpha': 1}; best MNB score = 0.537




In [20]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)  
             }

clf = MultinomialNB()
gs_clf = GridSearchCV(estimator=clf, param_grid=parameters, n_jobs=-1, iid=True)

gs_clf = gs_clf.fit(train_x_tfidf, train_y)

gs_clf.best_score_
gs_clf.best_params_

print("Optimal alpha = {}; best MNB score = {:.3}".format(gs_clf.best_params_, gs_clf.best_score_))

Optimal alpha = {'alpha': 0.1}; best MNB score = 0.486




In [24]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

# text_clf.fit(train_x, train_y)  
# predicted = text_clf.predict(test_x)
# np.mean(predicted == test_y) 

parameters = {
     'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
     'tfidf__use_idf': (True, False),
     'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001) ,
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)
gs_clf = gs_clf.fit(train_x, train_y)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.527592424960846
{'clf__alpha': 0.1, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}


### Create subtopic models, by topic

In [25]:
for topic in topics:
    print ("Processing", topic)
    topic_subset = training_feedback[training_feedback.topic==topic]
    topic_text = topic_subset.feedback_text
    # print(text)
    
    chats_tokenized = []
    
    for t in topic_text:
        tokens = prepare_text_for_lda(t)
        chats_tokenized.append(tokens)
        
    dictionary = corpora.Dictionary(chats_tokenized)
    dictionary_filename = "topic_dictionaries/%s_dictionary_feedback.gensim" % (str(topic))
    corpus = [dictionary.doc2bow(text) for text in chats_tokenized]
    corpus_filename = "topic_corpi/%s_corpus_feedback.pkl" % (str(topic))
    

    pickle.dump(corpus, open(corpus_filename, 'wb'))
    dictionary.save(dictionary_filename)

Processing async
Processing miscellaneous
Processing assignments
Processing instructors
Processing technology
Processing live session


In [26]:
NUM_TOPICS = 3
NUM_PASSES = 15

for topic in topics:
    print(topic)
    dictionary_filename = "topic_dictionaries/%s_dictionary_feedback.gensim" % (str(topic))
    dictionary = corpora.Dictionary.load(dictionary_filename)
    corpus_filename = "topic_corpi/%s_corpus_feedback.pkl" % (str(topic))
    with open(corpus_filename, 'rb') as f:
        corpus = pickle.load(f)
    if (len(dictionary) == 0):
        continue
    
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=NUM_PASSES)
    ldamodel_filename = 'topic_models/%s_model%st_%sp.gensim' % (str(topic), str(NUM_TOPICS), str(NUM_PASSES))
    ldamodel.save(ldamodel_filename)
    
    sub_topics = ldamodel.print_topics(num_words=5)
    for sub_topic in sub_topics:
        print(sub_topic)
    
    


async
(0, '0.029*"found" + 0.020*"reading" + 0.020*"discussion" + 0.020*"textbook" + 0.020*"actually"')
(1, '0.092*"async" + 0.034*"session" + 0.026*"really" + 0.023*"pretty" + 0.019*"quality"')
(2, '0.045*"class" + 0.030*"async" + 0.024*"reading" + 0.023*"really" + 0.023*"asyncs"')
miscellaneous
(0, '0.043*"reading" + 0.030*"speed" + 0.030*"material" + 0.030*"assignment" + 0.030*"going"')
(1, '0.051*"class" + 0.036*"impossible" + 0.021*"definitely" + 0.020*"series" + 0.020*"extend"')
(2, '0.026*"light" + 0.026*"speaker" + 0.026*"system" + 0.026*"could" + 0.026*"pretty"')
assignments
(0, '0.048*"assignment" + 0.044*"feedback" + 0.026*"grade" + 0.024*"semester" + 0.024*"project"')
(1, '0.037*"assignment" + 0.016*"grade" + 0.015*"might" + 0.015*"instructor" + 0.015*"feedback"')
(2, '0.019*"class" + 0.019*"feedback" + 0.015*"async" + 0.015*"question" + 0.015*"might"')
instructors
(0, '0.025*"class" + 0.015*"would" + 0.015*"really" + 0.014*"sessions" + 0.011*"discussion"')
(1, '0.035*"clas

### Generate topic summaries

In [27]:
for topic in topics:
    print ("Processing", topic)
    topic_subset = training_feedback[training_feedback.topic==topic]
    topic_text = topic_subset.feedback_text
    
    s = '. '
    text = s.join(topic_text)
    text = re.sub("\.+", ".", text)
    text = re.sub("\n", "", text)
    # print(text)
    print(summarize(text))
    print('\n')

Processing async
async is going pretty well, but not super applicable to the live session discussion.
it felt like a lot of the material was covered multiple times, and then AGAIN in the live session :sleepy:.
async felt a little long this week, it didn’t seem super relevant to what we talked about in live session.
i feel like the async for the last couple of weeks hasn’t had anything to do with what we’ve actually talked about in live session, but it has been interesting at least.
live session hasn’t really had anything to do with the async lectures.
i’m not really sure whether live session or async is more relevant to what we’re supposed to be taking out of this class :disappointed:.
I really appreciated 209's Codepen exercises, and so far I think 241's integrated R Studio work is quite good, but the random quizzes or short response questions that predominate in async are terrible.
I really appreciated 209's Codepen exercises, and so far I think 241's integrated R Studio work is quit