## Chat labeling pipeline


In [58]:
import pandas as pd
import re
import json
import csv

import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English
parser = English()

import gensim
from gensim import corpora
from gensim.summarization.summarizer import summarize
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases

from pathlib import Path

from operator import itemgetter

import pickle

### Ingest raw chat data

In [93]:
# chats = pd.read_csv('data_inputs/test_export.csv')
# chats_df = pd.read_json('data_inputs/export1.json', lines=True)

chats = []
for line in open('data_inputs/chat_export_2.json', 'r'):
    chats.append(json.loads(line))

In [115]:
chats[1:2]

[{'_id': {'$oid': '5c874ee5fb2cfe24a30f806e'},
  'student_id': 'anon',
  'time': 'Tue Mar 12 2019 05:52:53 GMT+0000 (Coordinated Universal Time)',
  'topic': 'technology',
  'course_content': '201',
  'prof_name': 'Jane Doe',
  'emoji_sentiment': 'pos',
  'response_good': 'I personally am a huge fan of Zoom, and I am so glad Zoom has officially replaced Adobe Connect in ISVC. Adobe Connect audio and video quality was my major concern and I am excited that I can have a great online classroom experience with the transition to Zoom.',
  'response_bad': "It's not 100% clear to me when the assignments are due and where to submit them. I understand Jane's motivation to want to use GitHub but the information on GitHub is not in sync with the information on ISVC. If Jane can make sure everything is aligned and in agreement, that would be a huge help!",
  'response_add_feedback': "Office hour Zoom recordings are incredibly helpful. I particularly like that I can benefit from a different instruc

In [None]:
# json file
chats = []
for line in open('data_inputs/chat_export_2.json', 'r'):
    chats.append(json.loads(line))
    

In [62]:
# csv file
chats = pd.read_csv('training_data/feedback_data_for_training.csv')


In [63]:
chats[1:10]

Unnamed: 0,source,feedback_text,topic,sub_topic
1,team_generated,"i’d ask for more time from the TAs, or try to ...",assignments,slow turnaround
2,team_generated,the way that they encourage lots of class disc...,live session,good discussion
3,team_generated,"async is going pretty well, but not super appl...",async,disconnected from live session
4,team_generated,abe lincoln is the best professor i’ve ever had,instructors,
5,team_generated,tech is bad,technology,negative
6,team_generated,this is a great bot,technology,positive
7,team_generated,"the breakout sessions were very engaging, and ...",live session,good discussion
8,team_generated,one person dominated the full-class discussion...,live session,unbalanced
9,team_generated,"they’re fine. problem sets are really helpful,...",assignments,slow turnaround


### Clean data


In [95]:
def sentiment_transform(x):
    return {
        'neg1':1,
        'neg' :2,
        'neu' :3,
        'pos' :4,
        'pos1':5
    }[x]



In [96]:
chats[1]

{'_id': {'$oid': '5c874ee5fb2cfe24a30f806e'},
 'student_id': 'anon',
 'time': 'Tue Mar 12 2019 05:52:53 GMT+0000 (Coordinated Universal Time)',
 'topic': 'technology',
 'course_content': '201',
 'prof_name': 'Jane Doe',
 'emoji_sentiment': 'pos',
 'response_good': 'I personally am a huge fan of Zoom, and I am so glad Zoom has officially replaced Adobe Connect in ISVC. Adobe Connect audio and video quality was my major concern and I am excited that I can have a great online classroom experience with the transition to Zoom.',
 'response_bad': "It's not 100% clear to me when the assignments are due and where to submit them. I understand Jane's motivation to want to use GitHub but the information on GitHub is not in sync with the information on ISVC. If Jane can make sure everything is aligned and in agreement, that would be a huge help!",
 'response_add_feedback': "Office hour Zoom recordings are incredibly helpful. I particularly like that I can benefit from a different instructors' offi

In [97]:
for chat in chats:
    chat['emoji_sentiment_int']= sentiment_transform(chat['emoji_sentiment'])

In [98]:
chats[1]

{'_id': {'$oid': '5c874ee5fb2cfe24a30f806e'},
 'student_id': 'anon',
 'time': 'Tue Mar 12 2019 05:52:53 GMT+0000 (Coordinated Universal Time)',
 'topic': 'technology',
 'course_content': '201',
 'prof_name': 'Jane Doe',
 'emoji_sentiment': 'pos',
 'response_good': 'I personally am a huge fan of Zoom, and I am so glad Zoom has officially replaced Adobe Connect in ISVC. Adobe Connect audio and video quality was my major concern and I am excited that I can have a great online classroom experience with the transition to Zoom.',
 'response_bad': "It's not 100% clear to me when the assignments are due and where to submit them. I understand Jane's motivation to want to use GitHub but the information on GitHub is not in sync with the information on ISVC. If Jane can make sure everything is aligned and in agreement, that would be a huge help!",
 'response_add_feedback': "Office hour Zoom recordings are incredibly helpful. I particularly like that I can benefit from a different instructors' offi

### Apply topic models to chat data


In [7]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

def prepare_document(doc):
    
    tokenized = []
    
    tokens = prepare_text_for_lda(str(doc))
    tokenized.append(tokens)

    # Add bigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(tokenized, min_count=20)
    for idx in range(len(tokenized)):
        for token in bigram[tokenized[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                tokenized[idx].append(token)
        
    return tokenized

In [52]:
def pred_topic(text):
    model_filename = 'topic_models/full_topic_model.sav'
    topic_model = pickle.load(open(model_filename, 'rb'))
    result = topic_model.predict([text])[0]
    # print(result)
    return result

def pred_sub_topic(text, dictionary, model):
    
    # predict subtopic
    tokens = prepare_document(text)
    
    print(tokens)
    bow_tokens = dictionary.doc2bow(tokens[0])
    vector = model[bow_tokens]
    print(vector)
    most_likely = max(vector,key=itemgetter(1))
    vector = sorted(((v,k) for k,v in vector))
    # print (vector)
    
    if (vector[-1][1] == 5):
        return vector[-2][1]
    return vector[-1][1]
    
    

In [53]:
NUM_TOPICS = 14
NUM_PASSES = 300

# load dictionary
dictionary_filename = "topic_dictionaries/%s_dictionary_feedback.gensim" % ("sciences")
dictionary = corpora.Dictionary.load(dictionary_filename)
    
# load model
ldamodel_filename = 'topic_models/%s_model%st_%sp.gensim' % ("sciences", str(NUM_TOPICS), str(NUM_PASSES))
model = LdaModel.load(ldamodel_filename)

In [55]:
pred_sub_topic('this is a test to see what comes back office hours office hours', dictionary, model)

[['come', 'office', 'hours', 'office', 'hours']]
[(0, 0.013133085), (1, 0.047014322), (2, 0.022961633), (3, 0.017972628), (4, 0.03811367), (5, 0.39596027), (6, 0.024542574), (7, 0.1266452), (8, 0.028104296), (9, 0.047045887), (10, 0.023035506), (11, 0.17412157), (12, 0.013958988), (13, 0.027390378)]


11

In [56]:
pred_sub_topic('exams, quizzes, tests, midterms, this should all come back as exams and tests', dictionary, model)

[['exam', 'quiz', 'test', 'midterm', 'exam', 'test']]
[(0, 0.012616726), (1, 0.20241839), (2, 0.02205884), (3, 0.017265989), (4, 0.036615137), (5, 0.41970998), (6, 0.023577621), (7, 0.12166584), (8, 0.026999306), (9, 0.04519616), (10, 0.022129808), (11, 0.01002254), (12, 0.013410156), (13, 0.026313458)]


1

In [57]:
pred_sub_topic('this is a test', dictionary, model)

[[]]
[(0, 0.016511416), (1, 0.0591082), (2, 0.028868241), (3, 0.022595871), (4, 0.04791796), (5, 0.446374), (6, 0.030855859), (7, 0.15922318), (8, 0.035333794), (9, 0.059147883), (10, 0.028961116), (11, 0.013116423), (12, 0.017549772), (13, 0.034436226)]


7

In [107]:
for chat in chats:
    # chat['pred_topic'] = pred_topic(chat['topic']) if chat['topic'] == 'miscellaneous' else None
    chat['pred_sub_topic'] = pred_sub_topic(chat['reponse_full'], dictionary, model)
    

### Export appended data

In [108]:
with open('data_outputs/updated_chatdata.csv', 'w') as f:  # Just use 'w' mode in 3.x
    w = csv.DictWriter(f, chats[1].keys())
    w.writeheader()
    for chat in chats:
        w = csv.DictWriter(f, chat.keys())
        w.writerow(chat)

## Chat summaries by topic

### Generate topic summaries


In [111]:
topics = set()

for chat in chats:
    topics.add(chat['topic'])
    
topics

{'assignment', 'live session', 'technology'}

In [113]:
for topic in topics:
    print ("Processing", topic)
    topic_subset = [d for d in chats if d['topic'] == topic]
    topic_text = []
    for chat in topic_subset:
        topic_text.append(chat['reponse_full'])
    print(topic_text)
    
    
    s = '. '
    text = s.join(topic_text)
    text = re.sub("\.+", ".", text)
    text = re.sub("\n", "", text)
    # print(text)
    print(summarize(text))
    print('\n')


Processing live session
["Groups keep going over their alotted time to present and the instructors don't interceed. I'd ask them to be more strict and maybe tell the students when they have 5 and 1 minute left I'd ask them to cut the groups off when they exceed their time"]



Processing technology
["I personally am a huge fan of Zoom, and I am so glad Zoom has officially replaced Adobe Connect in ISVC. Adobe Connect audio and video quality was my major concern and I am excited that I can have a great online classroom experience with the transition to Zoom. It's not 100% clear to me when the assignments are due and where to submit them. I understand Jane's motivation to want to use GitHub but the information on GitHub is not in sync with the information on ISVC. If Jane can make sure everything is aligned and in agreement, that would be a huge help! Office hour Zoom recordings are incredibly helpful. I particularly like that I can benefit from a different instructors' office hours as w

ValueError: input must have more than one sentence

### Write to file