## Chat labeling pipeline


In [91]:
import pandas as pd
import re
import json
import csv

import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English
parser = English()

import gensim
from gensim import corpora
from gensim.summarization.summarizer import summarize
from gensim.models.ldamodel import LdaModel

from pathlib import Path

from operator import itemgetter

import pickle

### Ingest raw chat data

In [93]:
# chats = pd.read_csv('data_inputs/test_export.csv')
# chats_df = pd.read_json('data_inputs/export1.json', lines=True)

chats = []
for line in open('data_inputs/chat_export_2.json', 'r'):
    chats.append(json.loads(line))

In [94]:
chats[1:3]

[{'_id': {'$oid': '5c874ee5fb2cfe24a30f806e'},
  'student_id': 'anon',
  'time': 'Tue Mar 12 2019 05:52:53 GMT+0000 (Coordinated Universal Time)',
  'topic': 'technology',
  'course_content': '201',
  'prof_name': 'Jane Doe',
  'emoji_sentiment': 'pos',
  'response_good': 'I personally am a huge fan of Zoom, and I am so glad Zoom has officially replaced Adobe Connect in ISVC. Adobe Connect audio and video quality was my major concern and I am excited that I can have a great online classroom experience with the transition to Zoom.',
  'response_bad': "It's not 100% clear to me when the assignments are due and where to submit them. I understand Jane's motivation to want to use GitHub but the information on GitHub is not in sync with the information on ISVC. If Jane can make sure everything is aligned and in agreement, that would be a huge help!",
  'response_add_feedback': "Office hour Zoom recordings are incredibly helpful. I particularly like that I can benefit from a different instruc

### Clean data


In [95]:
def sentiment_transform(x):
    return {
        'neg1':1,
        'neg' :2,
        'neu' :3,
        'pos' :4,
        'pos1':5
    }[x]



In [96]:
chats[1]

{'_id': {'$oid': '5c874ee5fb2cfe24a30f806e'},
 'student_id': 'anon',
 'time': 'Tue Mar 12 2019 05:52:53 GMT+0000 (Coordinated Universal Time)',
 'topic': 'technology',
 'course_content': '201',
 'prof_name': 'Jane Doe',
 'emoji_sentiment': 'pos',
 'response_good': 'I personally am a huge fan of Zoom, and I am so glad Zoom has officially replaced Adobe Connect in ISVC. Adobe Connect audio and video quality was my major concern and I am excited that I can have a great online classroom experience with the transition to Zoom.',
 'response_bad': "It's not 100% clear to me when the assignments are due and where to submit them. I understand Jane's motivation to want to use GitHub but the information on GitHub is not in sync with the information on ISVC. If Jane can make sure everything is aligned and in agreement, that would be a huge help!",
 'response_add_feedback': "Office hour Zoom recordings are incredibly helpful. I particularly like that I can benefit from a different instructors' offi

In [97]:
for chat in chats:
    chat['emoji_sentiment_int']= sentiment_transform(chat['emoji_sentiment'])

In [98]:
chats[1]

{'_id': {'$oid': '5c874ee5fb2cfe24a30f806e'},
 'student_id': 'anon',
 'time': 'Tue Mar 12 2019 05:52:53 GMT+0000 (Coordinated Universal Time)',
 'topic': 'technology',
 'course_content': '201',
 'prof_name': 'Jane Doe',
 'emoji_sentiment': 'pos',
 'response_good': 'I personally am a huge fan of Zoom, and I am so glad Zoom has officially replaced Adobe Connect in ISVC. Adobe Connect audio and video quality was my major concern and I am excited that I can have a great online classroom experience with the transition to Zoom.',
 'response_bad': "It's not 100% clear to me when the assignments are due and where to submit them. I understand Jane's motivation to want to use GitHub but the information on GitHub is not in sync with the information on ISVC. If Jane can make sure everything is aligned and in agreement, that would be a huge help!",
 'response_add_feedback': "Office hour Zoom recordings are incredibly helpful. I particularly like that I can benefit from a different instructors' offi

### Apply topic models to chat data


In [99]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [102]:
def pred_topic(text):
    model_filename = 'topic_models/full_topic_model.sav'
    topic_model = pickle.load(open(model_filename, 'rb'))
    result = topic_model.predict([text])[0]
    # print(result)
    return result

NUM_TOPICS = 3
NUM_PASSES = 15

def pred_sub_topic(text, topic):
    # load dictionary
    dictionary_filename = 'topic_dictionaries/%s_dictionary_feedback.gensim' % (str(topic))
    if not Path(dictionary_filename).exists():
        return None
    dictionary = corpora.Dictionary.load(dictionary_filename)
    
    # load model
    ldamodel_filename = 'topic_models/%s_model%st_%sp.gensim' % (str(topic), str(NUM_TOPICS), str(NUM_PASSES))
    if not Path(ldamodel_filename).exists():
        return None
    model = LdaModel.load(ldamodel_filename)
    
    # predict subtopic
    tokens = prepare_text_for_lda(text)
    # print(tokens)
    bow_tokens = dictionary.doc2bow(tokens)
    vector = model[bow_tokens]
    # print(vector)
    most_likely = max(vector,key=itemgetter(1))

    if (most_likely[1] < .34):
        # print("No best predictor")
        return None
    return most_likely[0]
    # print(max(vector,key=itemgetter(1)))
    
    

In [103]:
pred_sub_topic('testing feedback slack', 'instructors')

0

In [104]:
pred_topic('he really struggles with communication')

'instructors'

In [107]:
for chat in chats:
    chat['pred_topic'] = pred_topic(chat['topic']) if chat['topic'] == 'miscellaneous' else None
    chat['pred_sub_topic'] = pred_sub_topic(chat['reponse_full'], chat['topic'])
    # chat['pred_sub_topic'] = chat['topic']

### Export appended data

In [108]:
with open('data_outputs/updated_chatdata.csv', 'w') as f:  # Just use 'w' mode in 3.x
    w = csv.DictWriter(f, chats[1].keys())
    w.writeheader()
    for chat in chats:
        w = csv.DictWriter(f, chat.keys())
        w.writerow(chat)

## Chat summaries by topic

### Generate topic summaries


In [111]:
topics = set()

for chat in chats:
    topics.add(chat['topic'])
    
topics

{'assignment', 'live session', 'technology'}

In [113]:
for topic in topics:
    print ("Processing", topic)
    topic_subset = [d for d in chats if d['topic'] == topic]
    topic_text = []
    for chat in topic_subset:
        topic_text.append(chat['reponse_full'])
    print(topic_text)
    
    
    s = '. '
    text = s.join(topic_text)
    text = re.sub("\.+", ".", text)
    text = re.sub("\n", "", text)
    # print(text)
    print(summarize(text))
    print('\n')


Processing live session
["Groups keep going over their alotted time to present and the instructors don't interceed. I'd ask them to be more strict and maybe tell the students when they have 5 and 1 minute left I'd ask them to cut the groups off when they exceed their time"]



Processing technology
["I personally am a huge fan of Zoom, and I am so glad Zoom has officially replaced Adobe Connect in ISVC. Adobe Connect audio and video quality was my major concern and I am excited that I can have a great online classroom experience with the transition to Zoom. It's not 100% clear to me when the assignments are due and where to submit them. I understand Jane's motivation to want to use GitHub but the information on GitHub is not in sync with the information on ISVC. If Jane can make sure everything is aligned and in agreement, that would be a huge help! Office hour Zoom recordings are incredibly helpful. I particularly like that I can benefit from a different instructors' office hours as w

ValueError: input must have more than one sentence

### Write to file