## Chat labeling pipeline


In [102]:
import pandas as pd
import re
import json
import csv

import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English
parser = English()

import gensim
from gensim import corpora
from gensim.summarization.summarizer import summarize
from gensim.models.ldamodel import LdaModel

from pathlib import Path

### Ingest raw chat data

In [57]:
# chats = pd.read_csv('data_inputs/test_export.csv')
# chats_df = pd.read_json('data_inputs/export1.json', lines=True)

chats = []
for line in open('data_inputs/chat_export1.json', 'r'):
    chats.append(json.loads(line))

In [58]:
chats[1:3]

[{'_id': {'$oid': '5c7acb202f46c1066e31fd5b'},
  'student_id': 'UFR6C3N5B',
  'time': 'Sat Mar 02 2019 18:27:10 GMT+0000 (Coordinated Universal Time)',
  'topic': 'live session',
  'course': 'a',
  'prof_name': 'a',
  'instr_wk': 'a',
  'emoji_sentiment': 'pos1',
  'topic_response': 'a'},
 {'_id': {'$oid': '5c7ae81e2f46c1066e31fd5c'},
  'student_id': 'UFR6C3N5B',
  'time': 'Sat Mar 02 2019 18:27:10 GMT+0000 (Coordinated Universal Time)',
  'topic': 'office hours',
  'course': '203',
  'prof_name': 'john do',
  'instr_wk': '2',
  'emoji_sentiment': 'neg',
  'topic_response': 'not much'}]

### Clean data


In [59]:
def sentiment_transform(x):
    return {
        'neg1':1,
        'neg' :2,
        'neu' :3,
        'pos' :4,
        'pos1':5
    }[x]



In [60]:
chats[1]

{'_id': {'$oid': '5c7acb202f46c1066e31fd5b'},
 'student_id': 'UFR6C3N5B',
 'time': 'Sat Mar 02 2019 18:27:10 GMT+0000 (Coordinated Universal Time)',
 'topic': 'live session',
 'course': 'a',
 'prof_name': 'a',
 'instr_wk': 'a',
 'emoji_sentiment': 'pos1',
 'topic_response': 'a'}

In [61]:
for chat in chats:
    chat['emoji_sentiment_int']= sentiment_transform(chat['emoji_sentiment'])

In [62]:
chats[1]

{'_id': {'$oid': '5c7acb202f46c1066e31fd5b'},
 'student_id': 'UFR6C3N5B',
 'time': 'Sat Mar 02 2019 18:27:10 GMT+0000 (Coordinated Universal Time)',
 'topic': 'live session',
 'course': 'a',
 'prof_name': 'a',
 'instr_wk': 'a',
 'emoji_sentiment': 'pos1',
 'topic_response': 'a',
 'emoji_sentiment_int': 5}

### Apply topic models to chat data


In [99]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [133]:
def pred_topic(text):
    return text

NUM_TOPICS = 3
NUM_PASSES = 15

def pred_sub_topic(text, topic):
    # load dictionary
    dictionary_filename = 'topic_dictionaries/%s_dictionary_feedback.gensim' % (str(topic))
    if not Path(dictionary_filename).exists():
        return None
    dictionary = corpora.Dictionary.load(dictionary_filename)
    
    # load model
    ldamodel_filename = 'topic_models/%s_model%st_%sp.gensim' % (str(topic), str(NUM_TOPICS), str(NUM_PASSES))
    if not Path(ldamodel_filename).exists():
        return None
    model = LdaModel.load(ldamodel_filename)
    
    # predict subtopic
    tokens = prepare_text_for_lda(text)
    print(tokens)
    bow_tokens = dictionary.doc2bow(tokens)
    vector = model[bow_tokens]
    print(vector)
    return None


In [136]:
pred_sub_topic('testing feedback responsive slack', 'instructors')

['testing', 'feedback', 'responsive', 'slack']
[(0, 0.758591), (1, 0.12270975), (2, 0.1186993)]


In [137]:
for chat in chats:
    # chat['pred_topic'] = pred_topic(chat['topic_response'])
    chat['pred_topic'] = chat['topic']
    chat['pred_sub_topic'] = pred_sub_topic(chat['topic_response'], chat['topic'])
    # chat['pred_sub_topic'] = chat['topic']

[]
[(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]
[]
[(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]
[]
[(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]
[]
[(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]


### Export appended data

In [114]:
with open('data_outputs/updated_chatdata.csv', 'w') as f:  # Just use 'w' mode in 3.x
    w = csv.DictWriter(f, chats[1].keys())
    w.writeheader()
    for chat in chats:
        w = csv.DictWriter(f, chat.keys())
        w.writerow(chat)

## Chat summaries by topic

### Generate topic summaries


In [None]:
topics = set(chats[topic])

for topic in topics:
    print ("Processing", topic)
    topic_subset = training_feedback[training_feedback.topic==topic]
    topic_text = topic_subset.feedback_text
    
    s = '. '
    text = s.join(topic_text)
    text = re.sub("\.+", ".", text)
    text = re.sub("\n", "", text)
    # print(text)
    print(summarize(text))
    print('\n')

In [72]:
topics = set()

for chat in chats:
    topics.add(chat['topic'])
    
topics

{'assignment', 'instructor', 'live session', 'miscellaneous', 'office hours'}

In [81]:
for topic in topics:
    print ("Processing", topic)
    topic_subset = [d for d in chats if d['topic'] == topic]
    topic_text = []
    for chat in topic_subset:
        topic_text.append(chat['topic_response'])
    print(topic_text)
    
    
    s = '. '
    text = s.join(topic_text)
    text = re.sub("\.+", ".", text)
    text = re.sub("\n", "", text)
    # print(text)
    print(summarize(text))
    print('\n')


Processing miscellaneous
['I think the bridge course is so fun! wanted to compliment paul, the course creators, and admins for doing such a great job in designing such a helpful course', 'nothing']



Processing live session
['a', 'a']



Processing office hours
['not much']



Processing instructor
['a']



Processing assignment
['a']





### Write to file