## Chat labeling pipeline


In [80]:
import pandas as pd
import re
import json
import csv

import gensim
from gensim import corpora
from gensim.summarization.summarizer import summarize

### Ingest raw chat data

In [57]:
# chats = pd.read_csv('data_inputs/test_export.csv')
# chats_df = pd.read_json('data_inputs/export1.json', lines=True)

# The export is parsed line by line: each non-empty line is decoded as one
# JSON document and becomes one entry in `chats`.
# A context manager closes the file handle once parsing finishes, and blank
# lines (e.g. a trailing newline) are skipped because json.loads rejects them.
with open('data_inputs/chat_export1.json', 'r') as export_file:
    chats = [json.loads(line) for line in export_file if line.strip()]

In [58]:
# Display the second and third parsed records to inspect their fields.
chats[1:3]

[{'_id': {'$oid': '5c7acb202f46c1066e31fd5b'},
  'student_id': 'UFR6C3N5B',
  'time': 'Sat Mar 02 2019 18:27:10 GMT+0000 (Coordinated Universal Time)',
  'topic': 'live session',
  'course': 'a',
  'prof_name': 'a',
  'instr_wk': 'a',
  'emoji_sentiment': 'pos1',
  'topic_response': 'a'},
 {'_id': {'$oid': '5c7ae81e2f46c1066e31fd5c'},
  'student_id': 'UFR6C3N5B',
  'time': 'Sat Mar 02 2019 18:27:10 GMT+0000 (Coordinated Universal Time)',
  'topic': 'office hours',
  'course': '203',
  'prof_name': 'john do',
  'instr_wk': '2',
  'emoji_sentiment': 'neg',
  'topic_response': 'not much'}]

### Clean data


In [59]:
# Integer score for each emoji sentiment label (neg1 -> 1 ... pos1 -> 5).
# Built once at definition time instead of on every call.
_SENTIMENT_SCORES = {
    'neg1': 1,
    'neg': 2,
    'neu': 3,
    'pos': 4,
    'pos1': 5,
}


def sentiment_transform(x):
    """Convert an emoji sentiment label into its integer score.

    Parameters
    ----------
    x : str
        One of 'neg1', 'neg', 'neu', 'pos', 'pos1'.

    Returns
    -------
    int
        The score from 1 ('neg1') to 5 ('pos1').

    Raises
    ------
    KeyError
        If ``x`` is not one of the known labels. The message names the
        offending label and the accepted ones so bad records are easy to find.
    """
    try:
        return _SENTIMENT_SCORES[x]
    except KeyError:
        raise KeyError(
            "unknown emoji_sentiment label {!r}; expected one of {}".format(
                x, sorted(_SENTIMENT_SCORES)
            )
        ) from None



In [60]:
# Display the second record as it looks before the numeric sentiment is added.
chats[1]

{'_id': {'$oid': '5c7acb202f46c1066e31fd5b'},
 'student_id': 'UFR6C3N5B',
 'time': 'Sat Mar 02 2019 18:27:10 GMT+0000 (Coordinated Universal Time)',
 'topic': 'live session',
 'course': 'a',
 'prof_name': 'a',
 'instr_wk': 'a',
 'emoji_sentiment': 'pos1',
 'topic_response': 'a'}

In [61]:
# Store the integer score next to each record's emoji label, in place.
for chat in chats:
    chat.update(emoji_sentiment_int=sentiment_transform(chat['emoji_sentiment']))

In [62]:
# Display the second record again to confirm 'emoji_sentiment_int' was added.
chats[1]

{'_id': {'$oid': '5c7acb202f46c1066e31fd5b'},
 'student_id': 'UFR6C3N5B',
 'time': 'Sat Mar 02 2019 18:27:10 GMT+0000 (Coordinated Universal Time)',
 'topic': 'live session',
 'course': 'a',
 'prof_name': 'a',
 'instr_wk': 'a',
 'emoji_sentiment': 'pos1',
 'topic_response': 'a',
 'emoji_sentiment_int': 5}

### Apply topic models to chat data


In [63]:
def pred_topic(text):
    """Return the predicted topic for ``text``.

    Currently a pass-through: the input is returned unchanged.
    """
    predicted_topic = text
    return predicted_topic

def pred_sub_topic(text):
    """Return the predicted sub-topic for ``text``.

    Currently a pass-through: the input is returned unchanged.
    """
    predicted_sub_topic = text
    return predicted_sub_topic


In [64]:
# Attach both predictions, computed from the free-text response, to each record.
for chat in chats:
    chat.update(
        pred_topic=pred_topic(chat['topic_response']),
        pred_sub_topic=pred_sub_topic(chat['topic_response']),
    )

### Export appended data

In [67]:
# Build one column list from every record (first-seen order) so the header and
# all rows share the same layout, even if some records carry different keys.
fieldnames = []
for chat in chats:
    for key in chat:
        if key not in fieldnames:
            fieldnames.append(key)

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' end up with blank rows between records.
with open('data_outputs/updated_chatdata.csv', 'w', newline='') as f:
    # A single writer is reused for the header and every row; records missing
    # a column get an empty cell (restval) rather than shifting their values.
    w = csv.DictWriter(f, fieldnames, restval='')
    w.writeheader()
    w.writerows(chats)

### Generate topic summaries


In [None]:
# Summarize the free-text responses for each topic found in the chat records.
# `chats` is a list of dicts, so the topic labels are gathered by iterating
# the records rather than indexing the list with a column name.
topics = {chat['topic'] for chat in chats}

# Sorted so the output order is the same on every run (set order is not).
for topic in sorted(topics):
    print("Processing", topic)
    topic_text = [chat['topic_response'] for chat in chats if chat['topic'] == topic]

    # Join responses into one passage, then collapse runs of periods
    # (e.g. a response that already ended with '.') into a single one.
    text = '. '.join(topic_text)
    text = re.sub(r"\.+", ".", text)
    text = re.sub("\n", "", text)
    print(summarize(text))
    print('\n')

In [72]:
# Distinct topic labels present across all chat records.
topics = {record['topic'] for record in chats}

topics

{'assignment', 'instructor', 'live session', 'miscellaneous', 'office hours'}

In [81]:
# Print the raw responses and a gensim summary for each topic.
# Sorted so the output order is the same on every run (set order is not).
for topic in sorted(topics):
    print("Processing", topic)
    topic_text = [chat['topic_response'] for chat in chats if chat['topic'] == topic]
    print(topic_text)

    # Join responses into one passage, then collapse runs of periods
    # (e.g. a response that already ended with '.') into a single one.
    text = '. '.join(topic_text)
    text = re.sub(r"\.+", ".", text)  # raw string: "\." is an invalid str escape
    text = re.sub("\n", "", text)
    print(summarize(text))
    print('\n')


Processing miscellaneous
['I think the bridge course is so fun! wanted to compliment paul, the course creators, and admins for doing such a great job in designing such a helpful course', 'nothing']



Processing live session
['a', 'a']



Processing office hours
['not much']



Processing instructor
['a']



Processing assignment
['a']



