In [75]:
import pickle as pkl
from WebQA.core.QA_Page import QA_Page
from WebQA.pages.Medhelp import MedhelpPage
from WebQA.pages.Healthtap import HealthtapPage
from WebQA.core.Post import Post, Question, Answer
from WebQA.core.User import User
import pandas as pd
import sys
import os

In [76]:
# Root of the current user's home directory and the WebQA data folder below it.
# DATA keeps its trailing slash because callers append file names directly.
HOME = os.path.expanduser("~")
DATA = f"{HOME}/data_buffer/WebQA/"

import unicodedata
import string
import re


def unicode_to_ascii(s):
    """Return ``s`` with combining marks removed.

    The string is decomposed to NFD so that accented letters split into a base
    character plus combining marks, and every character whose Unicode category
    is ``Mn`` is then dropped. Characters without such a decomposition are
    returned unchanged, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


# Lowercase, trim, strip accents, and replace non-letter characters with spaces
def normalize_string(s):
    """Normalize ``s`` into lowercase, space-separated tokens.

    Steps, in order:
      1. lowercase and strip surrounding whitespace;
      2. remove combining marks via ``unicode_to_ascii``;
      3. insert a space in front of each of the characters ``› . ! ?``;
      4. replace every run of characters other than ASCII letters and
         ``. ! ?`` with a single space;
      5. collapse repeated whitespace and trim the ends.
    """
    cleaned = s.lower().strip()
    cleaned = unicode_to_ascii(cleaned)
    # Detach sentence punctuation from the preceding word.
    spaced = re.sub(r"([›.!?])", r" \1", cleaned)
    # Digits, tabs, and any other symbol all turn into a separator here.
    letters_only = re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)
    tokens = letters_only.split()
    return ' '.join(tokens)

In [77]:
def turn_to_pairs_qa():
    """Build ``question<TAB>answer`` training strings from ``DATA + "qa.pkl"``.

    Only answers whose ``user.isMedical`` is truthy are kept. Question and
    answer text are passed through ``normalize_string`` before being joined
    with a single tab.

    Pages without a question, and questions or answers that are empty after
    normalization, are skipped, so every returned string has exactly two
    non-empty tab-separated fields.

    Returns:
        list: one string per kept answer, in page order and then answer order.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load a file
    # that comes from a trusted source.
    with open(DATA + "qa.pkl", "rb") as f:
        pages = pd.read_pickle(f)

    training_pairs = []
    for page in pages:
        # The question is read through `.text` below, so comparing the object
        # itself to "" is not enough: a missing question must be skipped too.
        if not (page and page.question is not None and page.question != ""):
            continue

        answers = []
        for answer in page.answers:
            # only take professional's answer
            if answer is not None and answer.user is not None and answer.user.isMedical:
                answers.append(normalize_string(answer.text))
        if not answers:
            continue

        question = normalize_string(page.question.text)
        # A question that normalizes to "" would yield a row whose first
        # column is blank; a tab inside it would add an extra column.
        if question == "" or '\t' in question:
            continue

        for answer in answers:
            if answer != "" and '\t' not in answer:
                training_pairs.append(question + "\t" + answer)
    return training_pairs

In [78]:
def turn_to_pairs_health():
    """Build ``question<TAB>answer`` training strings from ``DATA + "healthtap.pkl"``.

    Every answer with non-empty text is kept; unlike ``turn_to_pairs_qa`` there
    is no filter on the answering user. Question and answer text are passed
    through ``normalize_string`` before being joined with a single tab.

    Pages without a question, and questions or answers that are empty after
    normalization, are skipped, so every returned string has exactly two
    non-empty tab-separated fields.

    Returns:
        list: one string per kept answer, in page order and then answer order.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load a file
    # that comes from a trusted source.
    with open(DATA + "healthtap.pkl", "rb") as f:
        pages = pd.read_pickle(f)

    training_pairs = []
    for page in pages:
        # The question is read through `.text` below, so comparing the object
        # itself to "" is not enough: a missing question must be skipped too.
        if not (page and page.question is not None and page.question != ""):
            continue

        answers = []
        for answer in page.answers:
            # keep any answer that has text
            if answer is not None and answer.text != "":
                answers.append(normalize_string(answer.text))
        if not answers:
            continue

        question = normalize_string(page.question.text)
        # A question that normalizes to "" would yield a row whose first
        # column is blank; a tab inside it would add an extra column.
        if question == "" or '\t' in question:
            continue

        for answer in answers:
            if answer != "" and '\t' not in answer:
                training_pairs.append(question + "\t" + answer)
    return training_pairs

In [79]:
def create_tsv(filename: str = None, content: list = None):
    """Write each item of ``content`` to ``filename`` on its own line.

    Args:
        filename: path of the file to create or overwrite.
        content: iterable of rows; each one is formatted with ``%s`` and
            followed by a newline.

    Raises:
        TypeError: if ``filename`` or ``content`` is None. Both are checked
            before the file is opened, so a bad call never truncates an
            existing file.
    """
    if filename is None:
        raise TypeError("create_tsv() requires a filename")
    if content is None:
        raise TypeError("create_tsv() requires content to write")
    with open(filename, "w") as record_file:
        for item in content:
            record_file.write("%s\n" % item)

In [80]:
# Build the question<TAB>answer strings from qa.pkl and healthtap.pkl.
training_pairsQA = turn_to_pairs_qa()
training_pairsHealth = turn_to_pairs_health()

In [81]:
# Order each list by string length, shortest pair first. The sort is in place,
# so the original ordering is lost.
training_pairsQA.sort(key=len)
training_pairsHealth.sort(key=len)

In [82]:
# Write one pair per line. The file names are relative, so the files land in
# the current working directory.
# NOTE(review): a later cell reads DATA + 'Health_dataset.tsv'; confirm whether
# these files are meant to be written under DATA instead.
create_tsv('QA_dataset.tsv', training_pairsQA)
create_tsv('Health_dataset.tsv', training_pairsHealth)

In [74]:
# Number of whitespace-separated tokens in `s`.
# NOTE(review): `s` is not assigned in any cell visible here, so this would
# raise NameError on a fresh kernel unless it is defined elsewhere; it looks
# like leftover scratch work — confirm and remove.
len(s.split())

50

'my three years old has had a normal type of day she laid down for her afternoon nap and woke with a fever . abdominal pain as described by child her tummy hurts and she has vomited twice in an hours time . after vomiting she feels better but her fever is still present . on friday she had received two standard booster shots and the influenza nasal treatment all given by her dr . am i just looking at a bug or should there be more concern ? hello your daughter could be having a reaction to the influenza treatment after all they give you a little of the disease to help build immunity to it . monitor her tonight and if she is not better tomorrow or worse then get her back into the doctor it is better to be safe than sorry . susie'

In [26]:
# Read every row of the Health dataset. The context manager closes the file
# handle as soon as the rows are in memory instead of leaving it open.
with open(DATA + 'Health_dataset.tsv', 'r') as health_file:
    f = health_file.readlines()

In [35]:
# Cut each row at the first occurrence of 'dr.' and drop the line terminator.
# NOTE(review): the marker is searched across the whole row, so a 'dr.' in the
# question column removes the tab and the answer as well — confirm that is intended.
lines = []
for item in f:
    row = item.rstrip('\n')
    cut = row.find('dr.')
    # str.find returns -1 when the marker is absent; slicing with -1 would
    # chop the last character of the row instead of leaving it intact.
    lines.append(row if cut == -1 else row[:cut])

# Overwrites the same file that was read above with the truncated rows.
with open(DATA + 'Health_dataset.tsv', "w") as record_file:
    for item in lines:
        record_file.write("%s\n" % item)



['does', 'drug', 'therapy', 'work', 'for', 'anxiety', 'disorders', '?']

In [33]:
# Whitespace-separated tokens of the second tab-separated field of the first row.
lines[0].split('\t')[1].split()

['in',
 'brief',
 'yes',
 'both',
 'psychotherapy',
 'and',
 'medications',
 'can',
 'help',
 'people',
 'with',
 'anxiety',
 'disorders.',
 'some',
 'people',
 'benefit',
 'from',
 'one',
 'or',
 'both',
 'of',
 'these',
 'types',
 'of',
 'treatment',
 'more',
 'than',
 'the',
 'other.',
 'anxiety',
 'disorders',
 'tend',
 'to',
 'be',
 'fairly',
 'chronic',
 'for',
 'many',
 'people',
 'and',
 'so',
 'it',
 'is',
 'important',
 'to',
 'look',
 'for',
 'long',
 'term',
 'solutions.',
 'in',
 'brief',
 'yes',
 'both',
 'psychotherapy',
 'and',
 'medications',
 'can',
 'help',
 'people',
 'with',
 'anxiety',
 'disorders.',
 'some',
 'people',
 'benefit',
 'from',
 'one',
 'or',
 'both',
 'of',
 'these',
 'types',
 'of',
 'treatment',
 'more',
 'than',
 'the',
 'other.',
 'anxiety',
 'disorders',
 'tend',
 'to',
 'be',
 'fairly',
 'chronic',
 'for',
 'many',
 'people',
 'and',
 'so',
 'it',
 'is',
 'important',
 'to',
 'look',
 'for',
 'long',
 'term',
 'solutions.',
 'would',
 'you',
 '

In [None]:
for line in lines:
    pair = line.split('\t')
    question = 