# EDA for QA4MRE dataset

## Imports and Setup

In [1]:
import csv
import requests
import xml.etree.ElementTree as ET
#import stanford_parser
import nltk
import pandas as pd
import re
import json
#from stanford_parser import sentence_split

In [2]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

## Input .XML file, organize .CSV

In [3]:
# create element tree object 
def parse_file(xmlfiles,yrs,filename):
   
    # open a file for writing

    QA_data = open(filename, 'w')

    # create the csv writer object

    csvwriter = csv.writer(QA_data)

    # write column names
    headings = ['YR','T_ID','T_NAME','R_ID','D_ID','D_STR','Q_ID','Q_STR','A_ID','A_STR','CORRECT']
    csvwriter.writerow(headings)
    
    for i,xmlfile in enumerate(xmlfiles):
        print("Opening...",xmlfile)
        
        tree = ET.parse(xmlfile)
        yr = yrs[i]
  
        # get root element 
        root = tree.getroot() 

        #rows = []

        for topic in root.iter('topic'):
            t_id = topic.attrib['t_id']
            t_name = topic.attrib['t_name']
            #print(topic.attrib)

            for read_test in topic.iter('reading-test'):
                r_id = read_test.attrib['r_id']
                #print("\nreading test",r_id)
                doc = read_test.find('doc')
                d_id = doc.attrib['d_id']
                d_str = doc.text#.encode(#'utf-8')

                for questions in read_test.iter('q'):
                    q_id = questions.attrib['q_id']
                    question = questions.find('q_str').text#.encode('utf-8')
                    #print(question)

                    for answers in questions.iter('answer'):
                        rows = []

                        if ('correct' in answers.attrib):
                            correct = 1
                        else:
                            correct = 0
                        #print(correct)
                        a_id = answers.attrib['a_id']
                        a_text = answers.text#.encode('utf-8')
                        #print(a_text)

                        rows.append(yr)
                        rows.append(t_id)
                        rows.append(t_name)
                        rows.append(r_id)                
                        rows.append(d_id)
                        rows.append(d_str)
                        rows.append(q_id)
                        rows.append(question)
                        rows.append(a_id)
                        rows.append(a_text)
                        rows.append(correct)

                        csvwriter.writerow(rows)

        print("Closing...",xmlfile)

    print("Parsing complete!")
    QA_data.close()

In [150]:
#parse_file(test_file,[2011],'./data/qa4mre/qa4mre_tester.csv')

Opening... ./data/qa4mre/2011/QA4MRE-2011-EN_GS.xml
Closing... ./data/qa4mre/2011/QA4MRE-2011-EN_GS.xml
Parsing complete!


In [151]:
test_file = ['./data/qa4mre/2011/QA4MRE-2011-EN_GS.xml']
files = ['./data/qa4mre/2011/QA4MRE-2011-EN_GS.xml','./data/qa4mre/2012/QA4MRE-2012-EN_GS_SYNC.xml',
         './data/qa4mre/2012/QA4MRE-2012-EN_GS.xml','./data/qa4mre/2013/QA4MRE-2013-EN_GS.xml']
years = [2011,2012,2012,2013] # THIS NEEDS TO BE UPDATED AS FILES ARE ADDED/REMOVED

In [None]:
## DON'T REDO THIS, JUST LOAD NOW
parse_file(files,years,'./data/qa4mre/qa4mre.csv')

In [None]:
# FILES TO SAVE FOR LATER
'./data/qa4mre/2012/QA4MRE-2012_BIOMEDICAL_GS.xml','./data/qa4mre/2013/QA4MRE-2013_BIO_GS-RUN.xml',

## Load Data for EDA

In [4]:
qa4mre_df = pd.read_csv("./data/qa4mre/qa4mre.csv") #update as needed

In [5]:
qa4mre_df.head()

Unnamed: 0,YR,T_ID,T_NAME,R_ID,D_ID,D_STR,Q_ID,Q_STR,A_ID,A_STR,CORRECT
0,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1,What event caused Annie Lennox to commit herse...,1,the imprisonment of Nelson Mandela at Robben I...,0
1,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1,What event caused Annie Lennox to commit herse...,2,the closing ceremony of Nelson Mandela's Found...,0
2,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1,What event caused Annie Lennox to commit herse...,3,the meeting with Youssou N'Dour,0
3,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1,What event caused Annie Lennox to commit herse...,4,the racial segregation in South Africa,0
4,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1,What event caused Annie Lennox to commit herse...,5,Nelson Mandela's conference to the world press,1


In [6]:
qa4mre_df.groupby(['T_ID']).size()

T_ID
1    960
2    970
3    990
4    700
dtype: int64

In [7]:
qa4mre_df.groupby(['YR','T_ID']).size()

YR    T_ID
2011  1       200
      2       200
      3       200
2012  1       400
      2       400
      3       400
      4       400
2013  1       360
      2       370
      3       390
      4       300
dtype: int64

In [8]:
qa4mre_df.groupby(['T_ID','T_NAME','YR']).size()

T_ID  T_NAME             YR  
1     AIDS               2011    200
                         2012    400
                         2013    360
2     Climate Change     2011    200
                         2012    400
                         2013    370
3     Music and Society  2011    200
                         2012    400
                         2013    390
4     Alzheimer          2012    400
                         2013    300
dtype: int64

In [9]:
qa4mre_df.groupby(['Q_ID']).size()

Q_ID
1     300
2     300
3     300
4     300
5     300
6     300
7     300
8     300
9     300
10    300
11     80
12     80
13     80
14     80
15     80
16     60
17     60
18     60
19     25
20     15
dtype: int64

In [10]:
qa4mre_df.groupby(['CORRECT','A_ID']).size()

CORRECT  A_ID
0        1       582
         2       586
         3       593
         4       583
         5       552
1        1       142
         2       138
         3       131
         4       141
         5       172
dtype: int64

In [11]:
qa4mre_df.groupby(['CORRECT','YR','A_ID']).size()

CORRECT  YR    A_ID
0        2011  1        89
               2        98
               3        97
               4        95
               5       101
         2012  1       258
               2       254
               3       256
               4       248
               5       264
         2013  1       235
               2       234
               3       240
               4       240
               5       187
1        2011  1        31
               2        22
               3        23
               4        25
               5        19
         2012  1        62
               2        66
               3        64
               4        72
               5        56
         2013  1        49
               2        50
               3        44
               4        44
               5        97
dtype: int64

In [12]:
qa4mre_df.groupby(['CORRECT','T_NAME','A_ID']).size()

CORRECT  T_NAME             A_ID
0        AIDS               1       156
                            2       157
                            3       163
                            4       150
                            5       142
         Alzheimer          1       117
                            2       112
                            3       115
                            4       112
                            5       104
         Climate Change     1       162
                            2       150
                            3       161
                            4       156
                            5       147
         Music and Society  1       147
                            2       167
                            3       154
                            4       165
                            5       159
1        AIDS               1        36
                            2        35
                            3        29
                            4        42
       

## Process String Text

In [13]:
qa4mre_df.insert(6,'D_LEN',0)
qa4mre_df.insert(9,"Q_LEN",0)
qa4mre_df.insert(12,"A_LEN",0)

In [14]:
qa4mre_df.head()

Unnamed: 0,YR,T_ID,T_NAME,R_ID,D_ID,D_STR,D_LEN,Q_ID,Q_STR,Q_LEN,A_ID,A_STR,A_LEN,CORRECT
0,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,0,1,What event caused Annie Lennox to commit herse...,0,1,the imprisonment of Nelson Mandela at Robben I...,0,0
1,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,0,1,What event caused Annie Lennox to commit herse...,0,2,the closing ceremony of Nelson Mandela's Found...,0,0
2,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,0,1,What event caused Annie Lennox to commit herse...,0,3,the meeting with Youssou N'Dour,0,0
3,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,0,1,What event caused Annie Lennox to commit herse...,0,4,the racial segregation in South Africa,0,0
4,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,0,1,What event caused Annie Lennox to commit herse...,0,5,Nelson Mandela's conference to the world press,0,1


In [15]:
for i,row in qa4mre_df.iterrows():
    qa4mre_df.at[i,'D_LEN'] = len(nltk.word_tokenize(row['D_STR']))
    qa4mre_df.at[i,'Q_LEN'] = len(nltk.word_tokenize(row['Q_STR']))
    qa4mre_df.at[i,'A_LEN'] = len(nltk.word_tokenize(row['A_STR']))

### Passage Length

In [16]:
qa4mre_df['D_LEN'].max()

4305

In [17]:
qa4mre_df['D_LEN'].min()

958

In [18]:
qa4mre_df['D_LEN'].mean() # This will become a problem for 2013, with variable # of questions

1991.3052486187846

### Question Length

In [19]:
qa4mre_df['Q_LEN'].max()

36

In [20]:
qa4mre_df['Q_LEN'].min()

4

In [21]:
qa4mre_df['Q_LEN'].mean() # Each question is represented 5 times, for each answer

12.113259668508288

### Answer Length

In [22]:
qa4mre_df['A_LEN'].max()

41

In [23]:
qa4mre_df['A_LEN'].min()

1

In [24]:
qa4mre_df['A_LEN'].mean() # Each question is represented 5 times, for each answer

4.702209944751381

In [25]:
qa4mre_df.groupby(['CORRECT']).max()

Unnamed: 0_level_0,YR,T_ID,T_NAME,R_ID,D_ID,D_STR,D_LEN,Q_ID,Q_STR,Q_LEN,A_ID,A_STR,A_LEN
CORRECT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,2013,4,Music and Society,16,16,"Elizabeth Pisani: Sex, drugs and HIV -- let's ...",4305,20,"Why, when playing at the house of Cardinal Ott...",36,5,you could work harder,22
1,2013,4,Music and Society,16,16,"Elizabeth Pisani: Sex, drugs and HIV -- let's ...",4305,20,"Why, when playing at the house of Cardinal Ott...",36,5,young people could listen to pop outside,41


In [117]:
qa4mre_df.to_csv('qa4mre_rnn.csv')

# Test RNN

In [68]:
import numpy
import string
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

## Pre-process Data

In [107]:
test_text = ["Mandela told the world's press that there was a virtual genocide taking place in his country, \
that post-apartheid Rainbow Nation, 1000 were dying on a daily basis, and that the front line victims, \
the most vulnerable of all, were women and children."]
test_question = ["What did Mandela tell the world's press?"]
test_answer = ["That there was a virtual genocide taking place in his country"]

In [108]:
def preprocess(text):
    for story in text:
        result = story.lower()
        result = re.sub(r'\d+', 'NUM', result) # deal with numerals
   # result = # remove punctuation
    
    return(result)

def preprocess2(text):
    results = []
    for story in text:
        result = story.strip()
        result = result.lower()
        result = re.sub(r'\d+', '', result) # deal with numerals
        result = tokenizer.tokenize(result)
        results.append(result)
        
    return(results)

### Train Embeddings

In [78]:
from gensim.models import word2vec
import logging

In [110]:
sentences = word2vec.Text8Corpus('./embeddings/text8')
w2v_model = word2vec.Word2Vec(sentences, size = 200)

In [83]:
w2v_model['boy']

In [112]:
def gen_embed(lst, model):
    results = []
    for story in lst:
        result = [model[l] for l in story]
        results.append(result)
    return(results)

embeds = gen_embed(preprocess2(test_text),w2v_model)

  after removing the cwd from sys.path.


### LSTM  

In [97]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))

In [118]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 20


NameError: name 'recurrent' is not defined

In [116]:
story_maxlen = max(map(len, (x for x in embeds))) # max number of words in story
print(story_maxlen)
query_maxlen = 10
#query_maxlen = max(map(len, (x for x in test_question)))
print(query_maxlen)

43
40


## Annotated Data EDA

In [26]:
# Load File: Corresponds to 2013 dataset
with open("./annotated-data/qa4mre_modfied-jb.json", 'r') as f:
    annotations = json.load(f)
    
annotations[2]
# we have a list of dictionaries:
# { original ID: XX, 
#   annotations: [{skills: [1,2,3], sent_inds: [[1,2],[3,4]], skill_count: #, nonsense: T/F}, ....] # for each question
#   questions: [list of beginning of text of each question],
#   id: "id#"}


# Important to note that Q#s in QA4MRE data is 1-16, 0-15 in annotated dataset 

# ALSO VERY IMPORTANT TO NOTE: NOT ALL QUESTIONS ARE INCLUDED

{'annotations': [{'nonsense': False,
   'sents_indices': [[784, 795]],
   'skill_count': 4,
   'skills': [8, 9, 2, 11]},
  {'nonsense': False,
   'sents_indices': [[598, 611]],
   'skill_count': 1,
   'skills': [2]},
  {'nonsense': False,
   'sents_indices': [[859, 875]],
   'skill_count': 2,
   'skills': [2, 11]},
  {'nonsense': True,
   'sents_indices': [[41, 67]],
   'skill_count': 0,
   'skills': [14]},
  {'nonsense': False,
   'sents_indices': [[93, 116]],
   'skill_count': 2,
   'skills': [9, 11]}],
 'id': 'qa4mre_002',
 'original_id': '4_Alzheimer_3',
 'questions': ['Q: What would an Alzheimer relative envisage w...',
  'Q: Is it possible for Alann...',
  "Q: Name a reason why Alanna's father...",
  'Q: What does the help in getting dress...',
  'Q: How many people with dementia are predicted to exist in t...']}

### Build ID for QA4MRE

In [27]:
for i,row in qa4mre_df.iterrows():
    qa4mre_df.at[i,'ANNOT_ID'] = str(row['T_ID']) + "_" + row['T_NAME'] + "_" + str(row['R_ID']) 
qa4mre_df.head()

Unnamed: 0,YR,T_ID,T_NAME,R_ID,D_ID,D_STR,D_LEN,Q_ID,Q_STR,Q_LEN,A_ID,A_STR,A_LEN,CORRECT,ANNOT_ID
0,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1581,1,What event caused Annie Lennox to commit herse...,14,1,the imprisonment of Nelson Mandela at Robben I...,8,0,1_AIDS_1
1,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1581,1,What event caused Annie Lennox to commit herse...,14,2,the closing ceremony of Nelson Mandela's Found...,8,0,1_AIDS_1
2,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1581,1,What event caused Annie Lennox to commit herse...,14,3,the meeting with Youssou N'Dour,5,0,1_AIDS_1
3,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1581,1,What event caused Annie Lennox to commit herse...,14,4,the racial segregation in South Africa,6,0,1_AIDS_1
4,2011,1,AIDS,1,1,Annie Lennox Why I am an HIVAIDS activist I'm ...,1581,1,What event caused Annie Lennox to commit herse...,14,5,Nelson Mandela's conference to the world press,8,1,1_AIDS_1


In [27]:
filename = './annotated-data/qa4mre_annotated.csv'

In [66]:
# open a file for writing

QA_data_annotated = open(filename, 'w')

# create the csv writer object

csvwriter = csv.writer(QA_data_annotated)

# write column names
headings = ['ANNOT_ID','Q_ID','NONSENSE','SENTS_INDICES','N_SENTS','SKILLS','SKILL_COUNTS','Q_STR_2']
csvwriter.writerow(headings)

# for each item in the list of annotations opened above:
for i,v in enumerate(annotations):
    # get the Passage ID ("original_id" in dictionary), set to ANNOT_ID
    annot_id = v['original_id']
    
    # Figure out what question we're interested in! 
    filtered = qa4mre_df[qa4mre_df['ANNOT_ID'] == annot_id]
    
    for j,q in enumerate(v['questions']):
        question = q[3:-3]
        
        for k,row in filtered.iterrows():
            if question in row['Q_STR']:
                rows = []
               
                q_id = row['Q_ID']
                
                nonsense = v['annotations'][j]['nonsense']
                sents_indices = v['annotations'][j]['sents_indices']
                n_sents = len(v['annotations'][j]['sents_indices'])
                skill_count = v['annotations'][j]['skill_count']
                skills = v['annotations'][j]['skills']
                
                rows.append(annot_id)
                rows.append(q_id)
                rows.append(nonsense)
                rows.append(sents_indices)
                rows.append(n_sents)
                rows.append(skill_count)
                rows.append(skills)
                rows.append(question)

                csvwriter.writerow(rows)

print("Parsing complete!")
QA_data_annotated.close()

Parsing complete!


### Merge!

In [68]:
qa4mre_annotated_df = pd.read_csv("./annotated-data/qa4mre_annotated.csv") #update as needed

In [69]:
df_dummy = pd.merge(left=qa4mre_df, right=qa4mre_annotated_df, how='left', on=['ANNOT_ID', 'Q_ID'])

#### Save as CSV

In [None]:
## pd.....

Unnamed: 0_level_0,YR,T_ID,T_NAME,R_ID,D_ID,D_STR,D_LEN,Q_STR,Q_LEN,A_ID,A_STR,A_LEN,CORRECT,ANNOT_ID,NONSENSE,SENTS_INDICES,N_SENTS,SKILLS,SKILL_COUNTS,Q_STR_2
Q_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,420,420,420,420,420,420,420,420,420,420,420,420,420,420,150,150,150,150,150,150
2,420,420,420,420,420,420,420,420,420,420,420,420,420,420,150,150,150,150,150,150
3,420,420,420,420,420,420,420,420,420,420,420,420,420,420,150,150,150,150,150,150
4,420,420,420,420,420,420,420,420,420,420,420,420,420,420,150,150,150,150,150,150
5,460,460,460,460,460,460,460,460,460,460,460,460,460,460,200,200,200,200,200,200
6,440,440,440,440,440,440,440,440,440,440,440,440,440,440,175,175,175,175,175,175
7,420,420,420,420,420,420,420,420,420,420,420,420,420,420,150,150,150,150,150,150
8,440,440,440,440,440,440,440,440,440,440,440,440,440,440,175,175,175,175,175,175
9,380,380,380,380,380,380,380,380,380,380,380,380,380,380,100,100,100,100,100,100
10,575,575,575,575,575,575,575,575,575,575,575,575,575,575,325,325,325,325,325,325
