# Evaluate the code to extract questions and Likert scales from PDFs

In [23]:
import os
import pickle as pkl
import re

import sys

sys.path.append("../front_end")

from langdetect import detect

from utils.options_extractor import add_candidate_options
from utils.question_extractor import QuestionExtractor
from utils.question_extractor import process_text, convert_to_dataframe

# Folder holding the preprocessed questionnaires, one pickle per PDF.
# The pickled object is joined with "\n" further down, so it is presumably a
# list of page strings -- TODO confirm against the preprocessing step.
INPUT_FOLDER = "../data/preprocessed_pkl/"
question_extractor = QuestionExtractor()

# Gold-standard annotations are looked up by file name in both folders; a
# questionnaire is only loaded when a matching CSV name exists in one of them.
GOLD_FOLDER = "../data/gold_standard_extraction/"
GOLD_FOLDER_2 = "../front_end/hard_coded_questionnaires/"
# Fix: the second folder used to be ignored because GOLD_FOLDER was listed twice.
gold_files = set(os.listdir(GOLD_FOLDER)) | set(os.listdir(GOLD_FOLDER_2))

OUTPUT_FOLDER = "output_question_extraction/"
# Create the output folder if needed without swallowing unrelated errors.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Maps "<name>.pdf" -> unpickled content of "<name>.pdf.pkl".
file_to_pages = {}
for root, _sub_folders, files in os.walk(INPUT_FOLDER):
    for file_name in files:
        if not file_name.endswith(".pkl"):
            continue
        print(file_name)

        # Strip only the trailing extension; an unescaped "." would match any
        # character and could rewrite the middle of the file name.
        pdf_file = re.sub(r"\.pkl$", "", file_name)
        annotations_file = re.sub(r"\.pdf$", ".csv", pdf_file)
        if annotations_file not in gold_files:
            continue

        # os.walk descends into sub-folders, so the path must be built from
        # `root`, not from INPUT_FOLDER.
        full_file = os.path.join(root, file_name)
        # NOTE(review): pickle.load can execute arbitrary code -- only point
        # INPUT_FOLDER at pickles produced by a trusted preprocessing run.
        with open(full_file, "rb") as f:
            pages = pkl.load(f)
        file_to_pages[pdf_file] = pages

Beck Depression Inventory English.pdf.pkl
CES_D English.pdf.pkl
UCLA Loneliness Scale English.pdf.pkl
APA DSM5 Severity Measure for Depression English.pdf.pkl
Beck Depression Inventory Portuguese.pdf.pkl
De Jong Gierveld Loneliness Scale English.pdf.pkl
GAD-7 English.pdf.pkl
RCADS Child Reported English.pdf.pkl
Adult ADHD Self-Report Scale English.pdf.pkl
SCARED English.pdf.pkl


In [25]:
# Run the extraction pipeline on every loaded questionnaire and write one
# tab-separated line per extracted question: question_no, question, options.
for file, pages in file_to_pages.items():
    print("Processing", file)
    text = "\n".join(pages)

    # NOTE(review): langdetect documents its detection as non-deterministic
    # unless DetectorFactory.seed is set -- consider seeding for reproducible runs.
    language = detect(text)

    doc = process_text(text, language)
    df = convert_to_dataframe(doc)
    df_questions = question_extractor.get_questions(df)

    # The return value is ignored; presumably this fills in the `options`
    # column of df_questions in place -- TODO confirm in utils.options_extractor.
    add_candidate_options(df_questions, doc)

    # Replace only the trailing ".pdf"; the previous unescaped, unanchored
    # pattern matched any character followed by "pdf" anywhere in the name.
    output_path = OUTPUT_FOLDER + re.sub(r"\.pdf$", ".csv", file)
    with open(output_path, "w", encoding="utf-8") as f:
        rows = zip(
            df_questions["question_no"],
            df_questions["question"],
            df_questions["options"],
        )
        for row in rows:
            f.write("\t".join(map(str, row)) + "\n")

Processing Beck Depression Inventory English.pdf
31
40
46
64
83
95
105
118
137
147
161
182
199
214
228
242
256
265
278
290
304
316
326
335
348
359
368
377
387
402
416
429
444
457
475
485
501
513
525
535
561
576
588
604
617
629
645
659
676
690
704
720
735
752
766
784
797
809
824
838
853
864
878
901
925
937
950
961
975
986
1001
1011
1025
1040
1051
1062
1076
1090
1111
1131
1154
1170
1185
1196
Processing UCLA Loneliness Scale English.pdf


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred.rename(columns={"preceding_bullet_value": "question_no"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_questions["options_spans"] = candidate_options_per_question
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_questions["options"] = df_questions["options_spans"].apply(get_correctly_ordered_options_text)
A value is trying to 

Processing APA DSM5 Severity Measure for Depression English.pdf
141
513
526
537
553
565
576
605
623
661
857
Processing Beck Depression Inventory Portuguese.pdf
23
31
41
57
73
93
110
119
134
145
171
182
196
210
229
251
263
273
286
300
314
326
346
352
367
387
402
418
434
442
455
465
472
479
497
511
533
549
563
582
590
604
619
632
643
653
669
694
707
720
726
746
759
773
792
822
835
852
868
885
899
914
933
953
965
983
993
1009
1034
1044
1057
1078
1105
1118
1130
1165
1177
1191
1205
1222
1238
1249
1260
1275
1291
1323
1351
1369
1384
1398
1409


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred.rename(columns={"preceding_bullet_value": "question_no"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_questions["options_spans"] = candidate_options_per_question
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_questions["options"] = df_questions["options_spans"].apply(get_correctly_ordered_options_text)
A value is trying to 

In [26]:
import os, re

In [27]:
def _read_non_empty_lines(path):
    """Return the whitespace-stripped, non-blank lines of a UTF-8 text file.

    NOTE(review): str.strip() also removes leading/trailing tabs, so a row
    whose first field is empty loses a column before the later
    ``split("\\t")[1]`` lookup -- confirm this cannot happen in the data.
    """
    with open(path, "r", encoding="utf-8") as fi:
        stripped_lines = (l.strip() for l in fi)
        return [line for line in stripped_lines if len(line) > 0]


# One (file name, gold lines, predicted lines) tuple per gold-standard file.
# The listing order is intentionally left untouched: later cells inspect the
# variables left over from the last file iterated.
gold_standards_and_outputs = []
for f in os.listdir(GOLD_FOLDER):
    gold_path = os.path.join(GOLD_FOLDER, f)
    output_path = os.path.join(OUTPUT_FOLDER, f)

    # Skip sub-folders and other non-file entries instead of crashing in open().
    if not os.path.isfile(gold_path):
        continue
    # A gold file without a prediction means the extraction was never run for
    # it; report that explicitly and skip it so it is not scored.
    if not os.path.isfile(output_path):
        print("WARNING: no model output found for", f, "- skipping")
        continue

    gold_standard = _read_non_empty_lines(gold_path)
    model_output = _read_non_empty_lines(output_path)

    gold_standards_and_outputs.append((f, gold_standard, model_output))

In [30]:
# Strict scoring: a gold line counts as correct only if the identical full
# line (all tab-separated fields) appears in the model output. "Incorrect" is
# the number of gold lines that are absent from the model output.
print("Correct\tIncorrect\tFile")
for f, gold_standard, model_output in gold_standards_and_outputs:
    gold_lines = set(gold_standard)
    predicted_lines = set(model_output)

    correct = gold_lines & predicted_lines
    incorrect = gold_lines - predicted_lines
    print(len(correct), "\t", len(incorrect), "\t", f)
    

Correct	Incorrect	File
83 	 2 	 Beck Depression Inventory English.csv
0 	 93 	 Beck Depression Inventory Portuguese.csv
0 	 21 	 UCLA Loneliness Scale English.csv
8 	 2 	 APA DSM5 Severity Measure for Depression English.csv


In [31]:
# Lenient scoring: compare only the second tab-separated field of each line
# (the question text) and ignore the other fields.
# `gold_standard` and `model_output` are rebound to sets here and keep the
# values of the last file after the loop ends.
print("Correct\tIncorrect\tFile")
for f, gold_lines, predicted_lines in gold_standards_and_outputs:
    gold_standard = {line.split("\t")[1] for line in gold_lines}
    model_output = {line.split("\t")[1] for line in predicted_lines}

    correct = gold_standard & model_output
    incorrect = gold_standard - model_output
    print(len(correct), "\t", len(incorrect), "\t", f)


Correct	Incorrect	File
83 	 2 	 Beck Depression Inventory English.csv
70 	 23 	 Beck Depression Inventory Portuguese.csv
0 	 21 	 UCLA Loneliness Scale English.csv
8 	 2 	 APA DSM5 Severity Measure for Depression English.csv


In [32]:
# Display `gold_standard` as left by the last iteration of the scoring loop
# that rebinds it (a set of gold question texts for a single file).
gold_standard

{'Feeling bad about yourself—or feeling that you are a failure, or that you have let yourself or your family down?',
 'Feeling down, depressed, irritable, or hopeless?',
 'Feeling tired, or having little energy?',
 'Little interest or pleasure in doing things?',
 'Moving or speaking so slowly that other people could have noticed? Or the opposite—being so fidgety or restless that you were moving around a lot more than usual?',
 'Poor appetite, weight loss, or overeating?',
 'Question',
 'Thoughts that you would be better off dead, or of hurting yourself in some way?',
 'Trouble concentrating on things like school work, reading, or watching TV?',
 'Trouble falling asleep, staying asleep, or sleeping too much?'}

In [33]:
# Display `model_output` as left by the last iteration of the scoring loop
# that rebinds it (a set of predicted question texts for a single file).
model_output

{'Feeling bad about yourself—or feeling that you are a failure, or that you have let yourself or your family down?',
 'Feeling down, depressed, irritable, or hopeless?',
 'Feeling tired, or having little energy?',
 'Little interest or pleasure in doing things?',
 'Moving or speaking so slowly that other people could have noticed?',
 'Nearly every day).',
 'Poor appetite, weight loss, or overeating?',
 'Thoughts that you would be better off dead, or of hurting yourself in some way?',
 'Trouble concentrating on things like school work, reading, or watching TV?',
 'Trouble falling asleep, staying asleep, or sleeping too much?',
 'modified for Adolescents'}

In [34]:
# Predicted entries that do not appear in the gold standard.
for predicted in model_output:
    if predicted in gold_standard:
        continue
    print(predicted)

Moving or speaking so slowly that other people could have noticed?
modified for Adolescents
Nearly every day).


In [35]:
# Gold-standard entries that the model output does not contain.
for expected in gold_standard:
    if expected in model_output:
        continue
    print(expected)

Moving or speaking so slowly that other people could have noticed? Or the opposite—being so fidgety or restless that you were moving around a lot more than usual?
Question
