# Preprocessing the abstracts

The first step is to convert the abstracts from Unicode to ASCII.
The character mappings used for this conversion are provided by another project.

In [1]:
from utils import (
    DATALOCATION_WIDGET, 
    get_data_dir,
    load_if_not_present, 
    get_logger, 
    save_df, 
    SESSIONLOCATION_WIDGET
)

from abstract_manipulations import (
    load_unicode_mappings,
    unicode2ascii,
    replace_abstract_abbreviations,
    remove_brackets
)

from constants import (
    TEXT_COLUMNS
)
from tqdm.auto import tqdm

# Enable the `progress_apply` / `progress_applymap` methods used in the cells below.
tqdm.pandas()

# Start with empty placeholders; later cells pass these names to
# `load_if_not_present`, so they must exist even on a fresh kernel.
abstracts_df = unabbreviated_abstracts = None

logger = get_logger("preprocess_abstracts")
logger.info("Loaded modules")

Created logger with name: utils
Created logger with name: preprocess_abstracts
2019-05-28 13:20:29,011 : INFO : Created logger with name: preprocess_abstracts
Loaded modules
2019-05-28 13:20:29,012 : INFO : Loaded modules


## Unicode to ASCII

In [2]:
# Fetch the raw abstracts; judging by the helper's log messages it reads the
# CSV when `abstracts_df` is still None and reuses the frame otherwise.
raw_abstracts_file = "raw_abstracts.csv"
abstracts_df = load_if_not_present(abstracts_df, raw_abstracts_file)

Loaded raw_abstracts.csv from csv.
2019-05-28 13:20:29,372 : DEBUG : Loaded raw_abstracts.csv from csv.


In [3]:
# Read the Unicode-to-ASCII mappings from the project's data directory.
data_dir = get_data_dir()
mappings = load_unicode_mappings(data_dir)

In [7]:
# Peek at the first rows of the raw abstracts (5 is the pandas default).
abstracts_df.head(n=5)

Unnamed: 0,background,objective,methods,conclusions,results,title,journal,pmid,date
0,The TTM suggests that individuals in the actio...,To explore sexual behaviours and condom use an...,"This was a descriptive, cross-sectional design...",Low levels of condom use among Taiwanese colle...,"Of the 279 participants, 57% were sexually act...",Stages of condom use and decisional balance am...,International nursing review,19702809,1268348000.0
1,The most significant prognostic factors for pa...,,A total of 39 consecutive patients underwent a...,"Our surgical technique, tMPDe, is safe and mor...",The tMPDe procedure was performed safely witho...,Appraisal of a total meso-pancreatoduodenum ex...,European journal of surgical oncology : the jo...,22575529,1344809000.0
2,,To investigate the efficacy and safety of the ...,"This was a multi-centre, phase 3, randomized, ...",Linagliptin as add-on therapy to metformin and...,The placebo-corrected adjusted mean (se) chang...,Linagliptin improved glycaemic control without...,Diabetic medicine : a journal of the British D...,24824197,1436479000.0
3,,The study objective is to evaluate the clinica...,This is a retrospective case series of 17 adul...,Aggressive surgical management to achieve clea...,Chart review of the 17 identified subjects rev...,Malignant fibrous histiocytoma of the head and...,American journal of otolaryngology,22999710,1368482000.0
4,,To catalog the side effects of 2.4 atmospheres...,Fifty subjects diagnosed with TBI were randomi...,This study demonstrated no major adverse event...,These side effects were observed as rate (sham...,Hyperbaric side effects in a traumatic brain i...,Undersea & hyperbaric medicine : journal of th...,23342764,1359414000.0


In [11]:
# Convert every text column from Unicode to ASCII and persist the result.
# Work on a copy so `abstracts_df` keeps the raw text for later re-runs.
ascii_abstracts = abstracts_df.copy()

# Read and write through the same label list so the columns that are written
# are exactly the columns that were converted, whatever their position in the
# frame. (A positional slice such as 0:6 silently disagrees with TEXT_COLUMNS
# as soon as the two do not cover the same columns.)
# NOTE(review): str(x) turns a missing value (NaN) into the literal string
# "nan" before conversion -- confirm that downstream steps expect this.
ascii_abstracts.loc[:, TEXT_COLUMNS] = abstracts_df[TEXT_COLUMNS].progress_applymap(
    lambda x: unicode2ascii(mappings, str(x)))

save_df(ascii_abstracts, "ascii_abstracts.csv")

HBox(children=(IntProgress(value=0, max=98931), HTML(value='')))




## Abbreviation resolution

In [12]:
# Resolve abbreviations row by row and persist the result.
# Each row (first six columns only) is handed to `replace_abstract_abbreviations`.
# NOTE(review): the columns are picked by position (0:6) rather than by
# TEXT_COLUMNS -- confirm this is the intended set of columns.
abstract_sections = ascii_abstracts.iloc[:, 0:6]
resolved_sections = abstract_sections.progress_apply(
    func=replace_abstract_abbreviations, axis=1)

# Write the resolved text into a copy so `ascii_abstracts` stays untouched.
unabbreviated_abstracts = ascii_abstracts.copy()
unabbreviated_abstracts.iloc[:, 0:6] = resolved_sections

save_df(unabbreviated_abstracts, "unabbreviated_abstracts.csv")

HBox(children=(IntProgress(value=0, max=14133), HTML(value='')))




In [13]:
# Spot-check the first rows after abbreviation resolution (5 is the pandas default).
unabbreviated_abstracts.head(n=5)

Unnamed: 0,background,objective,methods,conclusions,results,title,journal,pmid,date
0,The Transtheoretical Model suggests that indi...,To explore sexual behaviours and condom use an...,"This was a descriptive, cross-sectional design...",Low levels of condom use among Taiwanese colle...,"Of the 279 participants, 57% were sexually act...",Stages of condom use and decisional balance am...,International nursing review,19702809,1268348000.0
1,The most significant prognostic factors for pa...,,A total of 39 consecutive patients underwent a...,"Our surgical technique, tMPDe, is safe and mor...",The tMPDe procedure was performed safely witho...,Appraisal of a total meso-pancreatoduodenum ex...,European journal of surgical oncology : the jo...,22575529,1344809000.0
2,,To investigate the efficacy and safety of the ...,"This was a multi-centre, phase3, randomized, d...",Linagliptin as add-on therapy to metformin and...,The placebo-corrected adjusted mean (se) chang...,Linagliptin improved glycaemic control without...,Diabetic medicine : a journal of the British D...,24824197,1436479000.0
3,,The study objective is to evaluate the clinica...,This is a retrospective case series of 17 adul...,Aggressive surgical management to achieve clea...,Chart review of the 17 identified subjects rev...,Malignant fibrous histiocytoma of the head and...,American journal of otolaryngology,22999710,1368482000.0
4,,To catalog the side effects of 2.4 atmospheres...,Fifty subjects diagnosed with traumatic brain...,This study demonstrated no major adverse event...,These side effects were observed as rate (sham...,Hyperbaric side effects in a traumatic brain i...,Undersea & hyperbaric medicine : journal of th...,23342764,1359414000.0


## Removing brackets from `unabbreviated_abstracts`

In [15]:
# Strip bracketed text from every text column and persist the result.
# `load_if_not_present` may hand back the very same object it was given (its
# log reports "Reusing previously defined df."), so take a copy first;
# otherwise the assignment below would also rewrite `unabbreviated_abstracts`.
abstracts_noparens = load_if_not_present(
    unabbreviated_abstracts, "unabbreviated_abstracts.csv").copy()

# Read and write through the same label list so the columns that are written
# are exactly the columns that were processed, whatever their position.
abstracts_noparens.loc[:, TEXT_COLUMNS] = (
    abstracts_noparens[TEXT_COLUMNS].progress_applymap(remove_brackets))

save_df(abstracts_noparens, "abstracts_noparens.csv")

Reusing previously defined df.
2019-05-28 13:23:56,550 : DEBUG : Reusing previously defined df.


HBox(children=(IntProgress(value=0, max=98931), HTML(value='')))




In [16]:
# Spot-check the first rows after bracket removal (5 is the pandas default).
abstracts_noparens.head(n=5)

Unnamed: 0,background,objective,methods,conclusions,results,title,journal,pmid,date
0,The Transtheoretical Model suggests that indiv...,To explore sexual behaviours and condom use an...,"This was a descriptive, cross-sectional design...",Low levels of condom use among Taiwanese colle...,"Of the 279 participants, 57% were sexually act...",Stages of condom use and decisional balance am...,International nursing review,19702809,1268348000.0
1,The most significant prognostic factors for pa...,,A total of 39 consecutive patients underwent a...,"Our surgical technique, tMPDe, is safe and mor...",The tMPDe procedure was performed safely witho...,Appraisal of a total meso-pancreatoduodenum ex...,European journal of surgical oncology : the jo...,22575529,1344809000.0
2,,To investigate the efficacy and safety of the ...,"This was a multi-centre, phase3, randomized, d...",Linagliptin as add-on therapy to metformin and...,The placebo-corrected adjusted mean change in ...,Linagliptin improved glycaemic control without...,Diabetic medicine : a journal of the British D...,24824197,1436479000.0
3,,The study objective is to evaluate the clinica...,This is a retrospective case series of 17 adul...,Aggressive surgical management to achieve clea...,Chart review of the 17 identified subjects rev...,Malignant fibrous histiocytoma of the head and...,American journal of otolaryngology,22999710,1368482000.0
4,,To catalog the side effects of 2.4 atmospheres...,Fifty subjects diagnosed with traumatic brain ...,This study demonstrated no major adverse event...,These side effects were observed as rate : ear...,Hyperbaric side effects in a traumatic brain i...,Undersea & hyperbaric medicine : journal of th...,23342764,1359414000.0
