# Random Selection of Sentences and Paragraphs
Data Preparation for Pipelines 3 & 4



In [None]:
import os
import pandas as pd
from ast import literal_eval

In [None]:
# Mount Google Drive in the Colab runtime; the data paths used below live under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Data

In [None]:
# folder on the mounted Drive that holds the input train/test tsv files
data_path = '/content/drive/MyDrive/Text-Mining/Data/before_shortening/'

In [None]:
# read the tab-separated train and test files from data_path into dataframes
train_data, test_data = (
    pd.read_csv(os.path.join(data_path, file_name), sep='\t')
    for file_name in ('train_with_rouge_sent.tsv', 'test_with_rouge_sent.tsv')
)

In [None]:
# inspect test and train data: print both shapes, then preview the test frame
print("Shape of train data: ", train_data.shape)
print("Shape of test data: ", test_data.shape)
print("Head of test data:")
test_data.head(3) # first 3 of the test rows (the test set has 150 rows in total)

Shape of train data:  (10148, 11)
Shape of test data:  (150, 11)
Head of test data:


Unnamed: 0.1,Unnamed: 0,id,title,abstract,body,highlights,key_phrases,part_of_section,rouge_scores_sentences,sentence_lengths,sentences
0,0,S0003687013000549,Wrist posture affects hand and forearm muscle ...,,Non-neutral wrist posture is a risk factor of ...,We quantified the effect of four wrist posture...,Biomechanical model\nMuscle stress\nOptimizati...,"['@&#ABSTRACT@&#', '@&#ABSTRACT@&#', '@&#ABSTR...","[0.019940179264201417, 0.019940179225195802, 0...","[15, 15, 28, 13, 27, 31, 27, 21, 17, 23, 20, 1...",['Non-neutral wrist posture is a risk factor o...
1,1,S0003687013000550,Variation in work tasks in relation to pinch g...,,Objectives We aimed to investigate the relatio...,The relationship of task variation during dent...,Pinch grip strength\nSymptomatic hand OA\nDent...,"['@&#ABSTRACT@&#', '@&#ABSTRACT@&#', '@&#ABSTR...","[0.034782608298088644, 0.03478260832593993, 0....","[17, 18, 14, 54, 14, 17, 9, 18, 75, 21, 6, 17,...",['Objectives We aimed to investigate the relat...
2,2,S0003687013000562,The development of guidelines for the design a...,,We report a study which aimed to provide furth...,We carried out an evaluation of a set of pilot...,Warnings\nSigns\nChildren\nSafety\nGuidelines\...,"['@&#ABSTRACT@&#', '@&#ABSTRACT@&#', '@&#ABSTR...","[0.0579268286485444, 0.012480499065179458, 0.0...","[25, 12, 30, 17, 35, 17, 12, 23, 28, 18, 23, 1...",['We report a study which aimed to provide fur...


## Random extraction of sentences
Pipeline 4:
- Random extraction of sentences so that the length will be approx. 3072 tokens
- train/test split
- Finetune Bigbird Pegasus with randomly extracted sentences
- Apply the fine-tuned BigBird Pegasus --> abstractive summary

TODO:
- Take the following columns from the original data files: index, id, title, abstract, body, highlights, key-phrases
- Generate a list of random sentences from the body
- Add the column 'shortened articles'

In [None]:
# build the working dataframes for the sentence pipeline: an independent copy
# of only the columns that are needed downstream
train_shortened_sent, test_shortened_sent = (
    frame.loc[:, ['id', 'title', 'abstract', 'body', 'highlights', 'key_phrases']].copy()
    for frame in (train_data, test_data)
)
print("Shape of new train dataframe: ", train_shortened_sent.shape)
print("Shape of new test dataframe: ", test_shortened_sent.shape)
test_shortened_sent.head(3)

Shape of new train dataframe:  (10148, 6)
Shape of new test dataframe:  (150, 6)


Unnamed: 0,id,title,abstract,body,highlights,key_phrases
0,S0003687013000549,Wrist posture affects hand and forearm muscle ...,,Non-neutral wrist posture is a risk factor of ...,We quantified the effect of four wrist posture...,Biomechanical model\nMuscle stress\nOptimizati...
1,S0003687013000550,Variation in work tasks in relation to pinch g...,,Objectives We aimed to investigate the relatio...,The relationship of task variation during dent...,Pinch grip strength\nSymptomatic hand OA\nDent...
2,S0003687013000562,The development of guidelines for the design a...,,We report a study which aimed to provide furth...,We carried out an evaluation of a set of pilot...,Warnings\nSigns\nChildren\nSafety\nGuidelines\...


In [None]:
import nltk
# Sentence tokenizer data for sent_tokenize. 'punkt' serves older NLTK releases;
# newer releases load the tokenizer from 'punkt_tab' and raise a LookupError
# if only 'punkt' is present, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk import tokenize
import random
import re

In [None]:
def random_sent(text, max_tokens=3300):
  """Return a random selection of sentences from an article body as one string.

  The body is split into sentences with NLTK, the sentences are shuffled using
  the global ``random`` state, and sentences are taken in that order while the
  approximate token count of everything selected so far is at most
  ``max_tokens``. The budget is checked before a sentence is added, so the
  result can exceed it by up to one sentence.

  Args:
    text: Body of an article (a string).
    max_tokens: Approximate token budget, counted with a cheap regex word
      tokenizer. Defaults to 3300.

  Returns:
    The selected sentences joined by single spaces.
  """
  # 'manual' regex tokenizer used as an approximation (nltk word_tokenize was too slow).
  # Raw string so that the regex escapes are not treated as string escapes.
  token_pattern = re.compile(r"[A-Z\-']{2,}(?![a-z])|[A-Z\-'][a-z\-']+(?=[A-Z])|['\w\-]+")

  # split body into sentences and shuffle them
  sent_list = tokenize.sent_tokenize(text)
  random.shuffle(sent_list)

  random_sents = []
  token_count = 0  # tokens in the sentences selected so far

  # A token never spans the space that joins two sentences, so summing the
  # per-sentence counts equals tokenizing the joined selection, without
  # re-tokenizing everything on each iteration.
  for sent in sent_list:
    if token_count > max_tokens:
      # budget exceeded: nothing further can be added, stop early
      break
    random_sents.append(sent)
    token_count += len(token_pattern.findall(sent))

  return ' '.join(random_sents)

Note: I'm using a 'manual' regex tokenizer in the for loop because the nltk word tokenizer takes way too long. The regex count is only an approximation, so the resulting sequences will be a bit too long and BigBird will truncate them, but we still have the desired effect of the shuffled sentences.

In [None]:
# apply the function to each row of the dataframe and append a new column with
# the shortened text
random.seed(42)  # fixed seed so the random selection is the same on every re-run
train_shortened_sent['shortened_articles'] = train_shortened_sent['body'].apply(random_sent)

In [None]:
# same for the test dataframe; seeded so the random selection is the same on every re-run
random.seed(42)
test_shortened_sent['shortened_articles'] = test_shortened_sent['body'].apply(random_sent)

In [None]:
# sanity check: both frames should now have one additional column ('shortened_articles')
print("Shape of new train dataframe: ", train_shortened_sent.shape)
print("Shape of new test dataframe: ", test_shortened_sent.shape)
test_shortened_sent.head(3)

Shape of new train dataframe:  (10148, 7)
Shape of new test dataframe:  (150, 7)


Unnamed: 0,id,title,abstract,body,highlights,key_phrases,shortened_articles
0,S0003687013000549,Wrist posture affects hand and forearm muscle ...,,Non-neutral wrist posture is a risk factor of ...,We quantified the effect of four wrist posture...,Biomechanical model\nMuscle stress\nOptimizati...,The activity patterns of the intrinsic muscles...
1,S0003687013000550,Variation in work tasks in relation to pinch g...,,Objectives We aimed to investigate the relatio...,The relationship of task variation during dent...,Pinch grip strength\nSymptomatic hand OA\nDent...,OA-related impairment in grip force production...
2,S0003687013000562,The development of guidelines for the design a...,,We report a study which aimed to provide furth...,We carried out an evaluation of a set of pilot...,Warnings\nSigns\nChildren\nSafety\nGuidelines\...,The aim of the current study is to firm up the...


In [None]:
# path to save file
shortened_sent_path = '/content/drive/MyDrive/Text-Mining/Data/sentence_selection_random/'
# create the folder up front: to_csv does not create missing directories and
# would otherwise fail only after the long-running selection step
os.makedirs(shortened_sent_path, exist_ok=True)

In [None]:
# saving as tsv files
# NOTE(review): without index=False the dataframe index is written as an extra
# unnamed first column — confirm that downstream readers expect it
train_shortened_sent.to_csv(os.path.join(shortened_sent_path, 'train_with_random_sent_sel_3072.tsv'), sep="\t")

In [None]:
# write the test frame (including the index column) as a tab-separated file
test_shortened_sent.to_csv(os.path.join(shortened_sent_path, 'test_with_random_sent_sel_3072.tsv'), sep="\t")

## Random extraction of paragraphs
Pipeline 3:
- Random extraction of paragraphs so that the length will be approx. 3072 tokens
- train/test split
- Finetune Bigbird Pegasus with randomly extracted paragraphs
- Apply the fine-tuned BigBird Pegasus --> abstractive summary

TODO:
- Take the following columns from the original data files: index, id, title, abstract, body, highlights, key-phrases
- Generate a list of random paragraphs from the body
- Add the column 'shortened articles'

In [None]:
# build the working dataframes for the paragraph pipeline: an independent copy
# of only the columns that are needed downstream
train_shortened_par, test_shortened_par = (
    frame.loc[:, ['id', 'title', 'abstract', 'body', 'highlights', 'key_phrases']].copy()
    for frame in (train_data, test_data)
)
print("Shape of new train dataframe: ", train_shortened_par.shape)
print("Shape of new test dataframe: ", test_shortened_par.shape)
test_shortened_par.head(3)

Shape of new train dataframe:  (10148, 6)
Shape of new test dataframe:  (150, 6)


Unnamed: 0,id,title,abstract,body,highlights,key_phrases
0,S0003687013000549,Wrist posture affects hand and forearm muscle ...,,Non-neutral wrist posture is a risk factor of ...,We quantified the effect of four wrist posture...,Biomechanical model\nMuscle stress\nOptimizati...
1,S0003687013000550,Variation in work tasks in relation to pinch g...,,Objectives We aimed to investigate the relatio...,The relationship of task variation during dent...,Pinch grip strength\nSymptomatic hand OA\nDent...
2,S0003687013000562,The development of guidelines for the design a...,,We report a study which aimed to provide furth...,We carried out an evaluation of a set of pilot...,Warnings\nSigns\nChildren\nSafety\nGuidelines\...


In [None]:
import re

In [None]:
def random_paragraph(text, max_tokens=3300):
  """Return a random selection of paragraphs from an article body as one string.

  The body is split into paragraphs on newlines, the paragraphs are shuffled
  using the global ``random`` state, and paragraphs are taken in that order
  while the approximate token count of everything selected so far is at most
  ``max_tokens``. The budget is checked before a paragraph is added, so the
  result can exceed it by up to one paragraph.

  Args:
    text: Body of an article (a string with newline-separated paragraphs).
    max_tokens: Approximate token budget, counted with a cheap regex word
      tokenizer. Defaults to 3300.

  Returns:
    The selected paragraphs joined by newlines.
  """
  # 'manual' regex tokenizer used as an approximation (nltk word_tokenize was too slow).
  # Raw string so that the regex escapes are not treated as string escapes.
  token_pattern = re.compile(r"[A-Z\-']{2,}(?![a-z])|[A-Z\-'][a-z\-']+(?=[A-Z])|['\w\-]+")

  # split body into paragraphs and shuffle them
  paragraph_list = text.split('\n')
  random.shuffle(paragraph_list)

  random_paragraphs = []
  token_count = 0  # tokens in the paragraphs selected so far

  # A token never spans a paragraph boundary, so summing the per-paragraph
  # counts equals tokenizing the whole selection, without re-tokenizing
  # everything on each iteration.
  for par in paragraph_list:
    if token_count > max_tokens:
      # budget exceeded: nothing further can be added, stop early
      break
    random_paragraphs.append(par)
    token_count += len(token_pattern.findall(par))

  return '\n'.join(random_paragraphs)


Note: Like above, the regex tokenizer only approximates the token count, so the resulting strings are slightly too long and will be truncated by BigBird Pegasus, but we still have the desired effect of the shuffled paragraphs.

In [None]:
# apply the function to each row of the dataframe and append a new column with
# the shortened text
random.seed(42)  # fixed seed so the random selection is the same on every re-run
test_shortened_par['shortened_articles'] = test_shortened_par['body'].apply(random_paragraph)

In [None]:
# same for the train dataframe; seeded so the random selection is the same on every re-run
random.seed(42)
train_shortened_par['shortened_articles'] = train_shortened_par['body'].apply(random_paragraph)

In [None]:
# sanity check: both frames should now have one additional column ('shortened_articles')
print("Shape of new train dataframe: ", train_shortened_par.shape)
print("Shape of new test dataframe: ", test_shortened_par.shape)
test_shortened_par.head(3)

Shape of new train dataframe:  (10148, 7)
Shape of new test dataframe:  (150, 7)


Unnamed: 0,id,title,abstract,body,highlights,key_phrases,shortened_articles
0,S0003687013000549,Wrist posture affects hand and forearm muscle ...,,Non-neutral wrist posture is a risk factor of ...,We quantified the effect of four wrist posture...,Biomechanical model\nMuscle stress\nOptimizati...,Other studies measured muscle loading using el...
1,S0003687013000550,Variation in work tasks in relation to pinch g...,,Objectives We aimed to investigate the relatio...,The relationship of task variation during dent...,Pinch grip strength\nSymptomatic hand OA\nDent...,Chronic diseases affecting the hand may also l...
2,S0003687013000562,The development of guidelines for the design a...,,We report a study which aimed to provide furth...,We carried out an evaluation of a set of pilot...,Warnings\nSigns\nChildren\nSafety\nGuidelines\...,A final area for future work is the need to vi...


In [None]:
# path to save files
shortened_paragraph_path = '/content/drive/MyDrive/Text-Mining/Data/paragraph_selection_random/'
# create the folder up front: to_csv does not create missing directories and
# would otherwise fail only after the long-running selection step
os.makedirs(shortened_paragraph_path, exist_ok=True)

In [None]:
# saving as tsv files
# NOTE(review): without index=False the dataframe index is written as an extra
# unnamed first column — confirm that downstream readers expect it
test_shortened_par.to_csv(os.path.join(shortened_paragraph_path, 'test_with_random_paragraph_sel_3072.tsv'), sep="\t")

In [None]:
# write the train frame (including the index column) as a tab-separated file
train_shortened_par.to_csv(os.path.join(shortened_paragraph_path, 'train_with_random_paragraph_sel_3072.tsv'), sep="\t")