# Extractive Summarization of Research Papers
Team Members:<br>
<ul>
    <li>Janani Arunachalam</li>
    <li>Kevin Thomas</li>
</ul>

In [0]:
# --- Mounting drive --- 


from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# --- tqdm version 4.36.1 is required --- 


!pip install tqdm==4.36.1

Collecting tqdm==4.36.1
[?25l  Downloading https://files.pythonhosted.org/packages/e1/c1/bc1dba38b48f4ae3c4428aea669c5e27bd5a7642a74c8348451e0bd8ff86/tqdm-4.36.1-py2.py3-none-any.whl (52kB)
[K     |██████▏                         | 10kB 16.9MB/s eta 0:00:01[K     |████████████▍                   | 20kB 3.2MB/s eta 0:00:01[K     |██████████████████▋             | 30kB 4.6MB/s eta 0:00:01[K     |████████████████████████▉       | 40kB 3.0MB/s eta 0:00:01[K     |███████████████████████████████ | 51kB 3.6MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.9MB/s 
[?25hInstalling collected packages: tqdm
  Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.36.1


In [0]:
!pip install rogue

Collecting rogue
  Downloading https://files.pythonhosted.org/packages/c9/a9/02b89c5a14497a8d24f73bb2a34532a5067b4951c0aaaefead57129e915a/rogue-0.0.2.tar.gz
Building wheels for collected packages: rogue
  Building wheel for rogue (setup.py) ... [?25l[?25hdone
  Created wheel for rogue: filename=rogue-0.0.2-cp36-none-any.whl size=7213 sha256=d746f6a42a6a07354c90ae4b661e2362663fe646cf2a453356fdbf71b6fd82e2
  Stored in directory: /root/.cache/pip/wheels/02/39/20/d62d788c77e226459e17e14df0d04aa5e48ef87d1e05cd06e5
Successfully built rogue
Installing collected packages: rogue
Successfully installed rogue-0.0.2


In [0]:
# --- Import essential packages --- 


import numpy as np
import pandas as pd
import os
import pickle
import re
from sklearn.datasets import load_files
import glob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tqdm.auto import tqdm

tqdm.pandas()

In [0]:
# --- Implementing pretrained word embeddings --- 



from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec


glove_file = datapath(os.path.abspath('/content/drive/My Drive/NLP Project/Project Final/glove.6B.50d.txt'))
tmp_file = get_tmpfile(os.path.abspath("test_word2vec.txt"))
converted_file = glove2word2vec(glove_file, tmp_file) 


# Source: https://radimrehurek.com/gensim/scripts/glove2word2vec.html

In [0]:
# --- Loading the Glove embeddings in word2vec format ---


glove_model = KeyedVectors.load_word2vec_format(os.path.abspath("test_word2vec.txt"))

In [0]:
glove_model["the"]  # Checking

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [0]:
# --- Stop word removal function --- 


from spacy.lang.en.stop_words import STOP_WORDS

def stop_word_remove(sentence):
    temp = [token for token in sentence.split() if token not in STOP_WORDS]
    return ' '.join(word for word in temp)

In [0]:
# --- Function to read the papers from their paths --- 


def read_paper(path):
  f = open(path, 'r', encoding="utf-8")
  text = str(f.read())
  f.close()
  return text

In [0]:
# --- Function to preprocess the papers --- 


def process_paper(text):

  # Removes unwanted characters, accounting for unicode characters
  text = re.sub("@&#", " ", text)
  text = re.sub("\n", " ", text)
  text = (text.encode('ascii', 'ignore')).decode("utf-8")

  # Extracting the highlights, body from the paper
  highlights = re.findall(r'HIGHLIGHTS(.*?)KEYPHRASES', text,  flags = re.I)[0]
  body_main = re.findall(r'.*(?:abstract)(.*?)', text, flags=re.I)[0]

  # Making a copy of the body, lowercasing body text, removing punctuations & extra spaces
  dummy_body = body_main.lower()
  dummy_body = re.sub('[^\w\s\d\.]','',dummy_body)
  dummy_body = ' '.join(dummy_body.split())
  dummy_body = dummy_body.split(".")

  # Removing extra spaces from the body text, which will be preserved to produce summaries
  # And splitting into sentences
  body = ' '.join(body_main.split())
  body = body.split(".")

  # Removing sentences that are too short or too long, as they wouldn't make apt summary text
  for i,x in enumerate(dummy_body):
    if (len(x.split())) < 3 or (len(x.split())) > 15: 
        dummy_body.pop(i)
        body.pop(i)

  # Making a copy of the highlights, lowercasing body text, removing punctuations & extra spaces
  dummy_highlights = highlights.lower()
  dummy_highlights = re.sub('[^\w\s\d]','',dummy_highlights)
  dummy_highlights = ' '.join(dummy_highlights.split())

  # Removing stop words from body & highlights
  body_copy = []
  for x in dummy_body:
    body_copy.append(stop_word_remove(x))
  highlight_copy = []
  for x in dummy_highlights.split():
      highlight_copy.append(stop_word_remove(x))

  
  # Combing all of the highlights into one string    
  highlight_copy = " ".join(sentence for sentence in highlight_copy)
  highlight_copy = " ".join(highlight_copy.split())

  return body_main, body_copy, highlights, highlight_copy

In [0]:
from scipy.special import expit


# Function to calculate sentence Score
def document_score(body_copy, highlight_copy):
  # Getting word vectors for the body
  body_vectors = []
  for sent in body_copy:
      sent_vec = []
      for word in sent.split():
          try:
              sent_vec.append(glove_model[word])
          # If the word vector isn't there in the model
          # then use the vector of the word "Visual"
          except:
              sent_vec.append(glove_model["visual"])
      body_vectors.append(sent_vec)

  # Getting word vectors for the highlights
  highlight_vectors = []
  for word in highlight_copy.split():
      try:
          highlight_vectors.append(glove_model[word])
      except:
          highlight_vectors.append(glove_model["visual"])

  # Finding the rouge score for each sentence by counting the # of common words
  # & dividing by length of sentence
  doc_score = []
  for sent in body_vectors:
      sent_score = 0
      for word in sent:
          for w in highlight_vectors:
              if (word == w).all():
                  sent_score+=1
      try: 
        doc_score.append(expit(sent_score/len(sent)))
      except:
        doc_score.append(0)
  return doc_score

In [0]:
# Import libraries

from gensim.models import doc2vec
from collections import namedtuple


# Function to create document vectors
def create_document_vector(body_main, doc_score):
  # Load data
  doc1 = [body_main]

  # Transforming data
  docs = []
  analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
  for i, d in enumerate(doc1):
      words = d.lower().split()
      tags = [i]
      docs.append(analyzedDocument(words, tags))

  # Training model
  model = doc2vec.Doc2Vec(docs, size = 100, window = 300, min_count = 1, workers = 4)

  # Getting vectors
  doc_vec = model.docvecs[0]

  doc_vectors = []
  for i in range(len(doc_score)):
    doc_vectors.append(doc_vec)
  return doc_vectors

In [0]:
# Function to create sentence vectors
def create_sentence_vectors(body_copy):
  doc2 = body_copy

  # Transforming data 
  docu = []
  analyzed = namedtuple('Analyzed', 'words tags')
  for i, f in enumerate(doc2):
      wor = f.split()
      tags = [i]
      docu.append(analyzed(wor, tags))

  # Training model
  model = doc2vec.Doc2Vec(docu, size = 100, window = 300, min_count = 1, workers = 4)

  # Getting vectors
  sent_vectors = model.docvecs.vectors_docs
  return list(sent_vectors)

In [0]:
# Function to create datasets
def create_data(path):
  text = read_paper(path)
  body_main, body_copy, highlights, highlight_copy = process_paper(text)
  doc_score = document_score(body_copy, highlight_copy)
  doc_vectors = create_document_vector(body_main, doc_score)
  sent_vectors = create_sentence_vectors(body_copy)
  x = np.concatenate([doc_vectors, sent_vectors], axis=1).tolist()
  x = pd.DataFrame(x)
  y = pd.DataFrame(doc_score)
  return x, y

In [0]:
# --- Import Stocastic Gradient Descent Regressor model --- 


from sklearn.linear_model import SGDRegressor

In [0]:
# --- Create a Gaussian Classifier --- 


Model = SGDRegressor()

In [0]:
# --- Suppress all warnings --- 


import warnings
warnings.filterwarnings("ignore")



# --- Creating a list of all file paths & partially fitting the model --- 

paths = glob.glob("/content/drive/My Drive/NLP Project/Project Final/Parsed_Papers/*.txt")
for i,path in enumerate(tqdm(paths[0:20])):
  x, y = create_data(path)
  Model.partial_fit(x,y)

<br>

Testing

In [0]:
# --- Dummy data --- 


t, b = create_data("/content/drive/My Drive/NLP Project/Parsed_Papers/S0003687013000549.txt")



In [0]:
# --- Predicting the top 4 summary sentences --- 

c = Model.predict(t)
lst = pd.Series(c)
i = lst.nlargest(4)
i = i.index.values.tolist()
i # Indices

[10, 157, 105, 130]

In [0]:
#  --- Predicted summary --- 

summary = []

for x in range(4):
    summary.append(body[i[x]])

summary

[' Sustained low-intensity muscle activity has been associated with adducted and extended wrist postures during keyboard intensive tasks (Dennerlein and Johnson, 2006)',
 '5, FDP=1, FDS=0N/cm2) (Brook etal',
 ' MCP adducted 3',
 ' The highest total muscle forces of three intrinsic muscles occurred in the adducted wrist posture']

In [0]:
# --- Highlights - which are gold standard summary of the paper --- 


text = read_paper("/content/drive/My Drive/NLP Project/Parsed_Papers/S0003687013000549.txt")
body_main,_, highlights,_ = process_paper(text)
highlights = " ".join(highlights.split()).split(".")
highlights

['We quantified the effect of four wrist postures during tapping on resulting finger and wrist muscle stress (including both active and passive component)',
 ' Neutral wrist posture was the optimal option among four tested wrist postures when all muscles were considered',
 ' Extensor muscles exhibited higher muscle stresses than flexors',
 ' Wrist extensors stress remained higher than 4',
 '5N/cm and wrist flexor stress remained below 0',
 '5N/cm during tapping',
 '']

<br>
<br>
<br>


Rough Work

In [0]:
f = open("/content/drive/My Drive/NLP Project/Parsed_Papers/S0003687013000549.txt", 'r', encoding="utf-8")
text = str(f.read())
f.close()

In [0]:
text = re.sub("@&#", " ", text)
text = re.sub("\n", " ", text)
text = (text.encode('ascii', 'ignore')).decode("utf-8")
text

" MAIN-TITLE Wrist posture affects hand and forearm muscle stress during tapping   HIGHLIGHTS                                                                                                                                                             We quantified the effect of four wrist postures during tapping on resulting finger and wrist muscle stress (including both active and passive component).                                                                                                            Neutral wrist posture was the optimal option among four tested wrist postures when all muscles were considered.                                                                                                            Extensor muscles exhibited higher muscle stresses than flexors.                                                                                                            Wrist extensors stress remained higher than 4.5N/cm and wrist flexor stress remained below 0.5N/cm 

In [0]:
main_title = re.findall(r'MAIN-TITLE(.*?)HIGHLIGHTS', text, flags = re.I)
main_title

[' Wrist posture affects hand and forearm muscle stress during tapping   ']

In [0]:
highlights = re.findall(r'HIGHLIGHTS(.*?)KEYPHRASES', text,  flags = re.I)[0]
highlights

'                                                                                                                                                             We quantified the effect of four wrist postures during tapping on resulting finger and wrist muscle stress (including both active and passive component).                                                                                                            Neutral wrist posture was the optimal option among four tested wrist postures when all muscles were considered.                                                                                                            Extensor muscles exhibited higher muscle stresses than flexors.                                                                                                            Wrist extensors stress remained higher than 4.5N/cm and wrist flexor stress remained below 0.5N/cm during tapping.                                                                             

In [0]:
keyphrases = re.findall(r'.*keyphrases(.*?)(?:introduction|abstract).*', text, flags = re.I)
keyphrases

['   Biomechanical model  Muscle stress  Optimization  Tapping   ']

In [0]:
body_main = re.findall(r'.*(?:abstract)(.*?)references', text, flags=re.I)[0]
body_main

"                                                      Non-neutral wrist posture is a risk factor of the musculoskeletal disorders among computer users. This study aimed to assess internal loads on hand and forearm musculature while tapping in different wrist postures. Ten healthy subjects tapped on a key switch using their index finger in four wrist postures: straight, ulnar deviated, flexed and extended. Torque at the finger and wrist joints were calculated from measured joint postures and fingertip force. Muscle stresses of the six finger muscles and four wrist muscles that balanced the calculated joint torques were estimated using a musculoskeletal model and optimization algorithm minimizing the squared sum of muscle stress. Non-neutral wrist postures resulted in greater muscle stresses than the neutral (straight) wrist posture, and the stress in the extensor muscles were greater than the flexors in all conditions. Wrist extensors stress remained higher than 4.5N/cm and wrist flexo

In [0]:
dummy_body = body_main.lower()
dummy_body = re.sub('[^\w\s\d\.]','',dummy_body)
dummy_body = ' '.join(dummy_body.split())
dummy_body = dummy_body.split(".")
dummy_body[:5]

['nonneutral wrist posture is a risk factor of the musculoskeletal disorders among computer users',
 ' this study aimed to assess internal loads on hand and forearm musculature while tapping in different wrist postures',
 ' ten healthy subjects tapped on a key switch using their index finger in four wrist postures straight ulnar deviated flexed and extended',
 ' torque at the finger and wrist joints were calculated from measured joint postures and fingertip force',
 ' muscle stresses of the six finger muscles and four wrist muscles that balanced the calculated joint torques were estimated using a musculoskeletal model and optimization algorithm minimizing the squared sum of muscle stress']

In [0]:
body = ' '.join(body_main.split())
body = body.split(".")
body[:5]

# for x in body:
#     if x == " ":
#         body.remove(x)


['Non-neutral wrist posture is a risk factor of the musculoskeletal disorders among computer users',
 ' This study aimed to assess internal loads on hand and forearm musculature while tapping in different wrist postures',
 ' Ten healthy subjects tapped on a key switch using their index finger in four wrist postures: straight, ulnar deviated, flexed and extended',
 ' Torque at the finger and wrist joints were calculated from measured joint postures and fingertip force',
 ' Muscle stresses of the six finger muscles and four wrist muscles that balanced the calculated joint torques were estimated using a musculoskeletal model and optimization algorithm minimizing the squared sum of muscle stress']

In [0]:
for i,x in enumerate(dummy_body):
  if (len(x.split())) < 3 or (len(x.split())) > 15: 
      dummy_body.pop(i)
      body.pop(i)
print(len(dummy_body),len(body))

205 205


In [0]:
dummy_highlights = highlights.lower()
dummy_highlights = re.sub('[^\w\s\d]','',dummy_highlights)
dummy_highlights = ' '.join(dummy_highlights.split())
dummy_highlights

'we quantified the effect of four wrist postures during tapping on resulting finger and wrist muscle stress including both active and passive component neutral wrist posture was the optimal option among four tested wrist postures when all muscles were considered extensor muscles exhibited higher muscle stresses than flexors wrist extensors stress remained higher than 45ncm and wrist flexor stress remained below 05ncm during tapping'

In [0]:
body_copy = []

for x in dummy_body:
    body_copy.append(stop_word_remove(x))
        
body_copy[:5]

['nonneutral wrist posture risk factor musculoskeletal disorders computer users',
 'healthy subjects tapped key switch index finger wrist postures straight ulnar deviated flexed extended',
 'muscle stresses finger muscles wrist muscles balanced calculated joint torques estimated musculoskeletal model optimization algorithm minimizing squared sum muscle stress',
 'wrist extensors stress remained higher 4',
 '5ncm wrist flexor stress remained 0']

In [0]:
highlight_copy = []

for x in dummy_highlights.split():
    highlight_copy.append(stop_word_remove(x))
    
highlight_copy = " ".join(sentence for sentence in highlight_copy)
highlight_copy = " ".join(highlight_copy.split())
highlight_copy

'quantified effect wrist postures tapping resulting finger wrist muscle stress including active passive component neutral wrist posture optimal option tested wrist postures muscles considered extensor muscles exhibited higher muscle stresses flexors wrist extensors stress remained higher 45ncm wrist flexor stress remained 05ncm tapping'

In [0]:
body_vectors = []


for sent in body_copy:
    sent_vec = []
    for word in sent.split():
        try:
            sent_vec.append(glove_model[word])
        except:
            sent_vec.append(glove_model["visual"])
    body_vectors.append(sent_vec)
    
    
body_vectors[0]

[array([ 2.2245e-01,  5.3615e-01, -7.1301e-01,  6.5633e-01, -5.3046e-02,
         3.9580e-01,  5.9489e-01, -1.1054e+00,  1.1245e-01,  1.3316e+00,
         6.5991e-01,  2.2238e-01,  6.3796e-01, -1.2998e-03, -3.7614e-01,
        -4.1864e-01, -5.9582e-01,  2.4691e-01, -3.0146e-01, -6.9246e-01,
         2.6544e-01,  7.7190e-01, -8.4292e-01, -4.6789e-01,  8.7583e-01,
        -1.1244e-01, -1.4159e+00, -1.8024e-01,  9.0787e-02, -6.8217e-01,
         2.7249e+00, -7.7912e-03,  7.1974e-01, -2.1604e+00, -2.6023e-01,
         1.0342e+00,  3.5889e-01, -1.9494e-01, -2.2193e-01, -1.9325e-01,
         8.0300e-01,  5.6181e-01, -2.0182e-01, -6.4084e-02, -3.8681e-01,
        -1.8003e-02,  1.3308e+00,  4.4911e-01,  1.5582e-01,  4.2052e-01],
       dtype=float32),
 array([-0.38893  , -0.56157  , -0.1763   ,  0.12833  , -0.25144  ,
         1.1199   ,  0.20211  ,  0.81796  ,  0.61214  , -0.22101  ,
         0.8987   ,  0.030855 , -1.6053   ,  1.1091   ,  0.44131  ,
         0.63059  , -1.3091   , -1.0743   

In [0]:
highlight_vectors = []


for word in highlight_copy.split():
    try:
        highlight_vectors.append(glove_model[word])
    except:
        highlight_vectors.append(glove_model["visual"])


highlight_vectors

In [0]:
from scipy.special import expit


doc_score = []

for sent in body_vectors:

    sent_score = 0

    for word in sent:
        for w in highlight_vectors:
            if (word == w).all():
                sent_score+=1
    try: 
      doc_score.append(expit(sent_score/len(sent)))
    except:
      doc_score.append(0)

      
y = doc_score
y[:10]

In [0]:
# Import libraries

from gensim.models import doc2vec
from collections import namedtuple

# Load data

doc1 = [body_main]

# Transform data (you can add more data preprocessing steps) 

docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, d in enumerate(doc1):
    words = d.lower().split()
    tags = [i]
    docs.append(analyzedDocument(words, tags))

# Train model (set min_count = 1, if you want the model to work with the provided example data set)

model = doc2vec.Doc2Vec(docs, size = 100, window = 300, min_count = 1, workers = 4)

# Get the vectors

doc_vec = model.docvecs[0]
doc_vec

In [0]:
doc2 = body_copy

# Transform data (you can add more data preprocessing steps) 

docu = []
analyzed = namedtuple('Analyzed', 'words tags')
for i, f in enumerate(doc2):
    wor = f.split()
    tags = [i]
    docu.append(analyzed(wor, tags))

# Train model (set min_count = 1, if you want the model to work with the provided example data set)

model = doc2vec.Doc2Vec(docu, size = 100, window = 300, min_count = 1, workers = 4)

# Get the vectors

sent_vectors = model.docvecs.vectors_docs