In [86]:
'''
Purpose:
Extract parameters about all MODSIM papers published from 2014 to 2018
Transform parameters within a dataframe
Load the text of the MODSIM papers into a working data set

Inputs:
The inputs for this procedure include:
* A list of MODSIM websites containing the papers for parsing HTML with BeautifulSoup
* Text files containing full text from each MODSIM Papers (2014 to 2018)
  ** Downloaded PDF from MODSIM website using Google Chrono Sniffer extension
  ** Converted to text using Mac OS Automator workflow
  ** Assumes a file structure ./data/<year>/ exists for years 2014-2018

Output:
The results of this module is a folder called ./data/abstracts/ containing .txt files each with a custom label and containing abstracts extracted from MODSIM papers.
'''

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

# Provide list of all sites containing MODSIM Papers 2014-2018
sites = ['http://modsimworld.org/conference-papers/2014',
         'http://modsimworld.org/conference-papers/2015',
         'http://modsimworld.org/conference-papers/2016',
         'http://modsimworld.org/conference-papers/2017',
         'http://modsimworld.org/conference-papers/2018']
# Create an empty list to capture webscrape results 
records = [] 

# Extract: get parameters of papers into a dataframe
for site in sites:
    page = requests.get(site)
    soup = BeautifulSoup(page.text, 'html.parser')
    results = soup.find_all('table') # finds all tables on page, each is a track

    for result in results:
        for r in result.findAll('tr'):
            if r.find('td') is not None:
                track = result.find('th').text
                filename = r.find('a')['href'][13:]
                author = r.find('td').text
                title = r.find('span').text
                year = r.find('a')['href'][8:12]
                records.append((track,filename,author,title,year))

df = pd.DataFrame(records, columns = ['track', 'filename', 'author_id', 'title', 'year'])

# Transform: relabel tracks into one of four categories
df['year'] = df['year'].apply(str)
mapped_sub = {'track': {'Training and Education': 'TE',
                        'Training': 'TE',
                        'Eduction': 'TE',
                        'Analytics and Decision Making': 'AT',
                        'Analytics and Decision-Making': 'AT',
                        'Science and Engineering': 'SE',
                        'Visualization and Gamification': 'VG',
                        'Entertainment, Sports, Media, & Visualization': 'VG',
                        'Cyber Security': 'VG'}
             }
df_mod = df.replace(to_replace=mapped_sub)

# Transform: convert .pdf filenames into .txt
df_mod['filename'] = df_mod['filename'].str.replace('pdf', 'txt')

# Transform: extract first occurence of last name as author tag
# create three individual series for three types of matches
a=df_mod['author_id'].str.extract(r'([a-zA-Z]+,)')
b=df_mod['author_id'].str.extract(r'([a-zA-Z]+ and)')
c=df_mod['author_id'].str.extract(r'([a-zA-Z]+$)')

# combine the three temp series with precedence for already filled rows
temp = a.combine_first(b).combine_first(c)
# remove the separator flags 
temp[0] = temp[0].str.replace(',','') # a series function
temp[0] = temp[0].str.replace('and','')
df_mod['author_id'] = temp[0]

# Create doc labels in track-year-author format  
df_mod['label'] = './data/abstracts/' + df_mod['track'] + '-' + \
                df_mod['year'] + '-' + df_mod['author_id'] + '.txt'
df_mod




Unnamed: 0,track,filename,author_id,title,year,label
0,AT,MS1401-Pratical HSI Methods for Medical Simula...,Bockelman,Practical Human-Systems Integration Methods fo...,2014,./data/abstracts/AT-2014-Bockelman.txt
1,AT,MS1480-Decision Making MODSIM 2014 Paper.txt,Hase,Evidence Based Decision Making: Techniques for...,2014,./data/abstracts/AT-2014-Hase.txt
2,AT,MODSIM2014_MIST_INSPYRED_Paper_Submit_2014_03_...,Barhak,Population Generation from Statistics Using Ge...,2014,./data/abstracts/AT-2014-Barhak .txt
3,AT,SCDT_MODSIM Paper.txt,Allen,Discrete Event Simulation for Supporting Produ...,2014,./data/abstracts/AT-2014-Allen.txt
4,AT,MS1470_MODSIM_2014_Massed Exposure Improves Re...,Lackey,Massed Exposure Improves Response Time for Det...,2014,./data/abstracts/AT-2014-Lackey.txt
5,AT,2013 Virginia Homeland Security Portfolio Valu...,Ezell,2013 Virginia Homeland Security Portfolio Valu...,2014,./data/abstracts/AT-2014-Ezell .txt
6,AT,MODSIM2014_LVCGAED.txt,Brun,The Assisted Experimental Designer: A Decision...,2014,./data/abstracts/AT-2014-Brun.txt
7,AT,SpecializedTankInspectionandOverhaul.txt,Drucker,An Adaptive Planning Tool For Air Craft Carrie...,2014,./data/abstracts/AT-2014-Drucker .txt
8,AT,Equal but Different_20140312_SJL.txt,Lackey,Equal but Different: 5 Research Strategies for...,2014,./data/abstracts/AT-2014-Lackey.txt
9,AT,Final Energy Resilience MODSIM 2014 Paper_14 M...,Delaney,Resilience: Modeling for Conditions of Uncerta...,2014,./data/abstracts/AT-2014-Delaney.txt
