In [1]:
import re
import numpy as np
import pandas as pd
import os
os.chdir('h:/')
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

In [2]:
#iterate over every pdf in the working directory, extract its text,
#keep it in memory (textdict) and dump it next to the pdf as a .txt file
textdict={}
for i in os.listdir():
    #test the real extension instead of a substring so names such as
    #'report.pdf.bak' are not handed to the pdf parser
    #the comparison is case-insensitive, so '.PDF' files are picked up too
    stem,extension=os.path.splitext(i)
    if extension.lower()!='.pdf':
        continue

    #convert pdf to text by iterating pages
    #every page is rendered into the same in-memory buffer
    output_string = StringIO()
    with open(i, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    #store the text under the original file name and write it to disk
    #only the trailing extension is swapped for .txt
    #the context manager closes the file even if the write fails
    textdict[i]=output_string.getvalue()
    with open(stem+'.txt','w',encoding='utf_8_sig') as f:
        f.write(textdict[i])

#the rest of the notebook only works on this one document
#a KeyError here means 'sitelist.pdf' is not in the working directory
raw=textdict['sitelist.pdf']

In [3]:
#use regex to find coordinates
#each pattern matches "<deg>°<min>'<N|S> <deg>°<min>'<E|W> " including
#the trailing space, one pattern per hemisphere combination
#raw strings are required so that \d reaches the regex engine untouched
#in a normal string \d is an invalid escape sequence and triggers a warning
s1=re.findall(r"\d+°\d+'N \d+°\d+'W ",raw)
s2=re.findall(r"\d+°\d+'S \d+°\d+'W ",raw)
s3=re.findall(r"\d+°\d+'N \d+°\d+'E ",raw)
s4=re.findall(r"\d+°\d+'S \d+°\d+'E ",raw)

#matches are grouped by hemisphere pair (NW, SW, NE, SE),
#not kept in document order
coordinates=s1+s2+s3+s4

In [4]:
#separate lat and lon
#every entry of coordinates is "<lat> <lon> " so the first two
#space separated tokens are latitude and longitude
lat=[i.split(' ')[0] for i in coordinates]
lon=[i.split(' ')[1] for i in coordinates]

#get base degree
#whole degrees are the digits in front of the degree symbol
lat_base=[int(i.split('°')[0]) for i in lat]
lon_base=[int(i.split('°')[0]) for i in lon]

#get sign based on north south east west
#the last character is the hemisphere letter
#north/east become '1' and south/west become '-1' (still strings here)
lat_sign=[i[-1].replace('N','1').replace('S','-1') for i in lat]
lon_sign=[i[-1].replace('E','1').replace('W','-1') for i in lon]

#convert minute to decimal degree
#minutes are all the digits between the degree symbol and the apostrophe
#splitting on the apostrophe instead of slicing two characters keeps
#single digit minutes such as 12°5'N from raising ValueError
#and avoids truncating longer minute fields
lat_dec=[int(i.split('°')[-1].split("'")[0])/60 for i in lat]
lon_dec=[int(i.split('°')[-1].split("'")[0])/60 for i in lon]

In [5]:
#combine the pieces into signed decimal degrees
#the sign lists hold the strings '1' and '-1', so cast them to float first
lat_factor=np.array(lat_sign,dtype=float)
lon_factor=np.array(lon_sign,dtype=float)

#magnitude is whole degrees plus the minute fraction
#the sign is applied last so values below one degree keep their hemisphere
lats=(np.asarray(lat_base)+np.asarray(lat_dec))*lat_factor
lons=(np.asarray(lon_base)+np.asarray(lon_dec))*lon_factor

In [6]:
#build the output table in one go, one row per extracted coordinate
#identifier is a running number in extraction order
#feature and taille are the same constant for every row
df=pd.DataFrame({
    'identifier':range(len(lats)),
    'latitude':lats,
    'longitude':lons,
    'feature':'FEATURE_MARSH',
    'taille':3,
})

In [7]:
#thin the data out: keep one site per rounded (lat, lon) pair
#lat/lon are temporary grid keys obtained by rounding to whole degrees
gridded=df.assign(lat=df['latitude'].map(round),
                  lon=df['longitude'].map(round))

#sorting first makes the kept row deterministic:
#the smallest latitude, then the smallest longitude, in each cell
gridded=gridded.sort_values(['latitude','longitude'])
df=gridded.drop_duplicates(subset=['lat','lon']).drop(columns=['lat','lon'])

#export without the pandas index column
df.to_csv('marsh.csv',index=False)