In [35]:
import re
import pdfplumber
import numpy as np
import pandas as pd
from collections import namedtuple
import textract as tx
import os

In [4]:
Line = (namedtuple('Line', 'Main_title Keys Values'))
main = re.compile(r'(\d) (.*)')
key_value = re.compile(r'(.*):[\n|\s](.*)')

In [52]:
lines = []
directory = './data'
for i, filename in enumerate(os.listdir(directory)):
        if filename.endswith(".pdf"):
            file = os.path.join(directory, filename)
            text = tx.process(file)
            text = text.decode('utf-8')
            text = text.replace('· ', '').replace('*  ', '').split('\n')

In [54]:
text

['Safety data sheet',
 'Page: 1/14',
 'BASF Safety data sheet according to UN GHS 4th rev.',
 'Date / Revised: 21.11.2017',
 'Product: Tinuvin® 1130',
 'Version: 3.0',
 '(ID no. 30080322/SDS_GEN_00/EN)',
 'Date of print 22.11.2017',
 '1. Identification',
 'Product identifier',
 'Tinuvin® 1130',
 'Chemical name: reaction mass of alpha-3-(3-(2H-benzotriazol-2- yl)-5-tert-butyl-4hydroxyphenyl)propionyl- omega-hydroxypoly(oxyethylene) and alpha-3-(3- (2H-benzotriazol-2-yl)-5tert-butyl-4-hydroxyphenyl)propionyl-omega-3-(3-(2H-benzotriazol-2-yl)-5-tert-butyl-4hydroxyphenyl)propionyloxypoly(oxyethylene)',
 'INDEX-Number: 607-176-00-3',
 'Relevant identified uses of the substance or mixture and uses advised against',
 'Relevant identified uses: stabilizer',
 'Details of the supplier of the safety data sheet',
 'Company:',
 'BASF SE',
 '67056 Ludwigshafen',
 'GERMANY',
 'Regional Business Unit Dispersions and',
 'Resins Europe',
 'Telephone: +49 621 60-90799',
 'E-mail address: ed-psr@basf.com'

In [53]:
while '' in text:
    text.remove('')

index = np.arange(0,len(text))
df_text = pd.DataFrame(text, index=[index], columns=['sentence'])
df_text

Unnamed: 0,sentence
0,Safety data sheet
1,Page: 1/14
2,BASF Safety data sheet according to UN GHS 4th...
3,Date / Revised: 21.11.2017
4,Product: Tinuvin® 1130
...,...
633,corresponding contractual quality of the subst...
634,responsibility of the recipient of the product...
635,legislation are observed.
636,Vertical lines in the left hand margin indicat...


In [42]:
def check_if_title(sentence):
    sentence = str(sentence)
    word_list = ['identification', 'information', 'measures', 'handling', 'properties', 'stability', 'considerations', 'exposure']
    
    starts_with_section = False
    starts_with_number = False
    if len(sentence.split()) > 1:
        starts_with_section = sentence.split()[0] == 'section'
        starts_with_number = sentence[0].isdigit() and not any(char.isdigit() for char in sentence[2:])
    
    
    contains_title_word = [ele for ele in word_list if(ele in sentence)] 
    
    if (starts_with_number or starts_with_section) and contains_title_word: 
        return 1
    else: 
        return 0

In [43]:
df_text["sentence"] = df_text["sentence"].str.lower()
df_text['title'] = df_text['sentence'].map(lambda a: check_if_title(a))

In [44]:
df_text

Unnamed: 0,sentence,title
0,safety data sheet,0
1,page: 1/14,0
2,basf safety data sheet according to un ghs 4th...,0
3,date / revised: 21.11.2017,0
4,product: tinuvin® 1130,0
...,...,...
633,corresponding contractual quality of the subst...,0
634,responsibility of the recipient of the product...,0
635,legislation are observed.,0
636,vertical lines in the left hand margin indicat...,0


In [45]:
def dict_titles_with_values(data):
    lst = {}
    current_title = ''
    for i,j in zip(data['title'], data['sentence']):
        if i == 1:
            current_title = j
            lst[j] = []
        else:
            if current_title != '':
                lst[current_title].append(j)
    return lst

In [48]:
dict_titles = dict_titles_with_values(df_text)

In [51]:
dataframe = pd.DataFrame({"Main_title": dict_titles.keys(), "Corpus": dict_titles.values()})
dataframe

Unnamed: 0,Main_title,Corpus
0,1. identification,"[product identifier, tinuvin® 1130, chemical n..."
1,2. hazards identification,"[classification of the substance or mixture, ..."
2,3. composition/information on ingredients,"[substances, chemical nature, light stabilizer..."
3,4. first-aid measures,"[description of first aid measures, immediatel..."
4,5. fire-fighting measures,"[extinguishing media, suitable extinguishing m..."
5,6. accidental release measures,"[personal precautions, protective equipment an..."
6,7. handling and storage,"[precautions for safe handling, no special mea..."
7,8. exposure controls/personal protection,"[control parameters, components with occupatio..."
8,9. physical and chemical properties,[information on basic physical and chemical pr...
9,10. stability and reactivity,"[reactivity, no hazardous reactions if stored ..."


In [None]:
            for line in text:
                if main.search(line):
                    num, Main_title = main.search(line).group(1), main.search(line).group(2)
                if key_value.search(line):
                    Keys,Values = key_value.search(line).group(1), key_value.search(line).group(2)
                    lines.append(Line(Main_title,Keys,Values))

In [14]:
#lines

In [25]:
df = pd.DataFrame(lines)

In [26]:
df

Unnamed: 0,Main_title,Keys,Values
0,Identification,Trade name,10N Sodium Hydroxide (NaOH 40%)
1,Identification,Product Number,NGT-10N NaOH
2,Laboratory chemicals,Application of the substance / the mixture,Laboratory chemicals
3,or 1-707-820-4080 for product information,PERS Emergency Response,"Domestic and Canada - 1-800-633-8253, Internat..."
4,H318 Causes serious eye damage.,Signal word,Danger
...,...,...,...
1564,02 08¤ other still bottoms and reaction residues,Transport hazard class(es),"9, EHSM"
1565,02 08¤ other still bottoms and reaction residues,Page,14/14
1566,02 08¤ other still bottoms and reaction residues,Date / Revised,21.11.2017
1567,02 08¤ other still bottoms and reaction residues,Product,Tinuvin® 1130


In [29]:
d = {k: f.groupby('Keys')['Values'].apply(list).to_dict()
     for k, f in df.groupby('Main_title')}

In [31]:
d

{'"Exposure Controls /': {'IDH number': ['701990'],
  'Product name': ['DB EPOXY Pot CMPD E-60NC 50ML']},
 '%': {'': ['> 94 %',
   '100 %',
   'Gas/vapour heavier than air at 20°C. Clear. Volatile.'],
  'Issued on': ['07/11/2011'],
  'Product code': ['051000'],
  'SECTION 10': ['Stability and reactivity'],
  'Violent to explosive reaction with (strong) oxidizers. Prolonged storage/in large quantities': ['may form peroxides.']},
 '% - < 3 %': {'CAS Number': ['84268-33-7'],
  'Content (W/W)': ['>= 1 % - < 3 %'],
  'EC-Number': ['400-820-2']},
 '% - < 50 % Skin Sens. 1A': {'CAS Number': ['104810-47-1'],
  'Content (W/W)': ['>= 25 % - < 50 % Skin Sens. 1A']},
 '% - < 75 % Skin Sens. 1A': {'\x0cPage': ['4/14'],
  'CAS Number': ['104810-48-2'],
  'Date / Revised': ['21.11.2017'],
  'Poly(oxy-1,2-ethanediyl), .alpha.-[3-[3-(2H-benzotriazol-2-yl)-5- (1,1-dimethylethyl)-4-hydroxyphenyl]1-oxopropyl]-.omega.-hydroxyContent (W/W)': ['>= 25 % - < 75 % Skin Sens. 1A'],
  'Product': ['Tinuvin® 1130']