In [1]:
import textract as tx
import pandas as pd
import os

In [2]:
def check_if_title(sentence):
    """Return 1 when a line of text looks like a section heading, else 0.

    The input is converted with ``str()`` and split on whitespace. A line
    is reported as a heading only when all of the following hold:

    * it has more than one whitespace-separated token;
    * its first token is ``'section'``, or its first character is a digit
      and no digit occurs from the fifth character onwards;
    * at least one token equals one of the heading keywords exactly;
    * the whole line is at most 50 characters long.

    All comparisons are case-sensitive against lowercase literals, so the
    caller is expected to pass lowercased text.

    Note: ``'section'`` is itself one of the heading keywords, so any line
    whose first token is ``'section'`` satisfies the keyword test
    automatically; only the token-count and length tests filter such lines.
    Keywords are matched against whole tokens, so a token such as
    ``'information.'`` (trailing period) does not match ``'information'``.
    """
    text = str(sentence)
    tokens = text.split()

    # Empty lines and single-word lines are never headings.
    if len(tokens) <= 1:
        return 0

    heading_words = {
        'identification', 'composition', 'ingredients', 'information',
        'measures', 'handling', 'consideration', 'properties', 'stability',
        'considerations', 'exposure', 'section',
    }

    led_by_section = tokens[0] == 'section'
    # Digit-led numbering such as "1", "10." or "4.1." is allowed, but any
    # digit past the first four characters disqualifies the line.
    led_by_number = text[0].isdigit() and not any(ch.isdigit() for ch in text[4:])
    if not (led_by_number or led_by_section):
        return 0

    # Require at least one whole-token keyword hit.
    if heading_words.isdisjoint(tokens):
        return 0

    return 1 if len(text) <= 50 else 0

In [3]:
def df_from_text(path):
    """Extract the text of a document and label each line as heading or not.

    Parameters
    ----------
    path : str
        Path of the file handed to ``textract.process``.

    Returns
    -------
    pandas.DataFrame
        One row per extracted line, with two columns:
        ``sentence`` (the lowercased line) and ``title`` (the result of
        ``check_if_title`` for that line, 1 or 0).
    """
    raw_bytes = tx.process(path)
    text = raw_bytes.decode('utf-8')

    # Normalise CRLF to LF before splitting so the result is one row per
    # line whether the extractor emitted "\r\n" or plain "\n". Splitting on
    # "\r\n" alone collapses LF-only output into a single row.
    lines = text.replace("\r\n", "\n").split("\n")

    df_text = pd.DataFrame({'sentence': lines})
    # check_if_title compares against lowercase literals, so lowercase first.
    df_text["sentence"] = df_text["sentence"].str.lower()
    df_text['title'] = df_text['sentence'].map(check_if_title)
    return df_text

In [4]:
# Example: parse one PDF from ./data into a DataFrame of lines with a title flag.
path = './data/10N_Sodium_Hydroxide_NaOH_40_6_US_EN_sds.pdf'
df_text = df_from_text(path)

In [5]:
# Keep just the rows that were flagged as headings (title == 1) and show them.
is_title = df_text['title'] == 1
only_titles = df_text.loc[is_title]
only_titles

Unnamed: 0,sentence,title
12,1 identification,1
32,2 hazard(s) identification,1
135,3 composition/information on ingredients,1
154,4 first-aid measures,1
192,5 fire-fighting measures,1
203,6 accidental release measures,1
247,7 handling and storage,1
261,8 exposure controls/personal protection,1
322,9 physical and chemical properties,1
435,10 stability and reactivity,1


In [6]:
def make_pdf_dict(directory):
    """Parse every ``.pdf`` file in a directory with ``df_from_text``.

    Parameters
    ----------
    directory : str
        Directory whose entries are listed with ``os.listdir``.

    Returns
    -------
    tuple of (dict, dict)
        ``pdf_dict`` maps an integer key to the DataFrame returned by
        ``df_from_text``; ``key_dict`` maps the same key to the file name.
        The key is the entry's position in the ``os.listdir`` result, so
        keys can have gaps when the directory also holds non-PDF entries.
        ``os.listdir`` returns entries in arbitrary order, so keys are not
        guaranteed to be stable across machines.

    Prints ``"No PDF files found"`` once when no ``.pdf`` file was loaded.
    """
    pdf_dict = {}
    key_dict = {}
    for i, filename in enumerate(os.listdir(directory)):
        if not filename.endswith(".pdf"):
            continue
        pdf_dict[i] = df_from_text(os.path.join(directory, filename))
        key_dict[i] = filename

    # Report once, and only when nothing was loaded, instead of once per
    # non-PDF entry (which also stayed silent for an empty directory).
    if not pdf_dict:
        print("No PDF files found")

    return pdf_dict, key_dict

In [7]:
directory = r'./data'
pdf_dict, key_dict = make_pdf_dict(directory)

# For every parsed PDF, report how many lines were flagged as headings
# and display those rows.
for pdf_key, df in pdf_dict.items():
    pdf_title = key_dict[pdf_key]

    # Keep only the rows flagged as headings.
    only_titles = df.loc[df['title'] == 1]

    print(f'PDF: {pdf_key}; title: {pdf_title}: titles: {only_titles.shape[0]}')
    display(only_titles)


PDF: 0; title: 10N_Sodium_Hydroxide_NaOH_40_6_US_EN_sds.pdf: titles: 16


Unnamed: 0,sentence,title
12,1 identification,1
32,2 hazard(s) identification,1
135,3 composition/information on ingredients,1
154,4 first-aid measures,1
192,5 fire-fighting measures,1
203,6 accidental release measures,1
247,7 handling and storage,1
261,8 exposure controls/personal protection,1
322,9 physical and chemical properties,1
435,10 stability and reactivity,1


PDF: 1; title: 1799516.pdf: titles: 27


Unnamed: 0,sentence,title
6,1. product and company identification,1
32,2. hazards identification,1
88,3. composition / information on ingredients,1
135,5. fire fighting measures,1
173,6. accidental release measures,1
188,7. handling and storage,1
264,9. physical and chemical properties,1
301,10. stability and reactivity,1
367,12. ecological information,1
372,13. disposal considerations,1


PDF: 2; title: 23114.pdf: titles: 16


Unnamed: 0,sentence,title
3,1. product and company identification,1
37,2. hazards identification,1
79,3. composition/information on ingredients,1
101,4. first aid measures,1
113,5. fire-fighting measures,1
161,6. accidental release measures,1
172,7. handling and storage,1
222,8. exposure controls/personal protection,1
260,9. physical and chemical properties,1
337,10. stability and reactivity,1


PDF: 3; title: 580076.pdf: titles: 18


Unnamed: 0,sentence,title
7,1. product and company identification,1
46,2. hazards identification,1
97,3. composition/information on ingredients,1
111,4. first aid measures,1
133,5. fire fighting measures,1
161,6. accidental release measures,1
189,7. handling and storage,1
195,8. exposure controls / personal protection,1
292,9. physical and chemical properties,1
329,10. stability and reactivity,1


PDF: 4; title: 586346.pdf: titles: 17


Unnamed: 0,sentence,title
9,section 1. product and company identification,1
45,section 2. hazards identification,1
120,section 3. composition/information on ingredients,1
137,section 4. first aid measures,1
187,section 5. fire-fighting measures,1
254,section 6. accidental release measures,1
300,section 7. handling and storage,1
320,section 8. exposure controls/personal protection,1
387,section 9. physical and chemical properties,1
466,section 10. stability and reactivity,1


PDF: 5; title: 765-12-8_Sigma-Aldrich.pdf: titles: 16


Unnamed: 0,sentence,title
9,1. product and company identification,1
73,2. hazards identification,1
88,3. composition/information on ingredients,1
112,4. first aid measures,1
141,5. firefighting measures,1
163,6. accidental release measures,1
185,7. handling and storage,1
204,8. exposure controls/personal protection,1
239,9. physical and chemical properties,1
376,10. stability and reactivity,1


PDF: 6; title: Carbowax Sentry PEG 400 NF - Dow - 2015-03-26.pdf: titles: 17


Unnamed: 0,sentence,title
13,1. identification,1
45,2. hazards identification,1
63,3. composition/information on ingredients,1
78,4. first aid measures,1
91,section 11: toxicology information.,1
97,5. firefighting measures,1
129,6. accidental release measures,1
138,7. handling and storage,1
155,8. exposure controls/personal protection,1
205,9. physical and chemical properties,1


PDF: 7; title: Dubei Steriling Alcohol Wipes SDS.pdf: titles: 16


Unnamed: 0,sentence,title
54,1 identification,1
70,2 hazard(s) identification,1
137,3 composition/information on ingredients,1
197,4 first aid measures,1
218,5 fire-fighting measures,1
225,6 accidental release measures,1
237,7 handling and storage,1
252,8 exposure controls/personal protection,1
311,9 physical and chemical properties,1
430,10 stability and reactivity,1


PDF: 8; title: gylcol-ether-dpm-sds.pdf: titles: 17


Unnamed: 0,sentence,title
9,1. identification,1
22,2. hazards identification,1
51,3. composition,1
68,4. first aid measures,1
87,5. fire fighting measures,1
110,6. accidental release measures,1
125,7. handling and storage,1
138,8. exposure controls and personal proection,1
193,9. physical and chemical properties,1
255,10. stability and reactivity,1


PDF: 9; title: Hydrogen_Peroxide_35_7_US_EN_sds.pdf: titles: 16


Unnamed: 0,sentence,title
12,1 identification,1
29,2 hazard(s) identification,1
162,3 composition/information on ingredients,1
203,4 first-aid measures,1
225,5 fire-fighting measures,1
236,6 accidental release measures,1
283,7 handling and storage,1
298,8 exposure controls/personal protection,1
361,9 physical and chemical properties,1
469,10 stability and reactivity,1


PDF: 10; title: LC15750.pdf: titles: 17


Unnamed: 0,sentence,title
11,section 1: identification,1
75,section 2: hazard(s) identification,1
158,section 3: composition/information on ingredients,1
194,section 4: first-aid measures,1
283,section 5: fire-fighting measures,1
328,section 6: accidental release measures,1
395,section 7: handling and storage,1
466,section 8: exposure controls/personal protection,1
542,section 9: physical and chemical properties,1
702,section 10: stability and reactivity,1


PDF: 11; title: NN-dimethylformamide_3_US_EN_sds.pdf: titles: 16


Unnamed: 0,sentence,title
12,1 identification,1
36,2 hazard(s) identification,1
203,3 composition/information on ingredients,1
212,4 first-aid measures,1
233,5 fire-fighting measures,1
263,6 accidental release measures,1
282,7 handling and storage,1
314,8 exposure controls/personal protection,1
383,9 physical and chemical properties,1
483,10 stability and reactivity,1


PDF: 12; title: Omnirad-184.pdf: titles: 16


Unnamed: 0,sentence,title
10,1. identification,1
51,2. hazard(s) identification,1
77,3. composition/information on ingredients,1
99,4. first-aid measures,1
165,5. fire-fighting measures,1
215,6. accidental release measures,1
249,7. handling and storage,1
298,8. exposure controls/personal protection,1
357,9. physical and chemical properties,1
460,10. stability and reactivity,1


PDF: 13; title: PHTHALIC-ANHYDRIDE--ACS-5KG-pdf.pdf: titles: 16


Unnamed: 0,sentence,title
7,1. identification,1
43,2. hazard(s) identification,1
113,3. composition/information on ingredients,1
123,4. first-aid measures,1
161,5. fire-fighting measures,1
210,6. accidental release measures,1
221,7. handling and storage,1
240,8. exposure controls / personal protection,1
294,9. physical and chemical properties,1
350,10. stability and reactivity,1


PDF: 14; title: quilon_h-eu_msds__1594909058.pdf: titles: 0


Unnamed: 0,sentence,title


PDF: 15; title: Savinyl-Orange-RLS.pdf: titles: 22


Unnamed: 0,sentence,title
37,section 2: hazards identification,1
47,section 3: composition/information on ingredients,1
64,section 4: first aid measures,1
65,4.1. description of first aid measures,1
85,section 5: firefighting measures,1
115,section 6: accidental release measures,1
129,section 7: handling and storage,1
130,7.1. precautions for safe handling,1
147,section 8: exposure controls/personal protection,1
175,8.2. exposure controls,1


PDF: 16; title: SDS - Horizon Blue Clarifier.pdf: titles: 15


Unnamed: 0,sentence,title
2,1. identification,1
36,2. hazard(s) identification,1
90,3. composition/information on ingredients,1
107,4. first-aid measures,1
147,5. fire-fighting measures,1
180,6. accidental release measures,1
201,7. handling and storage,1
212,8. exposure controls/personal protection,1
255,9. physical and chemical properties,1
368,10. stability and reactivity,1


PDF: 17; title: Sledgehammer-Heat-Cure-Powder-SDS-US-20150522.pdf: titles: 28


Unnamed: 0,sentence,title
3,section 1. identification,1
38,section 2. hazards identification,1
106,section 2. hazards identification,1
114,section 3. composition/information on ingredients,1
152,section 4. first aid measures,1
211,section 4. first aid measures,1
256,section 5. fire-fighting measures,1
306,section 6. accidental release measures,1
340,section 13 for waste disposal.,1
342,section 7. handling and storage,1


PDF: 18; title: Trifluoroacetic_Acid_TFA_7_US_EN_sds.pdf: titles: 16


Unnamed: 0,sentence,title
12,1 identification,1
38,2 hazard(s) identification,1
180,3 composition/information on ingredients,1
189,4 first-aid measures,1
213,5 fire-fighting measures,1
241,6 accidental release measures,1
266,7 handling and storage,1
280,8 exposure controls/personal protection,1
343,9 physical and chemical properties,1
448,10 stability and reactivity,1


PDF: 19; title: 巴斯夫紫外线吸收剂-tinuvin-1130-msds宝旭化工.pdf: titles: 16


Unnamed: 0,sentence,title
10,1. identification,1
34,2. hazards identification,1
108,3. composition/information on ingredients,1
153,4. first-aid measures,1
174,5. fire-fighting measures,1
203,6. accidental release measures,1
216,7. handling and storage,1
229,8. exposure controls/personal protection,1
272,9. physical and chemical properties,1
402,10. stability and reactivity,1


In [8]:
# Spot check: number of whitespace-separated tokens in the ninth flagged
# heading of the PDF stored under key 7.
df = pdf_dict[7]
only_titles = df.loc[df['title'] == 1]
len(only_titles['sentence'].iloc[8].split())

5

In [9]:
# Character length of one of the longer headings seen in the outputs;
# presumably a sanity check against the 50-character limit in check_if_title.
len("3 composition/information on ingredients")

40