In [95]:
from pathlib import Path

import pandas as pd
import pdfplumber as pdf

In [104]:
# Locations of the two source documents (Taiwan and EU lists).
TW_PDF_PATH = "./pdf/tw.pdf"
EU_PDF_PATH = "./pdf/35to129_eu.pdf"

# Open both PDFs up front; the extraction cells use these handles as
# context managers.
pdf_tw = pdf.open(TW_PDF_PATH)
pdf_eu = pdf.open(EU_PDF_PATH)

### Taiwan


In [114]:
# Gather every table row from every page of the Taiwan PDF into one flat list.
tw_rows = []
# Bind the opened document to its own name: `pdf` is the pdfplumber module
# alias and must not be overwritten by the context-manager target.
with pdf_tw as tw_doc:
    for tw_page in tw_doc.pages:
        page_rows = tw_page.extract_table()
        if page_rows is None:
            # extract_table() yields None for a page without a detectable
            # table; skip it instead of failing on iteration.
            continue
        tw_rows.extend(page_rows)

# The first extracted row supplies the column names; the last extracted row is
# excluded from the data (presumably a non-data trailing row -- TODO confirm).
tw_pdf = (
    pd.DataFrame(data=tw_rows[1:-1], columns=tw_rows[0])
    .drop(["No.", "CAS No."], axis=1)
    # keep=False removes *every* copy of a repeated row (presumably to discard
    # header rows repeated across pages -- TODO confirm that no genuine entry
    # is listed twice, since it would be removed entirely as well).
    .drop_duplicates(keep=False)
    .reset_index(drop=True)
    # Cell text wrapped across lines in the PDF comes back with embedded "\n".
    .replace(to_replace="\n", value=" ", regex=True)
    # Use the same column label as the EU table so the two frames can be merged.
    .rename(columns={"Chemical name": "Chemical Name"})
)
tw_pdf

Unnamed: 0,Chemical Name
0,Mercury and its compounds (with the exception ...
1,4-Benzyloxyphenol and 4-ethoxyphenol
2,Bithionol
3,Pilocarpine and its salts
4,Halogeno-salicylanilide
...,...
646,Raw material made from bovine and sheep tissue...
647,Alanroot oil（Inula helenium L.）
648,"Rauwolfia serpentina L., alkaloids and their s..."
649,Yohimbine and its salts


In [115]:
# Plain-text rendering of the cleaned Taiwan table; the printed footer reports
# its row and column counts.
print(tw_pdf)

                                         Chemical Name
0    Mercury and its compounds (with the exception ...
1                 4-Benzyloxyphenol and 4-ethoxyphenol
2                                            Bithionol
3                            Pilocarpine and its salts
4                              Halogeno-salicylanilide
..                                                 ...
646  Raw material made from bovine and sheep tissue...
647                    Alanroot oil（Inula helenium L.）
648  Rauwolfia serpentina L., alkaloids and their s...
649                            Yohimbine and its salts
650                     Tripterygium wilfordii Hook.f.

[651 rows x 1 columns]


### EU


In [117]:
# Gather every table row from every page of the EU PDF into one flat list.
eu_rows = []
# Bind the opened document to its own name: `pdf` is the pdfplumber module
# alias and must not be overwritten by the context-manager target.
with pdf_eu as eu_doc:
    for eu_page in eu_doc.pages:
        page_rows = eu_page.extract_table()
        if page_rows is None:
            # extract_table() yields None for a page without a detectable
            # table; skip it instead of failing on iteration.
            continue
        eu_rows.extend(page_rows)

# The first extracted row supplies the column names; all remaining rows are data.
eu_pdf = (
    pd.DataFrame(data=eu_rows[1:], columns=eu_rows[0])
    .drop(["CAS number"], axis=1)
    # keep=False removes *every* copy of a repeated row (presumably to discard
    # header rows repeated across pages -- TODO confirm that no genuine entry
    # is listed twice, since it would be removed entirely as well).
    .drop_duplicates(keep=False)
    .reset_index(drop=True)
)

In [118]:
# Flatten line breaks inside cells, then give the name column the same label
# as the Taiwan table so both frames share a merge key.
eu_pdf = eu_pdf.replace(to_replace="\n", value=" ", regex=True).rename(
    columns={"Chemical name/INN": "Chemical Name"}
)
eu_pdf

Unnamed: 0,Chemical Name
0,N-(5-Chlorobenzoxazol-2-yl)acetamide
1,(2-Acetoxyethyl)trimethylammonium hydroxide (A...
2,Deanol aceglumate (INN)
3,Spironolactone (INN)
4,"[4-(4-Hydroxy-3-iodophenoxy)-3,5-diiodophenyl]..."
...,...
1576,Reaction products of paraformaldehyde with 2- ...
1577,Methylhydrazine
1578,"Triadimenol (ISO); (1RS,2RS;1RS,2SR)-1-(4-chlo..."
1579,Thiacloprid (ISO); (Z)-3-(6-chloro-3-pyridyl-m...


### Comparison


In [136]:
# Outer-merge the two lists on the chemical name. The indicator column "Exist"
# records, per row, whether the name came from the left frame (Taiwan), the
# right frame (EU), or both.
comparison = pd.merge(
    tw_pdf, eu_pdf, how="outer", on="Chemical Name", indicator="Exist"
)
# Relabel the indicator on the "Exist" column only. A frame-wide replace()
# would also rewrite any chemical name that happened to equal one of the
# indicator strings, and replace() on a categorical column is deprecated in
# favour of rename_categories().
comparison["Exist"] = comparison["Exist"].cat.rename_categories(
    {"left_only": "Taiwan", "right_only": "EU", "both": "Both"}
)
comparison

Unnamed: 0,Chemical Name,Exist
0,Mercury and its compounds (with the exception ...,Taiwan
1,4-Benzyloxyphenol and 4-ethoxyphenol,Both
2,Bithionol,Taiwan
3,Pilocarpine and its salts,Both
4,Halogeno-salicylanilide,Taiwan
...,...,...
1899,Reaction products of paraformaldehyde with 2- ...,EU
1900,Methylhydrazine,EU
1901,"Triadimenol (ISO); (1RS,2RS;1RS,2SR)-1-(4-chlo...",EU
1902,Thiacloprid (ISO); (Z)-3-(6-chloro-3-pyridyl-m...,EU


Double-check that the merge produced no missing indicator values (NaNs):


In [129]:
# Rows whose merge indicator is missing; the merge indicator assigns a value
# to every row, so this selection should be empty.
comparison_nan = comparison[comparison["Exist"].isna()]
# Show the count so the check is visible in the cell output (0 = nothing missing).
len(comparison_nan)

0

### Filter into three DataFrames: EU only, Taiwan only, and both


In [137]:
# Keep only the rows labelled as present in the EU list alone.
eu_only_filter = "Exist == 'EU'"
comp_eu = comparison.query(eu_only_filter)
comp_eu

Unnamed: 0,Chemical Name,Exist
651,N-(5-Chlorobenzoxazol-2-yl)acetamide,EU
652,(2-Acetoxyethyl)trimethylammonium hydroxide (A...,EU
653,Aminocaproic acid (INN) and its salts,EU
654,Aconitine (principal alkaloid of Aconitum nape...,EU
655,"Alkyne alcohols, their esters, ethers and salts",EU
...,...,...
1899,Reaction products of paraformaldehyde with 2- ...,EU
1900,Methylhydrazine,EU
1901,"Triadimenol (ISO); (1RS,2RS;1RS,2SR)-1-(4-chlo...",EU
1902,Thiacloprid (ISO); (Z)-3-(6-chloro-3-pyridyl-m...,EU


In [138]:
# Keep only the rows labelled as present in the Taiwan list alone.
tw_only_filter = "Exist == 'Taiwan'"
comp_tw = comparison.query(tw_only_filter)
comp_tw

Unnamed: 0,Chemical Name,Exist
0,Mercury and its compounds (with the exception ...,Taiwan
2,Bithionol,Taiwan
4,Halogeno-salicylanilide,Taiwan
5,Boric acid,Taiwan
6,Sodium perborate,Taiwan
...,...,...
644,3-[[4-[(2-Hydroxyethyl)amino]-2-nitrophenyl]am...,Taiwan
645,"Ethanaminium, N-[4-[[4-(diethylamino)phenyl][4...",Taiwan
646,Raw material made from bovine and sheep tissue...,Taiwan
647,Alanroot oil（Inula helenium L.）,Taiwan


In [139]:
# Keep only the rows labelled as present in both lists.
both_filter = "Exist == 'Both'"
comp_both = comparison.query(both_filter)
comp_both

Unnamed: 0,Chemical Name,Exist
1,4-Benzyloxyphenol and 4-ethoxyphenol,Both
3,Pilocarpine and its salts,Both
10,"Cells, tissues or products of human origin",Both
15,Vinyl chloride monomer,Both
16,"2,2'-Dihydroxy-3,3',5,5',6,6'-hexachlorodiphen...",Both
...,...,...
569,Thallium and its compounds,Both
570,Neodymium and its salts,Both
571,Tellurium and its compounds,Both
648,"Rauwolfia serpentina L., alkaloids and their s...",Both


### Export to Excel


In [140]:
# Target workbook. NOTE(review): "compariso" looks like a typo for
# "comparison"; the name is left unchanged so existing consumers of the file
# are not broken -- confirm before renaming.
output_path = Path("./result/25oct_compariso.xlsx")
# ExcelWriter does not create missing folders, so make sure the output
# directory exists before writing (e.g. on a fresh checkout).
output_path.parent.mkdir(parents=True, exist_ok=True)

# One sheet per view of the comparison, written in this order.
sheets = {
    "Full Comparison": comparison,
    "Only in EU": comp_eu,
    "Only in Taiwan": comp_tw,
    "Exist in Both": comp_both,
}
with pd.ExcelWriter(output_path) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name)