In [1]:
import pdfplumber as pdf
import pandas as pd
import numpy as np
from thefuzz import fuzz

In [2]:
pdf_tw = pdf.open("./pdf/tw.pdf")
pdf_eu = pdf.open("./pdf/35to129_eu.pdf")

### Taiwan


In [62]:
tw_pdf = []
with pdf_tw as pdf:
    pages = pdf.pages
    for page in pages:
        page = page.extract_table()
        for table in page:
            tw_pdf.append(table)

tw_pdf = pd.DataFrame(data=tw_pdf[1:-1], columns=tw_pdf[0]).drop(["No."], axis=1)
tw_pdf = tw_pdf.drop_duplicates()
tw_pdf = tw_pdf.reset_index(drop=True)

tw_pdf.replace(to_replace="-", value="NO CAS", regex=False, inplace=True)
tw_pdf.rename(columns={"Chemical name": "Chemical Name"}, inplace=True)

tw_pdf = tw_pdf.assign(**{"CAS No.": tw_pdf["CAS No."].str.split("\n")})

# Explode the "CAS No." column to create separate rows for each CAS number
tw_pdf = tw_pdf.explode("CAS No.")

# Remove leading/trailing whitespace from the "CAS No." column
tw_pdf["CAS No."] = tw_pdf["CAS No."].str.strip()

# Drop duplicates
tw_pdf = tw_pdf.drop_duplicates()

tw_pdf["CAS No."] = tw_pdf["CAS No."].replace(to_replace="/", value="", regex=True)

# Reset the index
tw_pdf = tw_pdf.reset_index(drop=True)

tw_pdf

Unnamed: 0,Chemical Name,CAS No.
0,Mercury and its compounds (with the exception ...,7439-97-6
1,4-Benzyloxyphenol and 4-ethoxyphenol,103-16-2
2,4-Benzyloxyphenol and 4-ethoxyphenol,622-62-8
3,Bithionol,97-18-7
4,Pilocarpine and its salts,92-13-7
...,...,...
733,Raw material made from bovine and sheep tissue...,NO CAS
734,Alanroot oil（Inula helenium L.）,97676-35-2
735,"Rauwolfia serpentina L., alkaloids and their s...",90106-13-1
736,Yohimbine and its salts,146-48-5


### EU


In [63]:
eu_pdf = []
with pdf_eu as pdf:
    pages = pdf.pages
    for page in pages:
        page = page.extract_table()
        for table in page:
            eu_pdf.append(table)
eu_pdf = pd.DataFrame(data=eu_pdf[1:], columns=eu_pdf[0])
eu_pdf = eu_pdf.drop_duplicates().reset_index(drop=True)

In [64]:
eu_pdf.rename(
    columns={"Chemical name/INN": "Chemical Name", "CAS number": "CAS No."},
    inplace=True,
)

on9space = eu_pdf["CAS No."].iloc[15]

eu_pdf["CAS No."] = eu_pdf["CAS No."].replace(
    to_replace=on9space, value="NO CAS", regex=False
)

eu_pdf = eu_pdf.assign(**{"CAS No.": eu_pdf["CAS No."].str.split("\n")})

# Explode the "CAS No." column to create separate rows for each CAS number
eu_pdf = eu_pdf.explode("CAS No.")

# Remove leading/trailing whitespace from the "CAS No." column
eu_pdf["CAS No."] = eu_pdf["CAS No."].str.strip()

# Drop duplicates
eu_pdf = eu_pdf.drop_duplicates()

eu_pdf["CAS No."] = eu_pdf["CAS No."].replace(to_replace="/", value="", regex=True)
eu_pdf.replace(to_replace="—", value="NO CAS", regex=False, inplace=True)
eu_pdf.replace(to_replace="\n", value=" ", regex=True, inplace=True)
eu_pdf["CAS No."] = eu_pdf["CAS No."].replace("\\[[^\\]]*\\]", "", regex=True)

another7space = eu_pdf["CAS No."].iloc[1868]
eu_pdf["CAS No."] = eu_pdf["CAS No."].replace(another7space, "NO CAS", regex=True)

# Reset the index
eu_pdf = eu_pdf.reset_index(drop=True)

eu_pdf.tail(50)

Unnamed: 0,Chemical Name,CAS No.
1916,Aclonifen (ISO); 2-chloro-6-nitro-3-phenoxyani...,74070-46-5
1917,"2-Ethylhexyl 10-ethyl-4,4-dimethyl-7-oxo-8-oxa...",57583-35-4
1918,Dimethyltin dichloride,753-73-1
1919,4-Vinylcyclohexene,100-40-3
1920,Tralkoxydim (ISO); 2-(N-ethoxypropanimidoyl)-3...,87820-88-0
1921,Cycloxydim (ISO); 2-(N-ethoxybutanimidoyl)-3- ...,101205-02-1
1922,"Fluazinam (ISO); 3-chloro-N-[3-chloro-2,6-dini...",79622-59-6
1923,"Penconazole (ISO); 1-[2-(2,4-dichlorophenyl)pe...",66246-88-6
1924,Fenoxycarb (ISO); ethyl [2-(4-phenoxyphen­ oxy...,72490-01-8
1925,Styrene,100-42-5


### Comparison


on CAS no.


In [65]:
cas_comp = tw_pdf.merge(
    eu_pdf,
    on="CAS No.",
    how="outer",
    indicator="CAS Exist",
    suffixes=("(Taiwan)", "(EU)"),
)
cas_comp.replace(to_replace="left_only", value="Taiwan", inplace=True)
cas_comp.replace(to_replace="right_only", value="EU", inplace=True)
cas_comp.replace(to_replace="both", value="Both", inplace=True)
cas_comp

Unnamed: 0,Chemical Name(Taiwan),CAS No.,Chemical Name(EU),CAS Exist
0,Mercury and its compounds (with the exception ...,7439-97-6,"Mercury and its compounds, except those specia...",Both
1,4-Benzyloxyphenol and 4-ethoxyphenol,103-16-2,4-Benzyloxyphenol and 4-ethoxyphenol,Both
2,4-Benzyloxyphenol and 4-ethoxyphenol,622-62-8,4-Benzyloxyphenol and 4-ethoxyphenol,Both
3,Bithionol,97-18-7,Bithionol (INN),Both
4,Pilocarpine and its salts,92-13-7,Pilocarpine and its salts,Both
...,...,...,...,...
3038,,3830-45-3,Nonadecafluorodecanoic acid [1] Ammonium nonad...,EU
3039,,5625-90-1,"N,N′-Methylenedimorpholine; N,N′-methylenebism...",EU
3040,,55219-65-3,"Triadimenol (ISO); (1RS,2RS;1RS,2SR)-1-(4-chlo...",EU
3041,,111988-49-9,Thiacloprid (ISO); (Z)-3-(6-chloro-3-pyridyl-m...,EU


Query to delete all with CAS


In [109]:
diffs = cas_comp[cas_comp["CAS Exist"] != "Both"]
diffs = diffs.drop(["CAS No."], axis=1)
diffs.reset_index(drop=True, inplace=True)
diffs

Unnamed: 0,Chemical Name(Taiwan),Chemical Name(EU),CAS Exist
0,Sodium borate\nexcept for sodium borate used t...,,Taiwan
1,Hydroxy-8-quinoline and its sulphate\nFor use ...,,Taiwan
2,6-Methylcoumarin (Non-medical toothpaste and m...,,Taiwan
3,Chemical name,,Taiwan
4,Dichlorophen,,Taiwan
...,...,...,...
1360,,Nonadecafluorodecanoic acid [1] Ammonium nonad...,EU
1361,,"N,N′-Methylenedimorpholine; N,N′-methylenebism...",EU
1362,,"Triadimenol (ISO); (1RS,2RS;1RS,2SR)-1-(4-chlo...",EU
1363,,Thiacloprid (ISO); (Z)-3-(6-chloro-3-pyridyl-m...,EU


EU filtered


In [123]:
# Replace words settings
replace = ["[INN]", "(INN)", "[INCI]", "(INCI)"]

In [167]:
from_list = diffs[diffs["Chemical Name(EU)"].notnull()]
from_list = (
    from_list.drop(["Chemical Name(Taiwan)"], axis=1)
    .sort_values("Chemical Name(EU)")
    .reset_index(drop=True)
)
weridspace = from_list["Chemical Name(EU)"].iloc[0]

from_list["Chemical Name(EU)"] = from_list["Chemical Name(EU)"].replace(
    weridspace, "###", regex=False
)

for i in replace:
    from_list["Chemical Name(EU)"] = from_list["Chemical Name(EU)"].str.replace(i, "")

from_list = (
    from_list[from_list["Chemical Name(EU)"] != "###"]
    .drop_duplicates()
    .reset_index(drop=True)
)
from_list

Unnamed: 0,Chemical Name(EU),CAS Exist
0,"(+/–)-2-(2,4-Dichlorophenyl)-3-(1H-1,2,4-triaz...",EU
1,(+/–)-Tetrahydrofurfuryl –(R)-2-[4-(6-chloroqu...,EU
2,"(1R,4S,5R,8S)-1,2,3,4,10,10-Hexachloro-1,4,4a,...",EU
3,(2-Acetoxyethyl)trimethylammonium hydroxide (A...,EU
4,(2-Chloroethyl)(3-hydroxypropyl)ammonium chloride,EU
...,...,...
1011,"trans-2-Hexenal diethyl acetal, when used as a...",EU
1012,"trans-2-Hexenal dimethyl acetal, when used as ...",EU
1013,trans-4-Phenyl-L-proline,EU
1014,tris(2-Chloroethyl) phosphate,EU


TW filtered


In [165]:
to_list = diffs[diffs["Chemical Name(Taiwan)"].notnull()]

to_list = (
    to_list.drop(["Chemical Name(EU)"], axis=1)
    .sort_values("Chemical Name(Taiwan)")
    .drop_duplicates()
    .reset_index(drop=True)
)

for i in replace:
    to_list["Chemical Name(Taiwan)"] = to_list["Chemical Name(Taiwan)"].str.replace(
        i, ""
    )

to_list

Unnamed: 0,Chemical Name(Taiwan),CAS Exist
0,"1,2,4-Trichlorobenzene",Taiwan
1,"1,2-Benzenedicarboxylic acid, dipentylester, b...",Taiwan
2,"1,2-Dichloropropane",Taiwan
3,"1,2-Diphenylhydrazine",Taiwan
4,"1,3-Dichlorobenzene",Taiwan
...,...,...
85,p-Chloro-o-toluidine,Taiwan
86,p-Hydroxyanisole (4-Methoxyphenol；Hydroquinone...,Taiwan
87,p-Phenylenediamine\nexcept for use in hair dye...,Taiwan
88,α-Bromobenzyl cyanide,Taiwan


In [168]:
from_list = from_list["Chemical Name(EU)"].tolist()
to_list = to_list["Chemical Name(Taiwan)"].tolist()

In [173]:
matched_indices = []
tw_match = []
eu_match = []
# Iterate over each row in the from_list
for i, row1 in enumerate(from_list):
    # If the row has already been matched, skip it
    if i in matched_indices:
        continue
    # Define variables to store the best match and its score
    best_match = None
    best_score = 88
    # Iterate over each row in the to_list
    for j, row2 in enumerate(to_list):
        # If the row has already been matched, skip it
        if j in matched_indices:
            continue
        # Calculate the similarity score between the two rows
        score = fuzz.token_set_ratio(row1, row2)
        # If the score is above a certain threshold and better than the current best score, update the best match
        if score > best_score:
            best_match = j
            best_score = score
    # If a match was found, add the indices to the matched_indices list
    if best_match is not None:
        matched_indices.append(i)
        matched_indices.append(best_match)
        print(f"Match found: {row1} -> {to_list[best_match]}")
        tw_match.append(to_list[best_match])
        eu_match.append(row1)

Match found: 1,2-Benzenedicarboxylic acid, di-C , branched and 7-11 linear alkyl esters -> 1,2-Benzenedicarboxylic acid, dipentylester, branched and linear; n-Pentyl-isopentylphthalate; di-n-Pentyl phthalate; Diisopentylphthalate
Due to the raw material or other technically unavoidable reasons, the trace amounts of group of phthalate acid esters shall not exceed 100 ppm.
Match found: 1-and 2-Naphthylamines and their salts -> 1-and 2-Naphthylamines and their salts
Match found: 2,3-Dichloropropene -> 1,2-Dichloropropane
Match found: 2,3-Dinitrotoluene -> 2,4-Dinitrotoluene; Dinitrotoluene, technical grade
Match found: 2,4,5-Trimethylaniline [1] 2,4,5-Trimethylaniline hydrochloride [2] -> 2,4,5-Trimethylaniline/
2,4,5-Trimethylaniline hydrochloride
Match found: 2-chlorobenzene-1,4-diamine (2-Chloro-p-Phenylenedia­ mine), its sulfate and dihydrochloride salts (9 ) when used as a substance in hair dye products, including eyebrow dye products, and eyelash dye products -> Chlorobenzene
Match 

Remove from lists


In [180]:
tw_list = to_list
eu_list = from_list

for i in tw_match:
    if i in tw_list:
        tw_list.remove(i)

for i in eu_match:
    if i in eu_list:
        eu_list.remove(i)

print(len(tw_list), len(eu_list))


# Count the number of entries left
tw_list = pd.Series(tw_list)
eu_list = pd.Series(eu_list)

70 996


### Export to excel


In [182]:
with pd.ExcelWriter("./result/30oct.xlsx") as writer:
    tw_list.to_excel(writer, sheet_name="Exists in TW only", index=False)
    eu_list.to_excel(writer, sheet_name="Exists in EU only", index=False)