In [49]:
import pandas as pd
import re
import os
from dotenv import load_dotenv

In [50]:
load_dotenv()

True

In [51]:
def clean_text(text):
    if isinstance(text, str):
        text = text.replace('“', '"').replace('”', '"').replace('″', '"')
        
        text = re.sub(r'[^\x00-\x7F]', '', text)    
    return text

In [52]:
def find_non_ascii(text):
    if isinstance(text, str):
        # |[\"]
        return bool(re.search(r'[^\x00-\x7F]', text))
    return False

In [53]:
def validate_non_ascii(df):
    non_ascii_entries = df.apply(lambda col: col.apply(find_non_ascii))
    rows_with_non_ascii = df[non_ascii_entries.any(axis=1)]    
    return rows_with_non_ascii

In [54]:
df = pd.read_excel(os.path.join(os.getenv('PURCHASE_ORDER_FOLDER_PATH'), 'Data 1.xlsx'))
df.columns = df.columns.str.replace(' ', '_').str.upper()

In [55]:
df['ITEM_NAME']

0          Bracket no A-081-G MDR Locking Strip
1                                 Locking Plate
2        Charges for embossing tool trial Proto
3                               DR Solution @12
4               During The DR Drill Acrivity @2
                          ...                  
50456                      ZIPPY 25 TOP COVER-1
50457                  ZIPPY 25 FRONT COVER - 1
50458                ZIPPY 25 - SIDE COVER - 01
50459                Tube 200x200x14mm (350YST)
50460                  Tube 140x80x9mm (350YST)
Name: ITEM_NAME, Length: 50461, dtype: object

In [56]:
validate_non_ascii(df[['ITEM_NAME']])

Unnamed: 0,ITEM_NAME
33,11879651Belt 6100 CC _Disa (shot blasting )
34,"5-1002-003Bucket DIN15232-C200x140X2,0GK _..."
35,4-2401-052Connection Elevator Belt _Disa (s...
36,10363906Screw M10x35-8.8 DIN15237 _Disa (sh...
37,10128231Nut M10-6 ZN DIN1587 _Disa (shot bl...
...,...
50289,Dynamo 500 – Side cover –Mould-Al
50290,Dynamo 500 – Side cover – Plug
50299,AMR 100 – Diffuser Mould
50419,ZIPPY 25 - TOP COVER – SEPERATOR - Mould-Al


In [57]:
df_cleaned = df.apply(lambda col: col.apply(clean_text))

In [58]:
df_cleaned['ITEM_NAME']

0          Bracket no A-081-G MDR Locking Strip
1                                 Locking Plate
2        Charges for embossing tool trial Proto
3                               DR Solution @12
4               During The DR Drill Acrivity @2
                          ...                  
50456                      ZIPPY 25 TOP COVER-1
50457                  ZIPPY 25 FRONT COVER - 1
50458                ZIPPY 25 - SIDE COVER - 01
50459                Tube 200x200x14mm (350YST)
50460                  Tube 140x80x9mm (350YST)
Name: ITEM_NAME, Length: 50461, dtype: object

In [59]:
validate_non_ascii(df_cleaned[['ITEM_NAME']])

Unnamed: 0,ITEM_NAME


In [60]:
electrical_parts = pd.read_csv(
    os.path.join(os.getenv('ELECTRICAL_PARTS_FOLDER_PATH'), 'Electrical Parts Report Modified.csv'),
)
electrical_parts.columns = electrical_parts.columns.str.replace(' ', '_').str.upper()

In [61]:
electrical_parts['PART_NAME']

0              CABLE GLAND- DOUBLE COMPRESSION CABLE, M90
1                        E-STOP- LED, SELF MONTORING, 2CH
2                              INTERFACE MODULE-PLC,RS485
3       BRAKE CONNECTOR-FOR HF-SE/SN/SP/JP,HG-SN/SR,HK...
4                                             CONTACT-1NO
                              ...                        
9354                            ÖLFLEX® CLASSIC 100 5G1,5
9355                          ÖLFLEX® CLASSIC 100 H 5G2,5
9356                ÖLFLEX® FD ROBUST 756 C 4 G 2,5+(2x1)
9357                  ÖLFLEX® SERVO 700 4G1,5+(2x0,75)FDF
9358                  ÖLFLEX® SERVO 700 4G2,5+(2x0,75)FDF
Name: PART_NAME, Length: 9359, dtype: object

In [62]:
validate_non_ascii(electrical_parts[['PART_NAME']])

Unnamed: 0,PART_NAME
622,250 µs/ch
650,"45° control valve with flange, heavy series"
651,"45° control valve with flange, light series"
652,"45° control valve with union nut, heavy series"
653,"45° control valve with union nut, light series"
...,...
9354,"ÖLFLEX® CLASSIC 100 5G1,5"
9355,"ÖLFLEX® CLASSIC 100 H 5G2,5"
9356,"ÖLFLEX® FD ROBUST 756 C 4 G 2,5+(2x1)"
9357,"ÖLFLEX® SERVO 700 4G1,5+(2x0,75)FDF"


In [63]:
electrical_parts_cleaned = electrical_parts.apply(lambda col: col.apply(clean_text))

In [64]:
electrical_parts_cleaned['PART_NAME']

0              CABLE GLAND- DOUBLE COMPRESSION CABLE, M90
1                        E-STOP- LED, SELF MONTORING, 2CH
2                              INTERFACE MODULE-PLC,RS485
3       BRAKE CONNECTOR-FOR HF-SE/SN/SP/JP,HG-SN/SR,HK...
4                                             CONTACT-1NO
                              ...                        
9354                              LFLEX CLASSIC 100 5G1,5
9355                            LFLEX CLASSIC 100 H 5G2,5
9356                  LFLEX FD ROBUST 756 C 4 G 2,5+(2x1)
9357                    LFLEX SERVO 700 4G1,5+(2x0,75)FDF
9358                    LFLEX SERVO 700 4G2,5+(2x0,75)FDF
Name: PART_NAME, Length: 9359, dtype: object

In [65]:
validate_non_ascii(electrical_parts_cleaned[['PART_NAME']])

Unnamed: 0,PART_NAME


In [66]:
merged_data = pd.merge(df_cleaned, electrical_parts_cleaned, how="inner", left_on="ITEM_NAME", right_on="PART_NAME")

In [67]:
merged_data.shape

(166112, 80)

In [69]:
filtered_data = merged_data.drop_duplicates(subset=['PO_NUM', 'PART_NAME'], keep='first')
filtered_data.shape

(5399, 80)