In [3]:
import numpy as np
import pandas as pd
import re
import os
from dotenv import load_dotenv

In [4]:
load_dotenv()

True

In [5]:
# Non-ASCII symbols that carry meaning and must survive cleaning, mapped to the
# ASCII placeholder that replaces them. clean_text() applies these substitutions
# before it strips every remaining non-ASCII character.
non_ascii_replacement_dict = {
    '€': '<euro>',
    '¢': '<cent>',
    '₹': '<rupee>',
    '×': 'X',
    '²': '<pow2>',
    '³': '<pow3>',
    '™': '<trademark>',
    '®': '<registered>',
    '–': '<dash>'
}

In [6]:
def clean_text(text):
    """Normalise one cell value to plain ASCII text.

    Values that are not ``str`` (numbers, NaN, timestamps, ...) are returned
    unchanged, so the function can be applied element-wise to mixed columns.

    For strings the following steps run in order:

    1. Curly double quotes and the double-prime sign become a straight double
       quote; curly single quotes become a straight single quote.
    2. A digit followed by an optional whitespace character and one or more
       double quotes is collapsed to the digit plus a single double quote.
    3. Two consecutive single quotes (optionally preceded by one whitespace
       character) become one double quote.
    4. The degree sign (optionally preceded by one whitespace character)
       becomes ``<deg>``.
    5. Every key of ``non_ascii_replacement_dict`` is replaced by its value.
    6. Any character still outside the ASCII range is removed.
    """
    if not isinstance(text, str):
        return text

    text = text.replace('“', '"').replace('”', '"').replace('″', '"')
    text = text.replace('‘', "'").replace('’', "'")

    # digit + optional space + one or more double quotes -> digit + one double quote
    text = re.sub(r'(\d)\s?""*', r'\1"', text)

    # optional space + two single quotes -> one double quote
    text = re.sub(r"\s?''", r'"', text)

    # optional space + degree sign -> textual placeholder
    text = re.sub(r'\s?°', r'<deg>', text)

    # Literal substitution: both the symbol and its placeholder are plain text.
    # (Running the placeholder through re.escape() would leak backslashes into
    # the output whenever it contains a regex metacharacter such as '-' or '+'.)
    for symbol, placeholder in non_ascii_replacement_dict.items():
        text = text.replace(symbol, placeholder)

    # drop whatever non-ASCII characters are left
    return re.sub(r'[^\x00-\x7F]', '', text)

In [7]:
def to_uppercase(text):
    """Upper-case ``text`` when it is a ``str``; return any other value untouched."""
    return text.upper() if isinstance(text, str) else text

In [8]:
def clean_item_code(value):
    """Normalise an item / part code given as a string.

    Leading zeros are always removed. If what remains consists of exactly five
    digits followed only by zeros (possibly none), those trailing zeros are
    dropped as well and the five digits are returned. Any other shape (for
    example codes containing letters) is returned with just the leading zeros
    stripped.
    """
    without_leading_zeros = re.sub(r'^0+', '', value)

    # five digits + zero padding up to the end of the string
    padded_five_digits = re.match(r'^(\d{5})(0*)$', without_leading_zeros)

    return padded_five_digits.group(1) if padded_five_digits else without_leading_zeros


### Load and Clean Purchase Order Line Items Dataset

In [9]:
# Read the purchase-order export from the folder named by the
# PURCHASE_ORDER_FOLDER_PATH environment variable, then normalise the column
# labels (spaces -> underscores, upper case).
purchase_order_file = os.path.join(os.getenv('PURCHASE_ORDER_FOLDER_PATH'), 'Data 1.xlsx')
df = pd.read_excel(purchase_order_file)
df.columns = df.columns.str.replace(' ', '_').str.upper()

In [10]:
df

Unnamed: 0,#,DOCTYPE,CANCELED,DOCCUR,DOCRATE,DOCSTATUS,PO_NUM,BASE_TYPE,BASE_NUMBER,BASE_LINE,...,MANUALLY_CLOSED_PO,PO_OWNER_NAME,PYMNTGROUP,HEADER,FOOTER,BRANCH_NAME,IMP_OR_EXP,COST_SAVING,COST_SAVING_CRITERIA,FORCE_CLOSED_PO_QTY
0,26968,I,N,INR,1.0,C,232431279,PR,232401368.0,1.0,...,No,Yoginder Kumar,Net 30 days,,,NOIDA,N,,,0.0
1,42455,I,N,INR,1.0,C,232471206,PR,232451256.0,1.0,...,No,Ashutosh Tiwari,Net 30 days,,,Ecotech X,N,,,0.0
2,45582,S,N,INR,1.0,C,232472169,PR,232452086.0,1.0,...,No,Abhishek Chauhan,Refer PO Text,Payment Terms : 100% after service completion ...,,Ecotech X,N,,,0.0
3,9868,S,N,INR,1.0,C,72616,PR,100150.0,1.0,...,Yes,Aman Chambial,Net-30,OTC charges shall be paid after installation o...,,NOIDA,N,,,0.0
4,9869,S,N,INR,1.0,C,72616,PR,100150.0,2.0,...,Yes,Aman Chambial,Net-30,OTC charges shall be paid after installation o...,,NOIDA,N,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50456,48133,I,N,INR,1.0,O,242504101,PR,242500075.0,6.0,...,No,Dheeraj Sachdeva,Net 30 days,,,NOIDA,N,,,0.0
50457,48134,I,N,INR,1.0,O,242504101,PR,242500075.0,4.0,...,No,Dheeraj Sachdeva,Net 30 days,,,NOIDA,N,,,0.0
50458,48135,I,N,INR,1.0,O,242504101,PR,242500075.0,3.0,...,No,Dheeraj Sachdeva,Net 30 days,,,NOIDA,N,,,0.0
50459,16606,I,N,INR,1.0,C,74346,PR,102315.0,1.0,...,No,Sanket Singh Rathor,100% advance against PI,12 mtr weight is 948 kg (200*200) 140*90 it i...,,NOIDA,N,,,0.0


In [11]:
# Element-wise clean-up of every cell: first ASCII normalisation, then upper-casing.
# Both helpers leave non-string values as they are.
df_cleaned = (
    df
    .apply(lambda col: col.apply(clean_text))
    .apply(lambda col: col.apply(to_uppercase))
)

In [12]:
df_cleaned_nan = df_cleaned[df_cleaned['ITEM_CODE'].isna()].copy()
df_cleaned_nan

Unnamed: 0,#,DOCTYPE,CANCELED,DOCCUR,DOCRATE,DOCSTATUS,PO_NUM,BASE_TYPE,BASE_NUMBER,BASE_LINE,...,MANUALLY_CLOSED_PO,PO_OWNER_NAME,PYMNTGROUP,HEADER,FOOTER,BRANCH_NAME,IMP_OR_EXP,COST_SAVING,COST_SAVING_CRITERIA,FORCE_CLOSED_PO_QTY
2,45582,S,N,INR,1.0,C,232472169,PR,232452086.0,1.0,...,NO,ABHISHEK CHAUHAN,REFER PO TEXT,PAYMENT TERMS : 100% AFTER SERVICE COMPLETION ...,,ECOTECH X,N,,,0.0
3,9868,S,N,INR,1.0,C,72616,PR,100150.0,1.0,...,YES,AMAN CHAMBIAL,NET-30,OTC CHARGES SHALL BE PAID AFTER INSTALLATION O...,,NOIDA,N,,,0.0
4,9869,S,N,INR,1.0,C,72616,PR,100150.0,2.0,...,YES,AMAN CHAMBIAL,NET-30,OTC CHARGES SHALL BE PAID AFTER INSTALLATION O...,,NOIDA,N,,,0.0
5,9870,S,N,INR,1.0,C,72616,PR,100150.0,3.0,...,NO,AMAN CHAMBIAL,NET-30,OTC CHARGES SHALL BE PAID AFTER INSTALLATION O...,,NOIDA,N,,,0.0
6,22886,S,N,INR,1.0,C,232420215,PR,232404753.0,1.0,...,NO,NEERAJ TIWARI,NET-30,,,NOIDA,N,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50271,37350,S,N,INR,1.0,C,232450057,PR,232440055.0,4.0,...,NO,DHEERAJ SACHDEVA,NET 30 DAYS,IF ZOOM CONTACT CENTER CLOUD STORAGE 200GB REQ...,,AFSNOIDA,N,,,0.0
50272,37974,S,N,INR,1.0,O,232450263,PR,232440259.0,1.0,...,NO,SANDEEP KUMAR GAUTAM,NET 30 DAYS,,,AFSNOIDA,N,,,0.0
50273,37975,S,N,INR,1.0,O,232450263,PR,232440259.0,2.0,...,NO,SANDEEP KUMAR GAUTAM,NET 30 DAYS,,,AFSNOIDA,N,,,0.0
50274,37976,S,N,INR,1.0,O,232450263,PR,232440259.0,3.0,...,NO,SANDEEP KUMAR GAUTAM,NET 30 DAYS,,,AFSNOIDA,N,,,0.0


In [13]:
df_cleaned_non_nan = df_cleaned[df_cleaned['ITEM_CODE'].notna()].copy()
df_cleaned_non_nan.shape

(45209, 59)

In [14]:
# clean_item_code() runs regex substitutions, so the codes are cast to str first.
item_codes_as_text = df_cleaned_non_nan['ITEM_CODE'].astype(str)
df_cleaned_non_nan['ITEM_CODE_CLEANED'] = item_codes_as_text.apply(clean_item_code)
df_cleaned_non_nan[['ITEM_CODE', 'ITEM_CODE_CLEANED']]

Unnamed: 0,ITEM_CODE,ITEM_CODE_CLEANED
0,MC11560,MC11560
1,4556800,45568
11,ST00545,ST00545
12,ST00546,ST00546
13,GC01436,GC01436
...,...,...
50456,00047513A,47513A
50457,00047494A,47494A
50458,00047510A,47510A
50459,3079000,30790


In [175]:
# Per PO_NUM, count the line items whose ITEM_CODE is missing.
item_code_missing = df_cleaned_nan['ITEM_CODE'].isna()
po_grouped_df = item_code_missing.groupby(df_cleaned_nan['PO_NUM']).sum()

# Number of PO_NUMs with at least one NaN ITEM_CODE, and total number of NaN ITEM_CODEs.
po_count = (po_grouped_df > 0).sum()
item_code_count = po_grouped_df.sum()

print(po_count, item_code_count)

2462 5252


In [15]:
def keep_most_quantity(group):
    """Return the row of ``group`` that holds the largest ORDERED_QUANTITY.

    ``idxmax()`` gives the index label of the first maximum, so when several
    rows tie, the earliest of them is returned.
    """
    top_label = group['ORDERED_QUANTITY'].idxmax()
    return group.loc[top_label]

In [16]:
# De-duplicate line items with custom logic: for every (PO_NUM, ITEM_CODE_CLEANED)
# pair keep only the row with the largest ORDERED_QUANTITY (the first such row on ties).
#
# The winning row labels come from a column-wise groupby().idxmax() and are selected
# with .loc rather than groupby().apply(row_picker). groupby().apply() operating on
# the grouping columns is deprecated in pandas; once those columns are excluded,
# reset_index(drop=True) would discard PO_NUM / ITEM_CODE_CLEANED altogether.
dedup_keys = ['PO_NUM', 'ITEM_CODE_CLEANED']
max_quantity_labels = df_cleaned_non_nan.groupby(dedup_keys)['ORDERED_QUANTITY'].idxmax()
non_nan_filtered = df_cleaned_non_nan.loc[max_quantity_labels.to_numpy()].reset_index(drop=True)
non_nan_filtered.shape

  non_nan_filtered = df_cleaned_non_nan.groupby(['PO_NUM', 'ITEM_CODE_CLEANED']).apply(keep_most_quantity).reset_index(drop=True)


(42159, 60)

#### Find repeated Supplier Name against Supplier Code

In [48]:
def find_repeating_supplier_names(target_df):
    """List supplier names that are registered under more than one supplier code.

    Returns a Series indexed by SUPPLIER_NAME whose values are the arrays of
    distinct SUPPLIER_CODE values seen for that name; names with a single code
    are left out.
    """
    codes_by_name = target_df.groupby('SUPPLIER_NAME')['SUPPLIER_CODE'].unique()
    has_several_codes = codes_by_name.apply(len).gt(1)
    return codes_by_name[has_several_codes]

In [49]:
find_repeating_supplier_names(df_cleaned_non_nan)

SUPPLIER_NAME
ALPINE INDUSTRIAL SOLUTION                                                    [VCD000045, VD002001]
BAR CODE INDIA LIMITED                                                         [VD001568, VO000095]
DIGI-KEY ELECTRONICS..                                                         [VF010009, VO000101]
DISA INDIA LIMITED                                                            [VCD000027, VD004153]
ELECTROMECH FIRE FIGHTERS PVT. LTD.                                           [VCD000012, VD001831]
EVOKE GLOBAL                                                                  [VCD000116, VD003868]
GAINWELL COMMOSALES PRIVATE LIMITED                                           [VCD000011, VD004294]
GURUJI STEEL TRADING                                                          [VD002397, VCD000106]
INDOTECH ENGINEERS                                                            [VCD000136, VD002923]
IRUS ENGINEERING PVT LTD                                                      [VCD0000

In [52]:
# df_cleaned_non_nan[df_cleaned_non_nan['SUPPLIER_CODE'].isin(['VD000223', 'VLC002025'])]
df_cleaned_non_nan[df_cleaned_non_nan['SUPPLIER_CODE'].isin(['VCD000130', 'VD003949'])]

Unnamed: 0,#,DOCTYPE,CANCELED,DOCCUR,DOCRATE,DOCSTATUS,PO_NUM,BASE_TYPE,BASE_NUMBER,BASE_LINE,...,PO_OWNER_NAME,PYMNTGROUP,HEADER,FOOTER,BRANCH_NAME,IMP_OR_EXP,COST_SAVING,COST_SAVING_CRITERIA,FORCE_CLOSED_PO_QTY,ITEM_CODE_CLEANED
39678,4514,I,N,INR,1.0,O,60330,PR,102638.0,18.0,...,PRIYANSHI SAXENA,NET 45 DAYS,CONTACT DETAILS: MR. MANI BHUSHAN JHA M/S SEW-...,,NOIDA,N,,,0.0,29329
39679,4515,I,N,INR,1.0,O,60330,PR,102638.0,17.0,...,PRIYANSHI SAXENA,NET 45 DAYS,CONTACT DETAILS: MR. MANI BHUSHAN JHA M/S SEW-...,,NOIDA,N,,,0.0,29330
39680,4516,I,N,INR,1.0,O,60330,PR,102638.0,1.0,...,PRIYANSHI SAXENA,NET 45 DAYS,CONTACT DETAILS: MR. MANI BHUSHAN JHA M/S SEW-...,,NOIDA,N,,,0.0,28827
39681,4517,I,N,INR,1.0,O,60330,PR,102638.0,2.0,...,PRIYANSHI SAXENA,NET 45 DAYS,CONTACT DETAILS: MR. MANI BHUSHAN JHA M/S SEW-...,,NOIDA,N,,,0.0,28828
39682,4518,I,N,INR,1.0,O,60330,PR,102638.0,3.0,...,PRIYANSHI SAXENA,NET 45 DAYS,CONTACT DETAILS: MR. MANI BHUSHAN JHA M/S SEW-...,,NOIDA,N,,,0.0,28829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39886,49264,I,N,INR,1.0,C,242530222,PR,242550194.0,1.0,...,ASHUTOSH TIWARI,NET 30 DAYS,,,ECOTECH X,N,,,0.0,45897
39887,49354,I,N,INR,1.0,O,242530260,PR,242550031.0,2.0,...,SHOBHIT KHANDELWAL,NET 30 DAYS,,,ECOTECH X,N,,,0.0,52950
39888,49549,I,N,INR,1.0,O,242530332,PR,242550207.0,21.0,...,SHOBHIT KHANDELWAL,NET 30 DAYS,,,ECOTECH X,N,,,0.0,17070
39889,50002,I,N,INR,1.0,O,242530517,PR,242550401.0,3.0,...,ASHUTOSH TIWARI,NET 30 DAYS,,,ECOTECH X,N,,,0.0,38631


#### Find and Analyse Duplicates

In [141]:
# All rows whose (PO_NUM, ITEM_CODE_CLEANED) pair occurs more than once.
duplicate_keys = ['PO_NUM', 'ITEM_CODE_CLEANED']
duplicates = df_cleaned_non_nan[df_cleaned_non_nan.duplicated(subset=duplicate_keys, keep=False)]
columns_to_check = ['DOC_DATE', 'DELIVERY_DATE', 'PO_VALUE', 'ORDERED_QUANTITY', 'PRICE', 'ITEM_VALUE', 'FREIGHT', 'TAX_AMOUNT(LC)']

# One row per duplicated pair; a cell is True when the rows of that pair hold more
# than one distinct (non-null) value in that column. groupby()[cols].nunique() gives
# the same table as applying DataFrame.nunique() per group, without going through
# the deprecated groupby().apply() on the grouping columns.
differences = duplicates.groupby(duplicate_keys)[columns_to_check].nunique() > 1
# To keep only the pairs that differ in at least one column:
#   differences[differences.any(axis=1)]
differences

  differences = duplicates.groupby(['PO_NUM', 'ITEM_CODE_CLEANED']).apply(lambda group: group[columns_to_check].nunique() > 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,DOC_DATE,DELIVERY_DATE,PO_VALUE,ORDERED_QUANTITY,PRICE,ITEM_VALUE,FREIGHT,TAX_AMOUNT(LC)
PO_NUM,ITEM_CODE_CLEANED,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
53985,17379,False,False,False,True,False,True,False,True
54483,MC1547,False,False,False,False,False,False,False,False
54813,HK00108,False,False,False,True,False,True,False,True
55053,ELC10377,False,False,False,True,True,True,False,False
55376,EL13183,False,False,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...
242530533,18748,False,False,False,True,False,True,False,True
242530533,22798,False,False,False,True,False,True,False,True
242530533,3765,False,False,False,True,False,True,False,True
242530581,17888,False,False,False,True,False,True,False,True


In [150]:
df_cleaned_non_nan[(df_cleaned_non_nan['PO_NUM'] == 53985) & (df_cleaned_non_nan['ITEM_CODE_CLEANED'] == '17379')]

Unnamed: 0,#,DOCTYPE,CANCELED,DOCCUR,DOCRATE,DOCSTATUS,PO_NUM,BASE_TYPE,BASE_NUMBER,BASE_LINE,...,PO_OWNER_NAME,PYMNTGROUP,HEADER,FOOTER,BRANCH_NAME,IMP_OR_EXP,COST_SAVING,COST_SAVING_CRITERIA,FORCE_CLOSED_PO_QTY,ITEM_CODE_CLEANED
30720,24,I,N,INR,1.0,C,53985,-1,,,...,AKUL PRABHAKAR KADAM,NET-30,"ACTUAL PRICE FOR 00017381 IS 600, WE HAVE AMEN...",,NOIDA,N,,,0.0,17379
30721,25,I,N,INR,1.0,C,53985,-1,,,...,AKUL PRABHAKAR KADAM,NET-30,"ACTUAL PRICE FOR 00017381 IS 600, WE HAVE AMEN...",,NOIDA,N,,,0.0,17379
30722,26,I,N,INR,1.0,C,53985,-1,,,...,AKUL PRABHAKAR KADAM,NET-30,"ACTUAL PRICE FOR 00017381 IS 600, WE HAVE AMEN...",,NOIDA,N,,,0.0,17379


In [151]:
df_cleaned_non_nan[(df_cleaned_non_nan['PO_NUM'] == 55053) & (df_cleaned_non_nan['ITEM_CODE_CLEANED'] == 'ELC10377')]

Unnamed: 0,#,DOCTYPE,CANCELED,DOCCUR,DOCRATE,DOCSTATUS,PO_NUM,BASE_TYPE,BASE_NUMBER,BASE_LINE,...,PO_OWNER_NAME,PYMNTGROUP,HEADER,FOOTER,BRANCH_NAME,IMP_OR_EXP,COST_SAVING,COST_SAVING_CRITERIA,FORCE_CLOSED_PO_QTY,ITEM_CODE_CLEANED
4351,184,I,N,USD,75.73,C,55053,-1,,,...,KIRAN -,NET-30,,,NOIDA,Y,,,36000.0,ELC10377
4352,185,I,N,USD,75.73,C,55053,-1,,,...,KIRAN -,NET-30,,,NOIDA,Y,,,9000.0,ELC10377


### Load and Clean Electrical Parts Catalogue Dataset

In [17]:
# Read the electrical parts catalogue from the folder named by the
# ELECTRICAL_PARTS_FOLDER_PATH environment variable, then normalise the column
# labels (spaces -> underscores, upper case).
electrical_parts_file = os.path.join(os.getenv('ELECTRICAL_PARTS_FOLDER_PATH'), 'Electrical Parts Report Modified.csv')
electrical_parts = pd.read_csv(electrical_parts_file)
electrical_parts.columns = electrical_parts.columns.str.replace(' ', '_').str.upper()

In [18]:
electrical_parts.shape

(9359, 21)

In [19]:
# Element-wise clean-up of every cell: first ASCII normalisation, then upper-casing.
# Both helpers leave non-string values as they are.
electrical_parts_cleaned = (
    electrical_parts
    .apply(lambda col: col.apply(clean_text))
    .apply(lambda col: col.apply(to_uppercase))
)

In [20]:
# clean_item_code() runs regex substitutions, so the part ids are cast to str first.
part_ids_as_text = electrical_parts_cleaned['PART_ID'].astype(str)
electrical_parts_cleaned['PART_ID_CLEANED'] = part_ids_as_text.apply(clean_item_code)
electrical_parts_cleaned[['PART_ID', 'PART_ID_CLEANED']]

Unnamed: 0,PART_ID,PART_ID_CLEANED
0,41237,41237
1,52897,52897
2,54675,54675
3,57680,57680
4,64912,64912
...,...,...
9354,46980,46980
9355,46984,46984
9356,46987,46987
9357,46985,46985


In [21]:
electrical_parts_cleaned[electrical_parts_cleaned.duplicated(subset='PART_ID_CLEANED', keep=False)].sort_values(['PART_ID_CLEANED', 'PART_REVISION'])

Unnamed: 0,PART_ID,PART_NAME,PART_DESCRIPTION,PART_OWNER,PART_REVISION,REVISION_DATE,PRODUCT_GROUP,PRODUCT_SUBGROUP,WIDTH_(MM),HEIGHT_(MM),...,WEIGHT_(KG),MANUFACTURER,MANUFACTURER_PART_NUMBER,EPLAN_PART_NUMBER,OLD_PLM_ID,OLD_SAP_ID,RELEASED_STATUS,RELEASED_DATE,PART_CATEGORY,PART_ID_CLEANED
8498,14948,TERMINAL BLOCK,"MINI FEED-THROUGH TERMINAL BLOCK, NOM. VOLTAGE...",KUMARI MONIKA (KUMARIMONIKA),0,18-OCT-2022 11:36,3.0,1.0,0.0,0.0,...,0.0,PHOENIX.C,552024,PXC.552024,,,DEVELOPMENT,22-NOV-2022 11:56,ELECTRICAL PART,14948
8576,14948,"TERMINAL BLOCKS SD,CROSS SECTION (0.5 - 10SQMM...",MINI FEED-THROUGH TERMINAL BLOCK CROSS SECTION...,KUMARI MONIKA (KUMARIMONIKA),1,22-SEP-2023 18:32,3.0,1.0,0.0,0.0,...,0.0,PHOENIX.C,552024,PXC.552024,,,PRODUCTION,26-SEP-2023 12:02,ELECTRICAL PART,14948
4338,15244,DISTRIBUTION BLOCK,"DISTRIBUTION BLOCK, NOMINAL CURRENT: 41 A, CON...",KUMARI MONIKA (KUMARIMONIKA),0,18-OCT-2022 11:38,3.0,1.0,0.0,0.0,...,0.0,PHOENIX.C,1046985,PXC.1046985,,,DEVELOPMENT,22-NOV-2022 11:56,ELECTRICAL PART,15244
4343,15244,DISTRIBUTION BLOCK,"DISTRIBUTION BLOCK, NOMINAL CURRENT: 41 A, CON...",KUMARI MONIKA (KUMARIMONIKA),1,22-SEP-2023 18:08,3.0,1.0,0.0,0.0,...,0.0,PHOENIX.C,1046985,PXC.1046985,,,PRODUCTION,27-SEP-2023 17:43,ELECTRICAL PART,15244
4937,17190,FLIGHT CONTROLLER,AIKON F42020 FLIGHT CONTROLLER,KUMARI MONIKA (KUMARIMONIKA),0,19-OCT-2022 15:46,19.0,1.0,0.0,0.0,...,0.0,AIKON,F42020,AIK.F42020,,,DEVELOPMENT,22-NOV-2022 11:56,ELECTRICAL PART,17190
4938,17190,FLIGHT CONTROLLER,AIKON F42020 FLIGHT CONTROLLER,KUMARI MONIKA (KUMARIMONIKA),1,05-DEC-2023 13:40,19.0,1.0,0.0,0.0,...,0.0,AIKON,F42020,AIK.F42020,,,,,ELECTRICAL PART,17190
5706,17437,LIDAR,"SINGLE POINT LIDAR, 0.05M TO 40M, FOV 3 DEGREE...",KUMARI MONIKA (KUMARIMONIKA),0,19-OCT-2022 15:48,12.0,1.0,0.0,0.0,...,0.0,BENEWAKE,TF02PRO,BENE.TF02PRO,,,DEVELOPMENT,22-NOV-2022 11:56,ELECTRICAL PART,17437
5715,17437,"LIDAR-1D,COMMUNICATION PROTOCOL (UART), NON SA...","SINGLE POINT LIDAR, 0.05M TO 40M, FOV 3 DEGREE...",KUMARI MONIKA (KUMARIMONIKA),1,22-SEP-2023 18:06,12.0,1.0,0.0,0.0,...,0.0,BENEWAKE,TF02PRO,BENE.TF02PRO,,,PRODUCTION,26-SEP-2023 12:02,ELECTRICAL PART,17437
795,18274,ANTENNA,ANTENNAS 2.4GHZ/5GHZ WRT SERIES LOW PROFILE DO...,KUMARI MONIKA (KUMARIMONIKA),0,21-OCT-2022 10:15,8.0,1.0,0.0,0.0,...,0.0,LINX.T,ANT-DB1-WRT-RPS,LNX.ANT-DB1-WRT-RPS,,,DEVELOPMENT,22-NOV-2022 11:56,ELECTRICAL PART,18274
810,18274,"ANTENNA-OMNIDIRECTIONAL,2.4 & 5GHZ, 3.2DB",ANTENNAS 2.4GHZ/5GHZ WRT SERIES LOW PROFILE DO...,KUMARI MONIKA (KUMARIMONIKA),1,22-SEP-2023 18:41,8.0,1.0,0.0,0.0,...,0.0,LINX.T,ANT-DB1-WRT-RPS,LNX.ANT-DB1-WRT-RPS,,,PRODUCTION,26-SEP-2023 12:02,ELECTRICAL PART,18274


In [22]:
def keep_latest_revision(group):
    """Return the row of ``group`` that holds the highest PART_REVISION.

    ``idxmax()`` gives the index label of the first maximum, so when several
    rows tie, the earliest of them is returned.
    """
    newest_label = group['PART_REVISION'].idxmax()
    return group.loc[newest_label]

In [23]:
# Keep one row per PART_ID_CLEANED: the one with the highest PART_REVISION
# (the first such row on ties).
#
# The winning row labels come from a column-wise groupby().idxmax() and are selected
# with .loc rather than groupby().apply(row_picker). groupby().apply() operating on
# the grouping columns is deprecated in pandas; once those columns are excluded,
# reset_index(drop=True) would discard PART_ID_CLEANED altogether.
latest_revision_labels = electrical_parts_cleaned.groupby('PART_ID_CLEANED')['PART_REVISION'].idxmax()
electrical_cleaned_filtered = electrical_parts_cleaned.loc[latest_revision_labels.to_numpy()].reset_index(drop=True)
electrical_cleaned_filtered

  electrical_cleaned_filtered = electrical_parts_cleaned.groupby('PART_ID_CLEANED').apply(keep_latest_revision).reset_index(drop=True)


Unnamed: 0,PART_ID,PART_NAME,PART_DESCRIPTION,PART_OWNER,PART_REVISION,REVISION_DATE,PRODUCT_GROUP,PRODUCT_SUBGROUP,WIDTH_(MM),HEIGHT_(MM),...,WEIGHT_(KG),MANUFACTURER,MANUFACTURER_PART_NUMBER,EPLAN_PART_NUMBER,OLD_PLM_ID,OLD_SAP_ID,RELEASED_STATUS,RELEASED_DATE,PART_CATEGORY,PART_ID_CLEANED
0,10071,10071,,HIMANSHU MEHTA (HIMANSHUMEHTA),0,16-APR-2024 15:45,,,,,...,,,,,,,,,ELECTRICAL PART,10071
1,14730,MOTOR,"100W, 24VDC BLDC MOTOR, 90MM FRAME, 2500RPM,6A...",KUMARI MONIKA (KUMARIMONIKA),0,13-OCT-2022 20:20,9.0,1.0,0.0,0.0,...,0.0,ORIENTAL,BLHM5100K-GFS,ORM.BLHM5100K-GFS,,,PRODUCTION,18-JAN-2023 20:06,ELECTRICAL PART,14730
2,14857,"CABLE,M8 RIGHT ANGLED TO FREE LEADS,2M","SENSOR/ACTUATOR CABLE, 3-POSITION, PVC, YELLOW...",KUMARI MONIKA (KUMARIMONIKA),0,18-OCT-2022 11:35,29.0,184.0,0.0,0.0,...,0.0,PHEONIX.C,1406321,PXC.1406321,,,,,ELECTRICAL PART,14857
3,14858,CONNECTOR,"CONNECTOR, UNIVERSAL, 5-POSITION, UNSHIELDED, ...",KUMARI MONIKA (KUMARIMONIKA),0,18-OCT-2022 11:35,4.0,215.0,0.0,0.0,...,0.0,PHEONIX.C,1424689,PXC.1424689,,,PRODUCTION,13-JUL-2024 21:11,ELECTRICAL PART,14858
4,14859,"AIL ADAPTER FOR M3 SCREWS,,BLACK","WIDTH10 MM, HEIGHT19 MM, LENGTH42.6 MM",KUMARI MONIKA (KUMARIMONIKA),0,18-OCT-2022 11:35,17.0,1.0,0.0,0.0,...,0.0,PHEONIX.C,1200993,PXC.1200993,,,,,ELECTRICAL PART,14859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9346,65209,"PRE FAB CABLE-3M, WIRE HARNESS, ECONOSEAL, FRE...",PRE FABRICATED AUTOMOTIVE CONNECTORS WITH FLYI...,KUMARI MONIKA (KUMARIMONIKA),0,25-SEP-2024 17:40,29.0,1.0,0.0,0.0,...,0.0,MOTHERSON,AT0699AA,MSSL.AT0699AA,,,PRODUCTION,26-SEP-2024 10:15,ELECTRICAL PART,65209
9347,65210,CONNECTOR- RJ45,RJ45 90 DEGREE CONNECTOR,KUMARI MONIKA (KUMARIMONIKA),0,25-SEP-2024 17:40,17.0,1.0,0.0,0.0,...,0.0,PHOENIX.C,1421128,PXC.CUC-IND - C1ZNI - T/R4IP8,,,PRODUCTION,26-SEP-2024 10:15,ELECTRICAL PART,65210
9348,65220,"LUG-CU, TUBE, 10MM2",CABLE LUG; SUITABLE CRIMPING INSERT: B7; FOR C...,KUMARI MONIKA (KUMARIMONIKA),0,25-SEP-2024 17:54,17.0,1.0,10.0,29.0,...,0.0,LAPP,61796650,LAPP.61796650,,,PRODUCTION,26-SEP-2024 10:15,ELECTRICAL PART,65220
9349,65221,"COMM MOD-WIFI,CLIENT, TRX A/B, PCI EXPRESS ,20M",WIFI5 11AC 2TX2R + BT (V5.0 LE) COMBO MODULE W...,KUMARI MONIKA (KUMARIMONIKA),0,25-SEP-2024 17:56,8.0,1.0,26.8,30.0,...,0.0,ENLI,Q6174AH,ENL-Q6174AH,,,,,ELECTRICAL PART,65221


In [24]:
electrical_cleaned_filtered['PART_ID'].nunique()

9351

### Merge the 2 cleaned datasets on Item Code & Part ID

In [25]:
# Inner join: keep only the PO line items whose cleaned item code equals a
# cleaned part id in the de-duplicated parts catalogue.
merged_data_item_code = non_nan_filtered.merge(
    electrical_cleaned_filtered,
    how="inner",
    left_on="ITEM_CODE_CLEANED",
    right_on="PART_ID_CLEANED",
)

In [26]:
merged_data_item_code.shape

(9796, 82)

In [27]:
# verifying that no repetition is there
merged_data_item_code[merged_data_item_code['PO_NUM'] == 60186]

Unnamed: 0,#,DOCTYPE,CANCELED,DOCCUR,DOCRATE,DOCSTATUS,PO_NUM,BASE_TYPE,BASE_NUMBER,BASE_LINE,...,WEIGHT_(KG),MANUFACTURER,MANUFACTURER_PART_NUMBER,EPLAN_PART_NUMBER,OLD_PLM_ID,OLD_SAP_ID,RELEASED_STATUS,RELEASED_DATE,PART_CATEGORY,PART_ID_CLEANED
948,3874,I,N,INR,1.0,C,60186,-1,,,...,,ICOTEK,50741,ICO.50741,,,PRODUCTION,17-JUL-2023 16:17,ELECTRICAL PART,18165


In [28]:
# Columns carried into the final dataset, grouped by origin.
important_columns = [
    # purchase-order header
    'PO_NUM', 'DOCCUR', 'DOCRATE', 'SUPPLIER_CODE', 'SUPPLIER_NAME',
    'DOC_DATE', 'DELIVERY_DATE', 'PO_VALUE', 'LOCATION',
    # purchase-order line item
    'ITEM_CODE_CLEANED', 'ITEM_NAME', 'ITEM_DETAILS', 'UOM', 'ORDERED_QUANTITY',
    'PRICE', 'ITEM_VALUE', 'TAX_AMOUNT(LC)', 'LINETOTAL_WITH_TAX_(LC)',
    # parts catalogue
    'PART_ID_CLEANED', 'PART_NAME', 'PART_DESCRIPTION',
    'WIDTH_(MM)', 'HEIGHT_(MM)', 'DEPTH_(MM)', 'MOUNTING_CLEARANCES_(MM)', 'WEIGHT_(KG)',
]
filtered_data_final = merged_data_item_code[important_columns]
filtered_data_final

Unnamed: 0,PO_NUM,DOCCUR,DOCRATE,SUPPLIER_CODE,SUPPLIER_NAME,DOC_DATE,DELIVERY_DATE,PO_VALUE,LOCATION,ITEM_CODE_CLEANED,...,TAX_AMOUNT(LC),LINETOTAL_WITH_TAX_(LC),PART_ID_CLEANED,PART_NAME,PART_DESCRIPTION,WIDTH_(MM),HEIGHT_(MM),DEPTH_(MM),MOUNTING_CLEARANCES_(MM),WEIGHT_(KG)
0,53586,INR,1.0000,VD001614,ITG SOFTWARE ENGINEERING (I) PVT. LTD,2021-10-01,2022-11-30,2.764740e+07,SEA,17340,...,3807000.0,2.495700e+07,17340,SINGLE BOARD COMPUTER,SINGLE BOARD COMPUTERS UP SQUARED BOARD WITH C...,0.0,0.0,0.0,,0.000
1,53985,INR,1.0000,VD000561,PEPPERL+FUCHS FACTORY AUTOMATION PVT. LTD,2021-10-19,2022-03-30,5.073832e+06,JAGATSINGHAPUR,17360,...,3013.2,1.975320e+04,17360,FEMALE CONNECTOR M12 STRAIGHT A-CODED 8-PIN,"FEMALE CONNECTOR, FIELD-ATTACHABLE V19-G-ABG-P...",,,,,
2,53985,INR,1.0000,VD000561,PEPPERL+FUCHS FACTORY AUTOMATION PVT. LTD,2021-10-19,2022-03-30,5.073832e+06,JAGATSINGHAPUR,17365,...,80352.0,5.267520e+05,17365,CAMERA BASEDD LINEAR POSITIONING SENSOR,,,,,,
3,53985,INR,1.0000,VD000561,PEPPERL+FUCHS FACTORY AUTOMATION PVT. LTD,2021-10-19,2022-03-30,5.073832e+06,JAGATSINGHAPUR,17376,...,90720.0,5.947200e+05,17376,DISTANCE SENSOR,OPERATING VOLTAGE UB 10 ... 30 V DC / WHEN OPE...,,,,,
4,53985,INR,1.0000,VD000561,PEPPERL+FUCHS FACTORY AUTOMATION PVT. LTD,2021-10-19,2022-03-30,5.073832e+06,JAGATSINGHAPUR,17378,...,14774.4,9.685440e+04,17378,"FEMALE CORDSET, FIELD ATTACHABLE",OPERATING VOLTAGE UB MAX. 30 V DC OPERATING CU...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9791,242530635,JPY,0.5358,VF003119,"ITOH DENKI CO., LTD.",2024-05-14,2024-06-25,3.135394e+05,USA,50340,...,0.0,2.829024e+03,50340,"CONNECTOR-10P,FEMALE,6A,0.5SQMM,LIGHT GREY",CONDUCTOR FEMALE CONNECTOR; 0.5 MM<POW2>; PIN ...,0.0,0.0,0.0,,0.005
9792,242530635,JPY,0.5358,VF003119,"ITOH DENKI CO., LTD.",2024-05-14,2024-06-25,3.135394e+05,USA,55993,...,0.0,1.420406e+05,55993,CONTROLLER-24V DC,"CONTROLLER CARD,24V DC,IP20,10.8 MS MOTOR STAR...",65.0,233.0,0.0,,0.000
9793,242530635,JPY,0.5358,VF003119,"ITOH DENKI CO., LTD.",2024-05-14,2024-06-25,3.135394e+05,USA,55994,...,0.0,6.886102e+04,55994,CONTROLLER-24V DC,"CONTROLLER CARD,24V DC,IP20,43.2 MS MOTOR STAR...",84.3,160.0,30.5,,0.000
9794,242530635,JPY,0.5358,VF003119,"ITOH DENKI CO., LTD.",2024-05-14,2024-06-25,3.135394e+05,USA,56001,...,0.0,1.093032e+03,56001,"CONNECTOR-3P,FEMALE,10A,1.5 MM2,LICHTGRAU","FEMALE PLUG PIN SPACING 3.5 MM,0.08 TO 1.5 MM2...",12.7,13.4,18.6,,2.458


In [234]:
filtered_data_final.to_csv('filtered_data_final.csv')

In [29]:
filtered_data_final[['SUPPLIER_CODE', 'SUPPLIER_NAME']].nunique()

SUPPLIER_CODE    261
SUPPLIER_NAME    260
dtype: int64

In [38]:
null_suppliers = filtered_data_final[filtered_data_final['SUPPLIER_NAME'].isnull()]
null_suppliers

Unnamed: 0,PO_NUM,DOCCUR,DOCRATE,SUPPLIER_CODE,SUPPLIER_NAME,DOC_DATE,DELIVERY_DATE,PO_VALUE,LOCATION,ITEM_CODE_CLEANED,...,TAX_AMOUNT(LC),LINETOTAL_WITH_TAX_(LC),PART_ID_CLEANED,PART_NAME,PART_DESCRIPTION,WIDTH_(MM),HEIGHT_(MM),DEPTH_(MM),MOUNTING_CLEARANCES_(MM),WEIGHT_(KG)


In [50]:
# duplicate_names = filtered_data_final[filtered_data_final.duplicated('SUPPLIER_NAME', keep=False)]
find_repeating_supplier_names(filtered_data_final)

SUPPLIER_NAME
SEW EURODRIVE INDIA P LTD    [VCD000130, VD003949]
TOSHIBA INDIA PVT LTD        [VD000223, VLC002025]
Name: SUPPLIER_CODE, dtype: object

### Merge the 2 datasets on Item Name & Part Name

In [178]:
# Try to match PO lines that have no ITEM_CODE to catalogue parts by exact
# equality of ITEM_NAME and PART_NAME (inner join, so unmatched lines are dropped).
# NOTE(review): the displayed result repeats one PO line against several different
# parts, i.e. a name can match many catalogue rows — confirm this fan-out is intended.
# NOTE(review): pandas merge pairs null keys with each other; verify that rows with a
# missing ITEM_NAME cannot pair up with parts that have a missing PART_NAME.
merged_data_item_name = pd.merge(df_cleaned_nan, electrical_cleaned_filtered, how="inner", left_on="ITEM_NAME", right_on="PART_NAME")
merged_data_item_name

Unnamed: 0,#,DOCTYPE,CANCELED,DOCCUR,DOCRATE,DOCSTATUS,PO_NUM,BASE_TYPE,BASE_NUMBER,BASE_LINE,...,WEIGHT_(KG),MANUFACTURER,MANUFACTURER_PART_NUMBER,EPLAN_PART_NUMBER,OLD_PLM_ID,OLD_SAP_ID,RELEASED_STATUS,RELEASED_DATE,PART_CATEGORY,PART_ID_CLEANED
0,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-D/S-CA,SONI.PWS-D/S-CA,,,PRODUCTION,13-FEB-2023 12:31,ELECTRICAL PART,31151
1,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-440/230-CA,SONI.PWS-440/230-CA,,,PRODUCTION,13-FEB-2023 12:31,ELECTRICAL PART,31152
2,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-230-CA,SONI.PWS-230-CA,,,PRODUCTION,13-FEB-2023 12:31,ELECTRICAL PART,31153
3,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-440-CA,SONI.PWS-440-CA,,,PRODUCTION,13-FEB-2023 12:31,ELECTRICAL PART,31154
4,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-AS-CA,SONI.PWS-AS-CA,,,PRODUCTION,13-FEB-2023 12:31,ELECTRICAL PART,31155
5,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-480/110-CA,SONI.PWS-480/110-CA,,,PRODUCTION,13-FEB-2023 12:31,ELECTRICAL PART,31158
6,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-480-CA,SONI.PWS-480-CA,,,PRODUCTION,13-FEB-2023 12:31,ELECTRICAL PART,31159
7,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-110-CA,SONI.PWS-110-CA,,,PRODUCTION,16-FEB-2023 13:38,ELECTRICAL PART,3116
8,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-400/230-CA,SONI.PWS-400/230-CA,,,PRODUCTION,17-APR-2023 10:50,ELECTRICAL PART,3392
9,30747,S,N,INR,1.0,C,232432530,PR,232402827.0,4.0,...,,SONI,PWS-400-CA,SONI.PWS-400-CA,,,PRODUCTION,17-APR-2023 10:50,ELECTRICAL PART,33921


#### Check whether the PO numbers with NaN item codes overlap with the PO numbers in the merged data

In [218]:
nan_item_code_po_nums = df_cleaned_nan['PO_NUM'].unique()
len(nan_item_code_po_nums)

2462

In [219]:
merged_data_po_nums = merged_data_item_code['PO_NUM'].unique()
len(merged_data_po_nums)

3825

In [221]:
print(len(np.union1d(nan_item_code_po_nums, merged_data_po_nums)))
print(len(np.intersect1d(nan_item_code_po_nums, merged_data_po_nums)))

6287
0
