In [83]:
import pandas as pd 
import numpy as np 

certs = pd.read_csv('./cleaned_datasets/corrected_supplier_certifications.csv')

In [84]:
print(certs.shape)
certs.head()

(1960, 3)


Unnamed: 0,SUPPLIER_NAME,ITEM_NAME,CERTIFICATIONS
0,Pepperl+Fuchs Factory Automation Pvt. Ltd,robotics arm,"US, Canada, Europe"
1,PHOENIX CONTACT INDIA PVT LTD,hydraulic actuator,Global Certification
2,ELECTRICAL CONTROL SYSTEMS PVT. LTD.,frequency inverter,"US, Canada, Europe"
3,EVOKE GLOBAL,servo amplifier,Global Certification
4,KUDAMM CORPORATION,DC motor,Global Certification


In [85]:
print(f"# of unique suppliers: {certs['SUPPLIER_NAME'].nunique()}")

# of unique suppliers: 40


In [86]:
set_certs = set()
certs['CERTIFICATIONS'] = certs['CERTIFICATIONS'].apply(lambda x: str(x).split(','))
certs

Unnamed: 0,SUPPLIER_NAME,ITEM_NAME,CERTIFICATIONS
0,Pepperl+Fuchs Factory Automation Pvt. Ltd,robotics arm,"[US, Canada, Europe]"
1,PHOENIX CONTACT INDIA PVT LTD,hydraulic actuator,[Global Certification]
2,ELECTRICAL CONTROL SYSTEMS PVT. LTD.,frequency inverter,"[US, Canada, Europe]"
3,EVOKE GLOBAL,servo amplifier,[Global Certification]
4,KUDAMM CORPORATION,DC motor,[Global Certification]
...,...,...,...
1955,RITTAL INDIA PRIVATE LIMITED,AC motor,[Global Certification]
1956,NETWORK ELECTRONICS,frequency inverter,"[Australia, India]"
1957,Kinco ElectricShenzhenLtd.,infrared camera,"[US, Canada, Europe]"
1958,"SHANGHAI QUISURE INFO-TECH CO.,LTD",VFD drive,"[Australia, India]"


In [87]:

# turn the CERTIFICATIONS column into list type and then go through items in the lists for all values and create a cummulative set that contains all unique values
for cert_list  in certs['CERTIFICATIONS']:
    for cert in cert_list:
        set_certs.add(cert.strip())



In [88]:
set_certs

{'Asia',
 'Australia',
 'Canada',
 'Europe',
 'Global Certification',
 'India',
 'US'}

In [89]:
import datetime 
from datetime import timedelta
pr_df = pd.read_csv('./cleaned_datasets/enlarged_dataset.csv')
pr_df["DOWNPAYMENT_DATE"] = pd.to_datetime(pr_df["DOWNPAYMENT_DATE"])
pr_df["DELIVERY_DATE"] = pd.to_datetime(pr_df["DELIVERY_DATE"])
pr_df['DAYS_TAKEN_TO_DELIVER'] =  (pr_df["DELIVERY_DATE"] - pr_df["DOWNPAYMENT_DATE"]).dt.days
pr_df['FAULTED_PARTS_RATE'] = pr_df['FAULTED_PARTS'] / pr_df['ORDERED_QUANTITY']
pr_df.dtypes

PO_NUM                            int64
ITEM_NAME                        object
PART_DESCRIPTION                 object
ITEM_CODE                        object
SUPPLIER_NAME                    object
SUPPLIER_CODE                    object
ORDERED_QUANTITY                  int64
FAULTED_PARTS                     int64
PRICE                           float64
PO_VALUE                        float64
DOWNPAYMENT_DATE         datetime64[ns]
DELIVERY_DATE            datetime64[ns]
DAYS_TAKEN_TO_DELIVER             int64
FAULTED_PARTS_RATE              float64
dtype: object

In [90]:
pr_df.shape

(58667, 14)

In [91]:

unique_suppliercode_itemcodes = pr_df.groupby(['SUPPLIER_CODE','ITEM_CODE','PART_DESCRIPTION','ITEM_NAME','SUPPLIER_NAME']).agg({
    'PRICE': ['mean', 'min', 'max'],
    'DAYS_TAKEN_TO_DELIVER': ['mean', 'min', 'max'],
    'FAULTED_PARTS_RATE': ['mean', 'min', 'max']
}).reset_index()


In [92]:
unique_suppliercode_itemcodes.columns = ['_'.join(col).strip() if col[1] else col[0] for col in unique_suppliercode_itemcodes.columns.values]
unique_suppliercode_itemcodes

Unnamed: 0,SUPPLIER_CODE,ITEM_CODE,PART_DESCRIPTION,ITEM_NAME,SUPPLIER_NAME,PRICE_mean,PRICE_min,PRICE_max,DAYS_TAKEN_TO_DELIVER_mean,DAYS_TAKEN_TO_DELIVER_min,DAYS_TAKEN_TO_DELIVER_max,FAULTED_PARTS_RATE_mean,FAULTED_PARTS_RATE_min,FAULTED_PARTS_RATE_max
0,VD101149,KUD1577,6-axis robotic arm,Robotic Arm,KUDAMM CORPORATION,3912.798974,3160.76,4634.45,49.307692,5,96,0.043356,0.0,1.0
1,VD101149,KUD1784,High-precision 10K resistor,Resistor,KUDAMM CORPORATION,1789.776562,1462.33,2180.79,55.125000,4,100,0.038319,0.0,1.0
2,VD101149,KUD2726,RTD temperature sensor,Temperature Sensor,KUDAMM CORPORATION,4143.314107,3347.89,4966.13,52.642857,5,100,0.037035,0.0,1.0
3,VD101149,KUD4331,Industrial control cabinet with locks,Control Cabinet,KUDAMM CORPORATION,4867.812923,4062.61,5961.11,52.784615,5,98,0.043404,0.0,1.0
4,VD101149,KUD4388,High-speed servo motor driver,Servo Motor Driver,KUDAMM CORPORATION,2838.727778,2274.86,3341.26,51.428571,5,100,0.023121,0.0,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,VD996609,GLO8255,Industrial control cabinet with locks,Control Cabinet,GLOBAL MECHATRONICS CONTROL,507.826727,414.14,615.06,62.345455,4,99,0.125287,0.0,1.0
1196,VD996609,GLO8285,6-axis robotic arm,Robotic Arm,GLOBAL MECHATRONICS CONTROL,2797.933750,2272.10,3325.96,48.104167,3,100,0.054814,0.0,1.0
1197,VD996609,GLO8484,High-precision 10K resistor,Resistor,GLOBAL MECHATRONICS CONTROL,736.526042,614.24,886.03,49.833333,6,99,0.022343,0.0,0.1
1198,VD996609,GLO8620,Inductive proximity sensor,Proximity Sensor,GLOBAL MECHATRONICS CONTROL,1413.010800,1141.44,1704.22,49.100000,4,97,0.054182,0.0,1.0


In [93]:
import random 

def generate_certifications():
    if random.random() < 0.05:  # 20% chance to include "Global Certification"
        return ['Global Certification']
    else:
        return random.sample(set_certs - {'Global Certification'}, random.randint(1, len(set_certs) - 1))


In [94]:
unique_suppliercode_itemcodes['CERTIFICATIONS'] = unique_suppliercode_itemcodes.apply(lambda x: generate_certifications(), axis=1)

since Python 3.9 and will be removed in a subsequent version.
  return random.sample(set_certs - {'Global Certification'}, random.randint(1, len(set_certs) - 1))


In [95]:
unique_suppliercode_itemcodes.to_csv('./models/itemset_metrics.csv')

In [98]:
unique_suppliercode_itemcodes[['SUPPLIER_CODE','ITEM_CODE','CERTIFICATIONS']].to_csv('./cleaned_datasets/enlarged_supplier_certifications.csv')