In [23]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = './tvae_synthetic_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()

# Check for missing values
missing_values = data.isnull().sum()

# print column names
print(data.columns)

Index(['#', 'DOCRATE', 'PO_NUM', 'BASE_NUMBER', 'BASE_LINE', 'PO_VALUE',
       'ORDERED_QUANTITY', 'PRICE', 'ITEM_VALUE', 'FREIGHT', 'TAX_AMOUNT(LC)',
       'LINETOTAL_WITH_TAX_(LC)', 'BALANCE_QUNATITY', 'OPEN_QTY_IN_OPENING',
       'OPEN_ITEM_VALUE', 'ORDER_QTY', 'GRN_/_SHIPMENT_QTY',
       'GRN_/_SHIPMENT_VAL', 'DOWNPAYMENT_VALUE', 'PAID_AMOUNT', 'OPEN_GR_QTY',
       'GR_OPEN_VALUE', 'COST_SAVING', 'FORCE_CLOSED_PO_QTY', 'PART_ID',
       'PART_REVISION', 'PRODUCT_GROUP', 'PRODUCT_SUBGROUP', 'WIDTH_(MM)',
       'HEIGHT_(MM)', 'DEPTH_(MM)', 'WEIGHT_(KG)', 'DOCTYPE', 'CANCELED',
       'DOCCUR', 'DOCSTATUS', 'BASE_TYPE', 'SUPPLIER_NAME', 'SUPPLIER_CODE',
       'DOC_DATE', 'DELIVERY_DATE', 'BUYER', 'BP_PROJECT_CODE',
       'BP_PROJECT_NAME', 'LINE_PROJECT_CODE', 'LINE_PROJECT_NAME', 'LOCATION',
       'ITEM_CODE', 'ITEM_NAME', 'FG_ITEM_DESCRIPTION', 'ITEM_DETAILS',
       'MANUFACTURER_PART_NO', 'HSN', 'UOM', 'TAX_CODE', 'USER_CODE',
       'DOWNPAYMENT_DATE', 'MANUALLY_CLOSED_P

In [24]:
# print the shape of the dataset
print("Dataset Shape: ", data.shape)

# print the data types of the columns
print(data.dtypes)

# print the summary statistics of the dataset
print(data.describe())


Dataset Shape:  (50000, 75)
#                             int64
DOCRATE                     float64
PO_NUM                        int64
BASE_NUMBER                   int64
BASE_LINE                     int64
                             ...   
MANUFACTURER_PART_NUMBER     object
EPLAN_PART_NUMBER            object
RELEASED_STATUS              object
RELEASED_DATE                object
PART_CATEGORY                object
Length: 75, dtype: object
                  #       DOCRATE        PO_NUM   BASE_NUMBER     BASE_LINE  \
count  50000.000000  50000.000000  5.000000e+04  5.000000e+04  50000.000000   
mean   27940.460120      3.791708  1.534599e+08  1.980236e+08      2.443860   
std    15946.595956     14.883865  1.103219e+08  8.285468e+07      1.967589   
min        9.000000      0.725667  5.358600e+04  1.000080e+05      1.000000   
25%    14735.250000      0.955578  4.521165e+05  2.320370e+08      1.000000   
50%    28494.000000      0.998351  2.320291e+08  2.323268e+08      2.000000 

In [36]:
focus_columns = ['PO_NUM', 'ITEM_CODE', 'ITEM_NAME','SUPPLIER_NAME', 'SUPPLIER_CODE','ORDERED_QUANTITY', 'PRICE', 'PO_VALUE', 'DOWNPAYMENT_DATE','DELIVERY_DATE', 'PART_DESCRIPTION']
df = data[focus_columns]
print(df.shape)
df[0:].head(10)

(50000, 11)


Unnamed: 0,PO_NUM,ITEM_CODE,ITEM_NAME,SUPPLIER_NAME,SUPPLIER_CODE,ORDERED_QUANTITY,PRICE,PO_VALUE,DOWNPAYMENT_DATE,DELIVERY_DATE,PART_DESCRIPTION
0,53586,14873,wire,SECURITAS ENGINEERS,VD000471,275.9,209.326401,0.0,2/27/2023,6/10/2023,"Power and control cable; 3G4; 11.1 mm; 1,1 kV;..."
1,233001368,2744400,ELCB,SECURITAS ENGINEERS,VD000471,57.8,3886.681121,136381.0189,2/27/2023,11/30/2023,Earth leakage circuit breaker - DPN N Vigi - 1...
2,977417,2277700,Ethernet I/O module,TAC Automation Pvt.Ltd- UP,VD002456,16.2,10244.52224,249007.2534,2/27/2023,2/2/2023,Compact IP20 multiprotocol Ethernet I/O module...
3,53586,18054,Cable,IGUS INDIA PRIVATE LIMITED,VD000478,50.0,1300.563242,401538.8736,2/27/2023,3/30/2023,DB25 male breakout cable with DB9 female and f...
4,233033749,2656800,BUSBAR ALUMINUM,MEERA ENTERPRISES,VD003901,9.7,0.01,0.0,2/27/2023,2/10/2024,BUSBAR 25X6MM ALUMINUM 2.5 MTR HINDALCO
5,53586,2651200,Remote IO Module,SICK INDIA Pvt. Ltd.,VD000068,36.6,131.818981,250077.7687,2/27/2023,9/16/2022,"Junction box M12, ports 8 SBL-08D12-KC05 SENS...."
6,53586,15215,Signal cable pre-assembled,Diagnostic Automation & Controls Pvt. Ltd.,VD000915,0.0,1787.705356,0.0,2/27/2023,9/18/2022,Power cable pre-assembled 6FX3002-5CK01-1BA0 4...
7,232558996,5046100,CONNECTOR - DSUB15,NETWORK ELECTRONICS,VF003119,12.7,0.01,28219.83158,2/27/2023,5/20/2024,"Low Profile Right Angle Adapter, HD15 Male / F..."
8,701936,14972,Antenna,Diagnostic Automation & Controls Pvt. Ltd.,VD000915,5.5,0.01,141159.1195,2/27/2023,8/10/2022,SOMANET Circulo 9 IO Cable Kit (S-042_O-04)
9,231513996,18128,Relay,SUGI ELECTRONICS LLP,VD003613,15.5,1560.619156,0.0,2/27/2023,11/15/2023,"Monitoring Safety Relay, Dual Input, 2 Dual Ch..."


In [26]:
# data cleaning

# lower case ITEM_NAME values and remove leading/trailing whitespaces
df['ITEM_NAME'] = df['ITEM_NAME'].str.lower().str.strip()

# lower case SUPPLIER_NAME values and remove leading/trailing whitespaces
df['SUPPLIER_NAME'] = df['SUPPLIER_NAME'].str.lower().str.strip()

# change PRICE float precision to 2 decimal places
df['PRICE'] = df['PRICE'].round(2)

# change PO_VALUE float precision to 2 decimal places
df['PO_VALUE'] = df['PO_VALUE'].round(2)

# convert DOC_DATE and DELIVERY_DATE to datetime
df['DOWNPAYMENT_DATE'] = pd.to_datetime(df['DOWNPAYMENT_DATE'])
df['DELIVERY_DATE'] = pd.to_datetime(df['DELIVERY_DATE'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ITEM_NAME'] = df['ITEM_NAME'].str.lower().str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SUPPLIER_NAME'] = df['SUPPLIER_NAME'].str.lower().str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PRICE'] = df['PRICE'].round(2)
A value is trying to be set on a copy of a sli

In [27]:
# basic data exploration
# check for unique values in the columns
unique_values = df.nunique()

print(unique_values)

PO_NUM              39871
ITEM_CODE            1607
ITEM_NAME             832
SUPPLIER_NAME         163
SUPPLIER_CODE         161
ORDERED_QUANTITY     3658
PRICE               37485
PO_VALUE            36791
DOWNPAYMENT_DATE       22
DELIVERY_DATE         475
dtype: int64


In [28]:
# check for duplicate rows
duplicate_rows = df.duplicated().sum()
print("Number of duplicate rows: ", duplicate_rows)

Number of duplicate rows:  0


In [29]:
df.describe()

Unnamed: 0,PO_NUM,ORDERED_QUANTITY,PRICE,PO_VALUE,DOWNPAYMENT_DATE,DELIVERY_DATE
count,50000.0,50000.0,50000.0,50000.0,50000,50000
mean,153459900.0,66.659768,4085.264331,462728.9,2023-02-27 05:01:45.984000,2023-09-27 22:57:23.328000
min,53586.0,0.0,0.01,0.0,2022-12-08 00:00:00,2022-01-30 00:00:00
25%,452116.5,3.0,147.705,0.0,2023-02-27 00:00:00,2023-04-28 00:00:00
50%,232029100.0,23.6,1314.925,152842.5,2023-02-27 00:00:00,2023-12-15 00:00:00
75%,232612300.0,46.2,2736.225,316042.1,2023-02-27 00:00:00,2024-02-22 00:00:00
max,242530600.0,17072.5,435095.76,152106500.0,2024-05-14 00:00:00,2024-12-31 00:00:00
std,110321900.0,399.006017,12758.957025,5298642.0,,


In [30]:
# calculate the number of suppliers for a different ITEM NAMEs
supplier_count = df.groupby('ITEM_NAME')['SUPPLIER_NAME'].nunique()
print(supplier_count)



ITEM_NAME
0.25 sq/mm wire white/blue                                                3
0.5 sq.mm single core cable - blue, lapp                                  2
0.5 sq/mm wire white/blue                                                 5
0.75 sq/mm wire white/blue                                                2
16 digital outputs, remote, modular i/o system                            2
                                                                         ..
wireharness battery                                                       4
wireharness circuit                                                       2
wiring duct 25*45                                                         4
y connector                                                               1
y-link for connection of single-channel dp slaves to s7-400h & as 410h    1
Name: SUPPLIER_NAME, Length: 832, dtype: int64


In [31]:
# print items in the order of the highest number of suppliers
supplier_count = supplier_count.sort_values(ascending=False)
print(supplier_count)

ITEM_NAME
contactor                                                                 61
drive                                                                     47
limit switch                                                              45
relay                                                                     40
controller                                                                40
                                                                          ..
heat sink                                                                  1
classic 100 0.75 sqmm triple core pvc control cable                        1
hybrid motor starter reversing control load :2.4a 3ph                      1
software-server10c                                                         1
y-link for connection of single-channel dp slaves to s7-400h & as 410h     1
Name: SUPPLIER_NAME, Length: 832, dtype: int64


In [32]:
# replace the values in the DOWNPAYMENT_DATE column with random dates from 2021-01-01 to 2023-12-31
import random
import datetime

start_date = datetime.date(2021, 1, 1)
end_date = datetime.date(2023, 12, 31)

df['DOWNPAYMENT_DATE'] = [random.choice(pd.date_range(start_date, end_date)) for _ in range(len(df))]
df['DOWNPAYMENT_DATE'] = pd.to_datetime(df['DOWNPAYMENT_DATE'])

# replace the values in the DELIVERY_DATE column with dates with a range of 1 to 150 days from the DOWNPAYMENT_DATE
df['DELIVERY_DATE'] =  df['DOWNPAYMENT_DATE'].apply((lambda x: x+ datetime.timedelta(days=random.randint(1, 90))))  
df['DELIVERY_DATE'] = pd.to_datetime(df['DELIVERY_DATE'])


# add a column days to delivery
df['DAYS_TO_DELIVERY'] = (df['DELIVERY_DATE'] - df['DOWNPAYMENT_DATE']).dt.days

# check for negative values in the days_to_delivery column
negative_days = df[df['DAYS_TO_DELIVERY'] < 0]
print(negative_days)


Empty DataFrame
Columns: [PO_NUM, ITEM_CODE, ITEM_NAME, SUPPLIER_NAME, SUPPLIER_CODE, ORDERED_QUANTITY, PRICE, PO_VALUE, DOWNPAYMENT_DATE, DELIVERY_DATE, DAYS_TO_DELIVERY]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOWNPAYMENT_DATE'] = [random.choice(pd.date_range(start_date, end_date)) for _ in range(len(df))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOWNPAYMENT_DATE'] = pd.to_datetime(df['DOWNPAYMENT_DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DELIVERY_DATE'] =  df['DOWNPAYMENT_

In [37]:
# text processing on the PART_DESCRIPTION column
import re 
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.corpus import wordnet

def lower_casing(sentence):
    new_sentence = sentence.lower()
    return new_sentence

def punctuation_removal(sentence):
    # Remove the all the punctuations except '
    new_sentence = re.sub(',|!|\?|\"|<|>|\(|\)|\[|\]|\{|\}|@|#|\+|\=|\-|\_|~|\&|\*|\^|%|\||\$|/|`|\.|\'',
                          '', sentence,count=0, flags=0)
    return new_sentence

def stopword_removal(sentence):
    stoplist = stopwords.words('english')
    new_sentence = [word for word in sentence if word not in stoplist]
    return new_sentence

def get_wordnet_pos(word):
    pack = nltk.pos_tag([word])
    tag = pack[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
def lemmatization(sentence):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    new_sentence = [lemmatizer.lemmatize(word, get_wordnet_pos(word) or wordnet.NOUN) for word in sentence]

    return new_sentence

def tokenization(sentence):
    new_sentence = nltk.word_tokenize(sentence)
    return new_sentence


def text_preprocessing(raw_sentence):
    sentence = lower_casing(raw_sentence)
    sentence = punctuation_removal(sentence)
    sentence = tokenization(sentence)
    sentence = stopword_removal(sentence)
    sentence = lemmatization(sentence)
    return sentence

# apply the text preprocessing function to the PART_DESCRIPTION column
df['PART_DESCRIPTION'] = df['PART_DESCRIPTION'].apply(text_preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PART_DESCRIPTION'] = df['PART_DESCRIPTION'].apply(text_preprocessing)


In [33]:
# calculate the average days to delivery for each item by a particular supplier
# group by ITEM_NAME and SUPPLIER_NAME and calculate the mean of the days_to_delivery column
# reset the index to make the groupby columns regular columns
# rename the columns

avg_days_to_delivery = df.groupby(['ITEM_NAME', 'SUPPLIER_NAME'])['DAYS_TO_DELIVERY'].mean().reset_index()
avg_days_to_delivery.columns = ['ITEM_NAME', 'SUPPLIER_NAME', 'AVG_DAYS_TO_DELIVERY']
avg_days_to_delivery.head(10)


Unnamed: 0,ITEM_NAME,SUPPLIER_NAME,AVG_DAYS_TO_DELIVERY
0,0.25 sq/mm wire white/blue,bhl power system pvt ltd,56.333333
1,0.25 sq/mm wire white/blue,phoenix contact india pvt ltd,35.0
2,0.25 sq/mm wire white/blue,securitas engineers,19.5
3,"0.5 sq.mm single core cable - blue, lapp",phoenix contact india pvt ltd,26.0
4,"0.5 sq.mm single core cable - blue, lapp",securitas engineers,31.333333
5,0.5 sq/mm wire white/blue,bhl power system pvt ltd,33.0
6,0.5 sq/mm wire white/blue,digi-key electronics.,17.0
7,0.5 sq/mm wire white/blue,kinco electricshenzhenltd.,4.0
8,0.5 sq/mm wire white/blue,rd energy solutions,29.0
9,0.5 sq/mm wire white/blue,securitas engineers,69.8


In [39]:
df.to_csv('tvae_synthetic_dataset_cleaned.csv', index=True)