# Preparing the Data


In [16]:
 # Loading the data

from google.colab import drive
import pandas as pd
import numpy as np

drive.mount('/content/drive')

PO_data=pd.read_csv('/content/drive/MyDrive/NUS-ISS AIS Projects/Project 1/Data/enlarged_dataset.csv')

# Display the first few rows of the dataset
print('\n First few rows of the PO Dump: \n', PO_data.head(), '\n')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

 First few rows of the PO Dump: 
    PO_NUM         ITEM_NAME                       PART_DESCRIPTION ITEM_CODE  \
0  100000           Gripper     Pneumatic gripper for robotic arms   MOT7793   
1  100000        3D Printer     Compact 3D printer for prototyping   MOT1546   
2  100001   Pressure Sensor         High-precision pressure sensor   KYO7240   
3  100001  Proximity Sensor             Inductive proximity sensor   KYO2884   
4  100002   Control Cabinet  Industrial control cabinet with locks   IND4956   

                         SUPPLIER_NAME SUPPLIER_CODE  ORDERED_QUANTITY  \
0  MOTHERSON SUMI WIRING INDIA LIMITED      VD904014                39   
1  MOTHERSON SUMI WIRING INDIA LIMITED      VD904014               180   
2         KYOWA MANUFACTURING CO., LTD      VD650721                67   
3         KYOWA MANUFACTURING CO., LTD      VD650721       

Calculating Delivery Time for each line item

In [17]:
#Calculating Delivery Time

# Convert 'DOC_CREATION_DATE' and 'DELIVERY_DATE' to datetime objects
PO_data['DOWNPAYMENT_DATE'] = pd.to_datetime(PO_data['DOWNPAYMENT_DATE'])
PO_data['DELIVERY_DATE'] = pd.to_datetime(PO_data['DELIVERY_DATE'])

# Calculate delivery time in days
PO_data['DELIVERY_TIME'] = (PO_data['DELIVERY_DATE'] - PO_data['DOWNPAYMENT_DATE']).dt.days

PO_data['DELIVERY_TIME'] = PO_data['DELIVERY_TIME'].abs()

# Print delivery time for each vendor
print(PO_data[['SUPPLIER_CODE', 'DELIVERY_TIME']])

      SUPPLIER_CODE  DELIVERY_TIME
0          VD904014             21
1          VD904014             38
2          VD650721             37
3          VD650721             27
4          VD671527             99
...             ...            ...
58662      VD492539             16
58663      VD492539              6
58664      VD406065             26
58665      VD994773             40
58666      VD994773             39

[58667 rows x 2 columns]


In [18]:
#Checking for improper Delivery times

#Print all vendors with negative delivery time
negative_delivery_time = PO_data[PO_data['DELIVERY_TIME'] < 0]

#Total number of vendors with negative delivery date
print('Total Number of vendors with Negative deliveries:', len(negative_delivery_time['SUPPLIER_CODE'].unique()), '\n')
print(negative_delivery_time['SUPPLIER_CODE'].value_counts())

Total Number of vendors with Negative deliveries: 0 

Series([], Name: count, dtype: int64)


In [19]:
Original_price = PO_data['PRICE']
#Original_item_value = PO_data['ITEM_VALUE']
Original_delivery_time = PO_data['DELIVERY_TIME']

# EDA

In [20]:
# observing the shape of the data
print('Shape of the Data is: \t', PO_data.shape, '\n')

Shape of the Data is: 	 (58667, 13) 



In [21]:
# Check the data types and non-null counts
print('\nData Types and Non-null Counts:\t', PO_data.info(), '\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58667 entries, 0 to 58666
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   PO_NUM            58667 non-null  int64         
 1   ITEM_NAME         58667 non-null  object        
 2   PART_DESCRIPTION  58667 non-null  object        
 3   ITEM_CODE         58667 non-null  object        
 4   SUPPLIER_NAME     58667 non-null  object        
 5   SUPPLIER_CODE     58667 non-null  object        
 6   ORDERED_QUANTITY  58667 non-null  int64         
 7   FAULTED_PARTS     58667 non-null  int64         
 8   PRICE             58667 non-null  float64       
 9   PO_VALUE          58667 non-null  float64       
 10  DOWNPAYMENT_DATE  58667 non-null  datetime64[ns]
 11  DELIVERY_DATE     58667 non-null  datetime64[ns]
 12  DELIVERY_TIME     58667 non-null  int64         
dtypes: datetime64[ns](2), float64(2), int64(4), object(5)
memory usage: 5.8+ MB


In [22]:
# Check for missing values
print('Missing Values: \n', PO_data.isnull().sum(), '\n')

Missing Values: 
 PO_NUM              0
ITEM_NAME           0
PART_DESCRIPTION    0
ITEM_CODE           0
SUPPLIER_NAME       0
SUPPLIER_CODE       0
ORDERED_QUANTITY    0
FAULTED_PARTS       0
PRICE               0
PO_VALUE            0
DOWNPAYMENT_DATE    0
DELIVERY_DATE       0
DELIVERY_TIME       0
dtype: int64 



In [23]:
# Check for duplicate rows
print('Number of duplicate rows: \t', PO_data.duplicated().sum(), '\n')

Number of duplicate rows: 	 0 



In [24]:
#Summary of all the numerical data
print('All the Numerical Features: \n', PO_data.select_dtypes(include=['number']).columns, '\n')

# Summary statistics for numerical features
print('Summary for Numberical Features: \n', PO_data.describe(), '\n')

All the Numerical Features: 
 Index(['PO_NUM', 'ORDERED_QUANTITY', 'FAULTED_PARTS', 'PRICE', 'PO_VALUE',
       'DELIVERY_TIME'],
      dtype='object') 

Summary for Numberical Features: 
               PO_NUM  ORDERED_QUANTITY  FAULTED_PARTS         PRICE  \
count   58667.000000      58667.000000   58667.000000  58667.000000   
mean   115010.739496        150.165408       5.705882   2563.427641   
min    100000.000000          1.000000       0.000000     41.350000   
25%    107535.000000         75.000000       0.000000   1300.845000   
50%    114965.000000        150.000000       0.000000   2575.440000   
75%    122516.500000        225.000000       3.000000   3729.720000   
max    129999.000000        300.000000     300.000000   5987.200000   
std      8657.877555         86.742675      25.070171   1454.977483   

           PO_VALUE               DOWNPAYMENT_DATE  \
count  5.866700e+04                          58667   
mean   3.847127e+05  2023-07-02 15:11:08.798472448   
min    6.

In [25]:
#Summary of all the categorical data
print('All the Categorical Features: \n', PO_data.select_dtypes(include=['object', 'category']).columns, '\n')

# Summary statistics for categorical features
print('Summary for Categorical Features: \n', PO_data.describe(include=['object']), '\n')

All the Categorical Features: 
 Index(['ITEM_NAME', 'PART_DESCRIPTION', 'ITEM_CODE', 'SUPPLIER_NAME',
       'SUPPLIER_CODE'],
      dtype='object') 

Summary for Categorical Features: 
         ITEM_NAME                            PART_DESCRIPTION ITEM_CODE  \
count       58667                                       58667     58667   
unique         15                                          15      1199   
top     VFD Drive  Variable Frequency Drive for motor control   PRA4679   
freq         4506                                        4506        81   

                           SUPPLIER_NAME SUPPLIER_CODE  
count                              58667         58667  
unique                                80            80  
top     Praja Controls & Systems Pvt Ltd      VD899831  
freq                                 828           828   



In [26]:
# Standardize the numeric columns

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
PO_data[['PRICE','DELIVERY_TIME']] = scaler.fit_transform(PO_data[['PRICE','DELIVERY_TIME']])


In [27]:
# Encode categorical variables

#Rename part description as part name
PO_data = PO_data.rename(columns={'PART_DESCRIPTION': 'PART_NAME'})

PO_data = pd.get_dummies(PO_data, columns=['SUPPLIER_NAME', 'PART_NAME'], drop_first=True)

# Training with Collaborative Fitting with SVD

In [28]:
!pip install scikit-surprise



Ranking Vendors out of 10 based on Delivery Times

In [29]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Prepare data for Surprise library
reader = Reader(rating_scale=(1, 10))  # Assuming vendor ranking is from 1 to 10

data = PO_data[['SUPPLIER_CODE', 'DELIVERY_TIME', 'PRICE']]
data = data.rename(columns={'SUPPLIER_CODE': 'userID', 'DELIVERY_TIME': 'itemID', 'PRICE': 'rating'})

dataset = Dataset.load_from_df(data[['userID', 'itemID', 'rating']], reader)
trainset, testset = train_test_split(dataset, test_size=.25)

# Train SVD model
algo = SVD()
algo.fit(trainset)

# Predict ratings for all vendor-delivery_time combinations
predictions = algo.test(testset)

# Create a DataFrame to store predicted ratings
predicted_ratings_df = pd.DataFrame(columns=['SUPPLIER_CODE', 'DELIVERY_TIME', 'predicted_rating'])
for uid, iid, true_r, est, _ in predictions:
    predicted_ratings_df = pd.concat([predicted_ratings_df, pd.DataFrame({'SUPPLIER_CODE': [uid], 'DELIVERY_TIME': [iid], 'predicted_rating': [est]})], ignore_index=True)


# Scale the predicted ratings to be between 1 and 10
scaler = MinMaxScaler(feature_range=(1, 10))
predicted_ratings_df['scaled_rating'] = scaler.fit_transform(predicted_ratings_df[['predicted_rating']])

# Group by vendor and calculate the average predicted rating
vendor_ratings = predicted_ratings_df.groupby('SUPPLIER_CODE')['scaled_rating'].mean()

# Sort vendors by their average predicted rating in descending order
ranked_vendors = vendor_ratings.sort_values(ascending=False)


ranked_vendors

  predicted_ratings_df = pd.concat([predicted_ratings_df, pd.DataFrame({'SUPPLIER_CODE': [uid], 'DELIVERY_TIME': [iid], 'predicted_rating': [est]})], ignore_index=True)


Unnamed: 0_level_0,scaled_rating
SUPPLIER_CODE,Unnamed: 1_level_1
VD268662,1.259603
VD184670,1.018239
VD101149,1.000000
VD736394,1.000000
VD784623,1.000000
...,...
VD439383,1.000000
VD427107,1.000000
VD424465,1.000000
VD412248,1.000000


Traning model for vendor recommendation based on Delivery Time


In [30]:
from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise import Dataset
from sklearn.model_selection import train_test_split

y = PO_data['DELIVERY_TIME']
X = PO_data.drop(columns=['DELIVERY_TIME'])

# Split the data (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Create a Surprise Dateset
reader = Reader(rating_scale=(PO_data['DELIVERY_TIME'].min(), PO_data['DELIVERY_TIME'].max()))

# Create the trainset from the training data
train_df = pd.DataFrame({'userID': X_train.index, 'itemID': X_train['ITEM_CODE'], 'rating': y_train})
trainset = Dataset.load_from_df(train_df[['userID', 'itemID', 'rating']], reader)

# Use the SVD algorithm
model = SVD()

# Train the model on the Surprise trainset
model.fit(trainset.build_full_trainset())

# Make predictions - need to create testset similar to trainset
test_df = pd.DataFrame({'userID': X_test.index, 'itemID': X_test['ITEM_CODE'], 'rating': y_test})
testset = Dataset.load_from_df(test_df[['userID', 'itemID', 'rating']], reader).build_full_trainset().build_testset() # create testset
predictions = model.test(testset) # get predictions on the testset

# Evaluate the model
accuracy.mse(predictions)

# Evaluate accuracy
accuracy.rmse(predictions)

MSE: 1.0302
RMSE: 1.0150


1.0150061175789578

In [31]:
# Replace the scaled columns with the original values
PO_data['PRICE'] = Original_price
#PO_data['ITEM_VALUE'] = Original_item_value
PO_data['DELIVERY_TIME'] = Original_delivery_time

Get top 3 Vendors for a particular part name on the basis of delivery time

In [34]:
# Get the top 3 vendors for a specific component

#randomly select a part_name
import random

# Get all available column names
available_columns = PO_data.columns

# Check if 'PART_NAME' exists, otherwise, use a different relevant column
part_name_column = 'PART_NAME' if 'PART_NAME' in available_columns else available_columns[0]  # Replace available_columns[0] with an appropriate column if needed

# Randomly select a part_name
part_name = random.choice(PO_data[part_name_column].unique())

print(f"Randomly selected part name: {part_name}")

component_id = PO_data[PO_data[part_name_column] == part_name]['ITEM_CODE'].iloc[0]

recommended_vendors = PO_data[PO_data['ITEM_CODE'] == component_id]
recommended_vendors = recommended_vendors.sort_values(by='DELIVERY_TIME', ascending=True)

# Print the top N vendors for the component
top_vendors = recommended_vendors.head(3)
print(top_vendors[['SUPPLIER_CODE', 'DELIVERY_TIME', 'PRICE']])


Randomly selected part name: 124481
      SUPPLIER_CODE  DELIVERY_TIME   PRICE
35972      VD738866              3  528.10
47875      VD738866              6  433.28
41329      VD738866              7  366.45
