# Preparing the Data


In [42]:
 # Loading the data

from google.colab import drive
import pandas as pd
import numpy as np

drive.mount('/content/drive')

PO_data=pd.read_csv('/content/drive/MyDrive/NUS-ISS AIS Projects/Project 1/Data/filtered_data_final_2.csv')

# Display the first few rows of the dataset
print('\n First few rows of the PO Dump: \n', PO_data.head(), '\n')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

 First few rows of the PO Dump: 
    Unnamed: 0  PO_NUM DOCCUR  DOCRATE SUPPLIER_CODE  \
0           0   53586    INR      1.0      VD001614   
1           1   53586    INR      1.0      VD001614   
2           2   53650    INR      1.0      VD003065   
3           3   53684    INR      1.0      VD003071   
4           4   53946    INR      1.0      VD002799   

                           SUPPLIER_NAME    DOC_DATE DELIVERY_DATE  \
0  ITG SOFTWARE ENGINEERING (I) PVT. LTD  01-10-2021    30-11-2022   
1  ITG SOFTWARE ENGINEERING (I) PVT. LTD  01-10-2021    30-11-2022   
2    TIKMANY TELESYSTEMS PRIVATE LIMITED  07-10-2021    28-09-2021   
3                       ZIP TECHNOLOGIES  08-10-2021    28-09-2021   
4          INSPIRED TECHNOLOGIES PVT LTD  18-10-2021    30-11-2021   

       PO_VALUE LOCATION  ...  TAX_AMOUNT(LC) LINETOTAL_WITH_TAX_(LC)  \
0  2.764740

Calculating Delivery Time for each line item

In [43]:
#Calculating Delivery Time

# Convert 'DOC_CREATION_DATE' and 'DELIVERY_DATE' to datetime objects
PO_data['DOC_DATE'] = pd.to_datetime(PO_data['DOC_DATE'], format='%d-%m-%Y')
PO_data['DELIVERY_DATE'] = pd.to_datetime(PO_data['DELIVERY_DATE'])

# Calculate delivery time in days
PO_data['DELIVERY_TIME'] = (PO_data['DELIVERY_DATE'] - PO_data['DOC_DATE']).dt.days

PO_data['DELIVERY_TIME'] = PO_data['DELIVERY_TIME'].abs()

# Print delivery time for each vendor
print(PO_data[['SUPPLIER_CODE', 'DELIVERY_TIME']])

      SUPPLIER_CODE  DELIVERY_TIME
0          VD001614            425
1          VD001614            425
2          VD003065              9
3          VD003071             10
4          VD002799             43
...             ...            ...
38967      VD002002             11
38968      VD002002             11
38969      VD002002             11
38970      VD002002             11
38971      VD002002             11

[38972 rows x 2 columns]


  PO_data['DELIVERY_DATE'] = pd.to_datetime(PO_data['DELIVERY_DATE'])


In [44]:
#Checking for improper Delivery times

#Print all vendors with negative delivery time
negative_delivery_time = PO_data[PO_data['DELIVERY_TIME'] < 0]

#Total number of vendors with negative delivery date
print('Total Number of vendors with Negative deliveries:', len(negative_delivery_time['SUPPLIER_CODE'].unique()), '\n')
print(negative_delivery_time['SUPPLIER_CODE'].value_counts())

Total Number of vendors with Negative deliveries: 0 

Series([], Name: count, dtype: int64)


In [46]:
Original_price = PO_data['PRICE']
Original_item_value = PO_data['ITEM_VALUE']
Original_delivery_time = PO_data['DELIVERY_TIME']

# EDA

In [47]:
# observing the shape of the data
print('Shape of the Data is: \t', PO_data.shape, '\n')

Shape of the Data is: 	 (38972, 28) 



In [48]:
# Check the data types and non-null counts
print('\nData Types and Non-null Counts:\t', PO_data.info(), '\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38972 entries, 0 to 38971
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Unnamed: 0                38972 non-null  int64         
 1   PO_NUM                    38972 non-null  int64         
 2   DOCCUR                    38972 non-null  object        
 3   DOCRATE                   38972 non-null  float64       
 4   SUPPLIER_CODE             38972 non-null  object        
 5   SUPPLIER_NAME             38970 non-null  object        
 6   DOC_DATE                  38972 non-null  datetime64[ns]
 7   DELIVERY_DATE             38972 non-null  datetime64[ns]
 8   PO_VALUE                  38972 non-null  float64       
 9   LOCATION                  38972 non-null  object        
 10  ITEM_CODE_CLEANED         38972 non-null  int64         
 11  ITEM_NAME                 38972 non-null  object        
 12  ITEM_DETAILS      

In [49]:
# Check for missing values
print('Missing Values: \n', PO_data.isnull().sum(), '\n')

Missing Values: 
 Unnamed: 0                      0
PO_NUM                          0
DOCCUR                          0
DOCRATE                         0
SUPPLIER_CODE                   0
SUPPLIER_NAME                   2
DOC_DATE                        0
DELIVERY_DATE                   0
PO_VALUE                        0
LOCATION                        0
ITEM_CODE_CLEANED               0
ITEM_NAME                       0
ITEM_DETAILS                  125
UOM                             0
ORDERED_QUANTITY                0
PRICE                           0
ITEM_VALUE                      0
TAX_AMOUNT(LC)                  0
LINETOTAL_WITH_TAX_(LC)         0
PART_ID_CLEANED                 0
PART_NAME                       0
PART_DESCRIPTION              293
WIDTH_(MM)                  11976
HEIGHT_(MM)                 11908
DEPTH_(MM)                  12727
MOUNTING_CLEARANCES_(MM)    38972
WEIGHT_(KG)                 13555
DELIVERY_TIME                   0
dtype: int64 



In [7]:
# Check for duplicate rows
print('Number of duplicate rows: \t', PO_data.duplicated().sum(), '\n')

Number of duplicate rows: 	 0 



In [50]:
#Summary of all the numerical data
print('All the Numerical Features: \n', PO_data.select_dtypes(include=['number']).columns, '\n')

# Summary statistics for numerical features
print('Summary for Numberical Features: \n', PO_data.describe(), '\n')

All the Numerical Features: 
 Index(['Unnamed: 0', 'PO_NUM', 'DOCRATE', 'PO_VALUE', 'ITEM_CODE_CLEANED',
       'ORDERED_QUANTITY', 'PRICE', 'ITEM_VALUE', 'TAX_AMOUNT(LC)',
       'LINETOTAL_WITH_TAX_(LC)', 'PART_ID_CLEANED', 'WIDTH_(MM)',
       'HEIGHT_(MM)', 'DEPTH_(MM)', 'MOUNTING_CLEARANCES_(MM)', 'WEIGHT_(KG)',
       'DELIVERY_TIME'],
      dtype='object') 

Summary for Numberical Features: 
          Unnamed: 0        PO_NUM       DOCRATE  \
count  38972.000000  3.897200e+04  38972.000000   
mean   19485.500000  1.326400e+08     11.108913   
min        0.000000  5.358600e+04      0.531900   
25%     9742.750000  7.353375e+04      1.000000   
50%    19485.500000  2.324308e+08      1.000000   
75%    29228.250000  2.324701e+08      1.000000   
max    38971.000000  2.425306e+08    105.572200   
std    11250.391682  1.156600e+08     27.008198   

                            DOC_DATE                  DELIVERY_DATE  \
count                          38972                          3897

In [51]:
#Summary of all the categorical data
print('All the Categorical Features: \n', PO_data.select_dtypes(include=['object', 'category']).columns, '\n')

# Summary statistics for categorical features
print('Summary for Categorical Features: \n', PO_data.describe(include=['object']), '\n')

All the Categorical Features: 
 Index(['DOCCUR', 'SUPPLIER_CODE', 'SUPPLIER_NAME', 'LOCATION', 'ITEM_NAME',
       'ITEM_DETAILS', 'UOM', 'PART_NAME', 'PART_DESCRIPTION'],
      dtype='object') 

Summary for Categorical Features: 
        DOCCUR SUPPLIER_CODE          SUPPLIER_NAME LOCATION ITEM_NAME  \
count   38972         38972                  38970    38972     38972   
unique      7          1133                   1107       56      2464   
top       INR      VF000363  DIGI-KEY ELECTRONICS.    NOIDA     CABLE   
freq    33941          2661                   2661    25194      2338   

                                             ITEM_DETAILS    UOM PART_NAME  \
count                                               38847  38972     38972   
unique                                               3263     17      1995   
top     <DASH>LFLEX<REGISTERED> 100 POWER AND CONTROL ...    NOS     CABLE   
freq                                                  132  34737      2968   

           

In [52]:
# Standardize the numeric columns

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
PO_data[['PRICE', 'ITEM_VALUE', 'DELIVERY_TIME']] = scaler.fit_transform(PO_data[['PRICE', 'ITEM_VALUE', 'DELIVERY_TIME']])


In [53]:
# Encode categorical variables
PO_data = pd.get_dummies(PO_data, columns=['SUPPLIER_NAME', 'PART_NAME'], drop_first=True)

# Training with Collaborative Fitting with SVD

---



In [54]:
!pip install scikit-surprise



Ranking Vendors out of 10 based on price

In [62]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Prepare data for Surprise library
reader = Reader(rating_scale=(1, 10))  # Adjust rating scale as needed
data = Dataset.load_from_df(PO_data[['SUPPLIER_CODE', 'PRICE', 'ITEM_VALUE']], reader)

# Split data into train and test sets
trainset, testset = train_test_split(data, test_size=.25)

# Train the SVD model
algo = SVD()
algo.fit(trainset)

# Predict vendor ratings based on price
predictions = algo.test(testset)

# Store predictions in a DataFrame
prediction_df = pd.DataFrame(predictions)

# Get predicted ratings
predicted_ratings = prediction_df[['uid', 'est']]

# Group by vendor (uid) and calculate the average predicted rating
vendor_ratings = predicted_ratings.groupby('uid')['est'].mean()


# Scale the average predicted rating to be between 1 and 10
scaler = MinMaxScaler(feature_range=(1, 10))
scaled_vendor_ratings = scaler.fit_transform(vendor_ratings.values.reshape(-1, 1))

# Create a DataFrame with vendor and rating
vendor_ranking = pd.DataFrame({'SUPPLIER_CODE': vendor_ratings.index, 'Vendor_Rating': scaled_vendor_ratings.flatten()})

# Sort vendors based on their rating
vendor_ranking = vendor_ranking.sort_values('Vendor_Rating', ascending=False)

vendor_ranking

Unnamed: 0,SUPPLIER_CODE,Vendor_Rating
125,VD000773,10.000000
171,VD001614,5.858470
114,VD000645,5.187047
60,VD000136,4.161071
32,VCD000123,2.658319
...,...,...
252,VD002449,1.000000
254,VD002460,1.000000
255,VD002461,1.000000
256,VD002465,1.000000


Traning model for vendor recommendation based on price

---



In [63]:
from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise import Dataset
from sklearn.model_selection import train_test_split

y = PO_data['PRICE']
X = PO_data.drop(columns=['PRICE'])

# Split the data (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Create a Surprise Dateset
reader = Reader(rating_scale=(PO_data['PRICE'].min(), PO_data['PRICE'].max()))

# Create the trainset from the training data
train_df = pd.DataFrame({'userID': X_train.index, 'itemID': X_train['PART_ID_CLEANED'], 'rating': y_train})
trainset = Dataset.load_from_df(train_df[['userID', 'itemID', 'rating']], reader)

# Use the SVD algorithm
model = SVD()

# Train the model on the Surprise trainset
model.fit(trainset.build_full_trainset())

# Make predictions - need to create testset similar to trainset
test_df = pd.DataFrame({'userID': X_test.index, 'itemID': X_test['PART_ID_CLEANED'], 'rating': y_test})
testset = Dataset.load_from_df(test_df[['userID', 'itemID', 'rating']], reader).build_full_trainset().build_testset() # create testset
predictions = model.test(testset) # get predictions on the testset

# Evaluate the model
accuracy.mse(predictions)

# Evaluate accuracy
accuracy.rmse(predictions)

MSE: 0.2712
RMSE: 0.5208


0.5207824856491226

In [64]:
# Replace the scaled columns with the original values
PO_data['PRICE'] = Original_price
PO_data['ITEM_VALUE'] = Original_item_value
PO_data['DELIVERY_TIME'] = Original_delivery_time

Get top 3 Vendors for a particular part name on the basis of price

In [65]:
# Get the top 3 vendors for a specific component

#randomly select a part_name
import random

# Get all available column names
available_columns = PO_data.columns

# Check if 'PART_NAME' exists, otherwise, use a different relevant column
part_name_column = 'PART_NAME' if 'PART_NAME' in available_columns else available_columns[0]  # Replace available_columns[0] with an appropriate column if needed

# Randomly select a part_name
part_name = random.choice(PO_data[part_name_column].unique())

print(f"Randomly selected part name: {part_name}")

component_id = PO_data[PO_data[part_name_column] == part_name]['PART_ID_CLEANED'].iloc[0]

recommended_vendors = PO_data[PO_data['PART_ID_CLEANED'] == component_id]
recommended_vendors = recommended_vendors.sort_values(by='PRICE', ascending=True)

# Print the top N vendors for the component
top_vendors = recommended_vendors.head(5)
print(top_vendors[['SUPPLIER_CODE', 'DELIVERY_TIME', 'PRICE', 'DOCCUR']])

recommended_vendors = recommended_vendors.sort_values(by='PRICE', ascending=False)

# Print the top N vendors for the component
top_vendors = recommended_vendors[recommended_vendors[part_name_column] == component_id].head(5)
print(top_vendors[['SUPPLIER_CODE', 'DELIVERY_TIME', 'PRICE', 'DOCCUR']])

Randomly selected part name: 35659
      SUPPLIER_CODE  DELIVERY_TIME  PRICE DOCCUR
122        VD002354            172   65.0    INR
35432      VD001209             41   70.1    INR
35659      VD003552              6   90.0    INR
Empty DataFrame
Columns: [SUPPLIER_CODE, DELIVERY_TIME, PRICE, DOCCUR]
Index: []
