###  Lab Exercise #5 Code

In [33]:
# Importing necessary library
import pandas as pd
import numpy as np

In [34]:
# Load the raw transactions CSV and preview it.
# reset_option only needs the option name; restoring the default row limit
# keeps the preview truncated instead of printing every row.
pd.reset_option('display.max_rows')
transactions = pd.read_csv('inconsistent_transactions.csv')
transactions


Unnamed: 0,transaction_id,product_id,quantity,price,timestamp
0,T0000,P007,5.0,,2024-10-06
1,T0001,P015,8.0,,2024-11-04
2,T0002,P011,5.0,47.56,10/13/2024
3,T0003,P 008,8.0,37.6,10/28/2024
4,T0004,p007,7.0,40.84,28-10-2024
...,...,...,...,...,...
9995,T9995,p015,2.0,$25.31,2024-10-21
9996,T9996,p016,,15.65,10/11/2024
9997,T9997,P-010,6.0,40.55,22-10-2024
9998,T9998,P006,4.0,26.33,2024-11-01


### Creating a copy of the original transactions data

In [35]:
# Work on an independent deep copy so the raw `transactions` frame stays untouched.
modified_data = transactions.copy(deep=True)

### Extending the range of transaction IDs

In [36]:
# Widen every transaction ID from four to five digits (T0000 -> T00000).
# Only IDs that still have exactly four digits are rewritten, so re-running
# this cell does not keep prepending zeros.
modified_data['transaction_id'] = modified_data['transaction_id'].str.replace(
    r'^T(\d{4})$', r'T0\1', regex=True
)
modified_data

Unnamed: 0,transaction_id,product_id,quantity,price,timestamp
0,T00000,P007,5.0,,2024-10-06
1,T00001,P015,8.0,,2024-11-04
2,T00002,P011,5.0,47.56,10/13/2024
3,T00003,P 008,8.0,37.6,10/28/2024
4,T00004,p007,7.0,40.84,28-10-2024
...,...,...,...,...,...
9995,T09995,p015,2.0,$25.31,2024-10-21
9996,T09996,p016,,15.65,10/11/2024
9997,T09997,P-010,6.0,40.55,22-10-2024
9998,T09998,P006,4.0,26.33,2024-11-01


### Enforcing a single Product ID format

In [37]:
# Bring every product ID to one format: remove spaces and hyphens, then
# turn a lower-case 'p' prefix into 'P' (e.g. 'P 008', 'P-010', 'p007').
# reset_option only needs the option name.
pd.reset_option('display.max_rows')
modified_data['product_id'] = (
    modified_data['product_id']
    # regex=False makes each replacement a literal match on every pandas version
    .str.replace(' ', '', regex=False)
    .str.replace('-', '', regex=False)
    .str.replace('p', 'P', regex=False)
)
modified_data

Unnamed: 0,transaction_id,product_id,quantity,price,timestamp
0,T00000,P007,5.0,,2024-10-06
1,T00001,P015,8.0,,2024-11-04
2,T00002,P011,5.0,47.56,10/13/2024
3,T00003,P008,8.0,37.6,10/28/2024
4,T00004,P007,7.0,40.84,28-10-2024
...,...,...,...,...,...
9995,T09995,P015,2.0,$25.31,2024-10-21
9996,T09996,P016,,15.65,10/11/2024
9997,T09997,P010,6.0,40.55,22-10-2024
9998,T09998,P006,4.0,26.33,2024-11-01


### Changing the price format


In [38]:
# Convert the price column to a float rounded to two decimals.
# reset_option only needs the option name.
pd.reset_option('display.max_rows')
# Cast to str first so the .str accessor still works when this cell is re-run
# after the column has already become float. '$' is a regex metacharacter, so
# regex=False is required to strip it as a literal character.
price_text = modified_data['price'].astype(str).str.replace('$', '', regex=False)
# Missing prices survive the round trip: they parse back to NaN.
modified_data['price'] = price_text.astype(float).round(2)
modified_data

Unnamed: 0,transaction_id,product_id,quantity,price,timestamp
0,T00000,P007,5.0,,2024-10-06
1,T00001,P015,8.0,,2024-11-04
2,T00002,P011,5.0,47.56,10/13/2024
3,T00003,P008,8.0,37.60,10/28/2024
4,T00004,P007,7.0,40.84,28-10-2024
...,...,...,...,...,...
9995,T09995,P015,2.0,25.31,2024-10-21
9996,T09996,P016,,15.65,10/11/2024
9997,T09997,P010,6.0,40.55,22-10-2024
9998,T09998,P006,4.0,26.33,2024-11-01


### Computing the unit prices

In [39]:
## Keep only the rows in which no column is missing
complete_transactions = modified_data.dropna(axis=0, how='any')



In [40]:
## Keep the first complete row seen for each product and drop the
## per-transaction columns that are not needed for the unit-price lookup.
# reset_option only needs the option name.
pd.reset_option('display.max_rows')
unique_products = (
    complete_transactions
    .drop_duplicates(subset=['product_id'], keep='first')
    .drop(columns=['transaction_id', 'timestamp'])
)
unique_products

Unnamed: 0,product_id,quantity,price
2,P011,5.0,47.56
3,P008,8.0,37.6
4,P007,7.0,40.84
5,P019,9.0,26.92
8,P004,7.0,29.39
10,P003,8.0,5.54
11,P002,4.0,22.27
12,P012,5.0,12.17
13,P006,7.0,28.75
15,P001,9.0,11.46


In [41]:
## Unit price = the row's price divided by the row's quantity
unique_products['unit_price'] = unique_products['price'].div(unique_products['quantity'])
unique_products

Unnamed: 0,product_id,quantity,price,unit_price
2,P011,5.0,47.56,9.512
3,P008,8.0,37.6,4.7
4,P007,7.0,40.84,5.834286
5,P019,9.0,26.92,2.991111
8,P004,7.0,29.39,4.198571
10,P003,8.0,5.54,0.6925
11,P002,4.0,22.27,5.5675
12,P012,5.0,12.17,2.434
13,P006,7.0,28.75,4.107143
15,P001,9.0,11.46,1.273333


### Computing the missing prices and quantities

In [42]:

# Attach each product's unit price to every transaction, then use it to fill
# a missing price (unit_price * quantity) or a missing quantity (price / unit_price).
final_unique_products = unique_products.drop(columns=['quantity', 'price'])
modified_data = (
    modified_data
    # Drop a unit_price left over from a previous run of this cell; otherwise the
    # merge would produce unit_price_x / unit_price_y and the fills below would fail.
    .drop(columns=['unit_price'], errors='ignore')
    # validate guards against duplicated product rows multiplying transactions.
    .merge(final_unique_products, on='product_id', how='left', validate='many_to_one')
)

modified_data['price'] = modified_data['price'].fillna((modified_data['unit_price'] * modified_data['quantity']).round(2))
# Round to the nearest whole quantity instead of flooring: prices are rounded to
# two decimals, so the quotient can land just below the true integer
# (e.g. 4.9998) and floor would report one unit too few.
modified_data['quantity'] = modified_data['quantity'].fillna((modified_data['price'] / modified_data['unit_price']).round())


In [None]:
# Show the current modified data with row truncation switched off
pd.options.display.max_rows = None
modified_data

In [None]:
# View current modified data (without unit price)
pd.set_option('display.max_rows', None)
# errors='ignore' keeps the cell re-runnable: on a second run the helper
# column is already gone and a plain drop would raise KeyError.
modified_data = modified_data.drop(columns=['unit_price'], errors='ignore')
modified_data

### Adjusting the date format


In [45]:
## Bring every timestamp to a single YYYY-MM-DD format.
# The column mixes three layouts: YYYY-MM-DD, MM/DD/YYYY and DD-MM-YYYY.
# Merely swapping '/' for '-' would turn MM/DD/YYYY into MM-DD-YYYY, which can
# no longer be told apart from DD-MM-YYYY (10/11/2024 vs 10-11-2024), so each
# layout is parsed with its own explicit format before anything is rewritten.
timestamp_formats = ('%Y-%m-%d', '%m/%d/%Y', '%d-%m-%Y')
parsed_timestamp = pd.to_datetime(
    modified_data['timestamp'], format=timestamp_formats[0], errors='coerce'
)
for timestamp_format in timestamp_formats[1:]:
    # Only rows that no earlier format could parse are filled here.
    parsed_timestamp = parsed_timestamp.fillna(
        pd.to_datetime(modified_data['timestamp'], format=timestamp_format, errors='coerce')
    )
# Values matching none of the formats keep their original text rather than
# being silently blanked.
modified_data['timestamp'] = (
    parsed_timestamp.dt.strftime('%Y-%m-%d').fillna(modified_data['timestamp'])
)


In [None]:
# Display modified_data in its current state
modified_data


### Removing rows with null values for both quantity and price

In [49]:
# Keep the rows that still carry a quantity or a price (rows missing both are
# dropped), working on a copy so modified_data itself is left unchanged.
has_quantity_or_price = modified_data[['quantity', 'price']].notna().any(axis=1)
tentative_change = modified_data.loc[has_quantity_or_price].copy()
# Renumber the surviving rows consecutively: T00000, T00001, ...
tentative_change['transaction_id'] = [f'T{position:05d}' for position in range(len(tentative_change))]

In [None]:
# Display the frame after dropping rows that lack both quantity and price
# and renumbering the transaction IDs
tentative_change