In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_raw = pd.read_csv('data/dirty_cafe_sales.csv', sep=';')

In [3]:
item_price_mapping = {
    'Coffee': 2.0,
    'Tea': 1.5,
    'Sandwich': 4.5, #
    'Salad': 5.0,
    'Cake': 3.5, #
    'Cookie': 1.0,
    'Smoothie': 4.0,
    'Juice': 3.0
}

In [4]:
df_raw.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1005331,Coffee,1,2,2,Digital Wallet,Takeaway,04/11/23
1,TXN_1005472,Coffee,4,2,8,Credit Card,,21/04/23
2,TXN_1016246,Coffee,1,2,2,ERROR,,19/01/23
3,TXN_1020478,Coffee,1,2,2,Digital Wallet,Takeaway,09/03/23
4,TXN_1040764,Coffee,3,2,6,Cash,Takeaway,27/07/23


In [5]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    10000 non-null  object
 1   Item              9667 non-null   object
 2   Quantity          9862 non-null   object
 3   Price Per Unit    9822 non-null   object
 4   Total Spent       9850 non-null   object
 5   Payment Method    7421 non-null   object
 6   Location          6735 non-null   object
 7   Transaction Date  9841 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB


In [6]:
df_raw.describe(include='all')

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
count,10000,9667,9862,9822,9850,7421,6735,9841
unique,10000,10,7,10,28,5,4,367
top,TXN_1005331,Juice,5,3,6,Digital Wallet,Takeaway,UNKNOWN
freq,1,1171,2013,1344,752,2291,3022,159


**Check Missing Values**

In [61]:
df = df_raw.copy()
df.isnull().sum()

Transaction ID         0
Item                 333
Quantity             138
Price Per Unit       178
Total Spent          150
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [63]:
df[df['Item'].isnull()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
7397,TXN_1054915,,5,1,5,,In-store,22/01/23
7398,TXN_1124900,,4,4,16,Credit Card,In-store,08/09/23
7399,TXN_1165762,,3,2,6,Credit Card,,22/10/23
7400,TXN_1166001,,UNKNOWN,3,15,Cash,ERROR,
7401,TXN_1205610,,2,1,2,,In-store,19/07/23
...,...,...,...,...,...,...,...,...
7725,TXN_9774251,,2,3,6,Digital Wallet,Takeaway,16/06/23
7726,TXN_9810581,,1,3,3,Cash,,20/11/23
7727,TXN_9817602,,2,4,8,Credit Card,Takeaway,26/12/23
7728,TXN_9945729,,2,5,10,Digital Wallet,,03/02/23


The Missing Item Name seems still having Price Per Unit. This Price per unit could be as a clue to linked back to the Item Name using a dictionary called <b> item_price_mapping </b>

In [64]:
# Reverse the mapping, into price: item_name
price_item_mapping = {price: item for item, price in item_price_mapping.items()}

In [65]:
price_item_mapping

{2.0: 'Coffee',
 1.5: 'Tea',
 4.5: 'Sandwich',
 5.0: 'Salad',
 3.5: 'Cake',
 1.0: 'Cookie',
 4.0: 'Smoothie',
 3.0: 'Juice'}

In [66]:
df['Item'].value_counts()

Item
Juice       1171
Coffee      1165
Salad       1148
Cake        1139
Sandwich    1131
Smoothie    1096
Cookie      1092
Tea         1089
UNKNOWN      344
ERROR        292
Name: count, dtype: int64

In [67]:
df['Item'] = df['Item'].fillna(df['Price Per Unit'].map(price_item_mapping))

In [68]:
df['Item'].value_counts()

Series([], Name: count, dtype: int64)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    10000 non-null  object
 1   Item              0 non-null      object
 2   Quantity          9862 non-null   object
 3   Price Per Unit    9822 non-null   object
 4   Total Spent       9850 non-null   object
 5   Payment Method    7421 non-null   object
 6   Location          6735 non-null   object
 7   Transaction Date  9841 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB


In [70]:
df.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1005331,,1,2,2,Digital Wallet,Takeaway,04/11/23
1,TXN_1005472,,4,2,8,Credit Card,,21/04/23
2,TXN_1016246,,1,2,2,ERROR,,19/01/23
3,TXN_1020478,,1,2,2,Digital Wallet,Takeaway,09/03/23
4,TXN_1040764,,3,2,6,Cash,Takeaway,27/07/23


Turns out the Item Name become <b>NaN</b>, seems the did not find the reference of mapping value. Looking back to the data type of Price Per Unit that still in object might be the root cause. Let's check the Price Per Unit value_counts(), but beofre that let's replace df with df_raw

In [71]:
df = df_raw.copy()
df['Price Per Unit'].value_counts()

Price Per Unit
3          1344
4          1249
2          1227
5          1204
1          1143
1.5        1133
3.5        1085
4.5        1083
ERROR       190
UNKNOWN     164
Name: count, dtype: int64

There are two suspicious values (ERROR and UNKNOWN) among at those numeric Vals. This column should be in float as it is Price. Let's handle those vals before convert it into float 

In [72]:
df['Price Per Unit'] = df['Price Per Unit'].replace(['ERROR', 'UNKNOWN'], np.nan)
df['Price Per Unit'].value_counts()

Price Per Unit
3      1344
4      1249
2      1227
5      1204
1      1143
1.5    1133
3.5    1085
4.5    1083
Name: count, dtype: int64

In [73]:
df['Price Per Unit'] = df['Price Per Unit'].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction ID    10000 non-null  object 
 1   Item              9667 non-null   object 
 2   Quantity          9862 non-null   object 
 3   Price Per Unit    9468 non-null   float64
 4   Total Spent       9850 non-null   object 
 5   Payment Method    7421 non-null   object 
 6   Location          6735 non-null   object 
 7   Transaction Date  9841 non-null   object 
dtypes: float64(1), object(7)
memory usage: 625.1+ KB


Based on above information, the Price Per Unit are successfully converted into float64. Now let's get back to handling Missing Value on <b>Item</b>

In [74]:
# Replace missing on Item with price_item_mapping
df['Item'] = df['Item'].fillna(df['Price Per Unit'].map(price_item_mapping))
df.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1005331,Coffee,1,2.0,2,Digital Wallet,Takeaway,04/11/23
1,TXN_1005472,Coffee,4,2.0,8,Credit Card,,21/04/23
2,TXN_1016246,Coffee,1,2.0,2,ERROR,,19/01/23
3,TXN_1020478,Coffee,1,2.0,2,Digital Wallet,Takeaway,09/03/23
4,TXN_1040764,Coffee,3,2.0,6,Cash,Takeaway,27/07/23


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction ID    10000 non-null  object 
 1   Item              9977 non-null   object 
 2   Quantity          9862 non-null   object 
 3   Price Per Unit    9468 non-null   float64
 4   Total Spent       9850 non-null   object 
 5   Payment Method    7421 non-null   object 
 6   Location          6735 non-null   object 
 7   Transaction Date  9841 non-null   object 
dtypes: float64(1), object(7)
memory usage: 625.1+ KB


Total Non-Null values at Item increased from 9667 to 9977. Let's ensure the remaining missing values 

In [76]:
df.isnull().sum()

Transaction ID         0
Item                  23
Quantity             138
Price Per Unit       532
Total Spent          150
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [79]:
df[df['Item'].isnull()]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
7402,TXN_1208561,,ERROR,,20,Credit Card,,19/08/23
7448,TXN_2523298,,4,,6,ERROR,In-store,25/03/23
7449,TXN_2536573,,2,,8,Cash,In-store,24/06/23
7473,TXN_3334632,,1,,2,Credit Card,Takeaway,20/11/23
7480,TXN_3495950,,4,,6,Credit Card,In-store,19/02/23
7486,TXN_3611851,,4,,ERROR,Credit Card,,09/02/23
7500,TXN_3803063,,4,,12,Credit Card,Takeaway,23/11/23
7509,TXN_4031509,,4,,16,Credit Card,Takeaway,04/01/23
7514,TXN_4208919,,3,,12,,Takeaway,30/05/23
7543,TXN_4844386,,5,,15,Credit Card,In-store,28/10/23


Remaining missing values at Item happened because NaN at Price Per Unit, which is valid and reasonable. 

**Handling Inappropriate Values**

In [12]:
df_raw['Item'].value_counts()

Item
Juice       1171
Coffee      1165
Salad       1148
Cake        1139
Sandwich    1131
Smoothie    1096
Cookie      1092
Tea         1089
UNKNOWN      344
ERROR        292
Name: count, dtype: int64

Apparently there are two suspicious Item Name which are UNKNOWN and ERROR. Better to replace those values with <b>Others</b>

In [15]:
df = df_raw.copy()

In [16]:
df['Item'] = df['Item'].replace({'UNKNOWN':'Others', 'ERROR':'Others'})

In [17]:
df['Item'].value_counts()

Item
Juice       1171
Coffee      1165
Salad       1148
Cake        1139
Sandwich    1131
Smoothie    1096
Cookie      1092
Tea         1089
Others       636
Name: count, dtype: int64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    10000 non-null  object
 1   Item              9667 non-null   object
 2   Quantity          9862 non-null   object
 3   Price Per Unit    9821 non-null   object
 4   Total Spent       9827 non-null   object
 5   Payment Method    7421 non-null   object
 6   Location          6735 non-null   object
 7   Transaction Date  9841 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB
