In [20]:
# import necessary library
import pandas as pd
import random

## 1. Hello, Data 
* Load 1000 sales record file and just store 500 data set in data frame
* show only first 3 row

* reference
   - 1000 Sales Records : https://excelbianalytics.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/

In [21]:
# load data
raw_primary = pd.read_csv("data/1000 Sales Records.csv")

# declare 500 data and just store rows
primary_500 = raw_primary.head(500)

# show first 3 rows
primary_500.head(3)



Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
0,Middle East and North Africa,Libya,Cosmetics,Offline,M,10/18/2014,686800706,10/31/2014,8446,437.2,263.33,3692591.2,2224085.18,1468506.02
1,North America,Canada,Vegetables,Online,M,11/7/2011,185941302,12/8/2011,3018,154.06,90.93,464953.08,274426.74,190526.34
2,Middle East and North Africa,Libya,Baby Food,Offline,C,10/31/2016,246222341,12/9/2016,1517,255.28,159.42,387259.76,241840.14,145419.62


## 1-1 Load Secondary Metadata File 
* load secondary file
- Coupon reference: https://www.kaggle.com/datasets/rishikumarrajvansh/marketing-insights-for-e-commerce-company?select=Discount_Coupon.csv
- City reference: https://www.kaggle.com/datasets/dataanalyst001/all-capital-cities-in-the-world


In [22]:
raw_city_data = pd.read_csv("data/all capital cities in the world.csv")
raw_coupon_data = pd.read_csv("data/Discount_Coupon.csv")

## 2. Pick the Right Container
* Dictionary: made up of keys and values. quickly access a value using its key but can’t have duplicate keys
* Set: unordered collection of unique values, so duplicates aren’t allowed
* Namedtuple: immutable collection, so we can’t modify or delete its elements. However, it keeps the order and allows access by index.

For this lab assignment, I wll mainly use dictionaries and sets along with the dataframe.


In [23]:
# check data info/description
#primary_500.info()
#primary_500.describe()
#primary_500.isnull().sum()



## 3. Implement Functions and  Data structure
* In total, I created 4 main functions along with 3 helper functions to generate the missing columns
    * standardize(): keeps column names consistent and handles type casting
    * clean(): checks for null values, removes duplicates, and performs general cleaning
    * total: calculates total revenue using price and quantity
    * total_with_discount: calculates total revenue using price, quantity, and the discount rate

In [None]:
class Lab2Kihoon:
    
    def __init__(self):
        pass

    def standardize(data):
        
        # STEP1: rename columns to lowercase and remove space start and end for consitent
        data.columns = data.columns.str.lower().str.strip()

        # STEP2: replace space to _
        data.columns = data.columns.str.replace(" ","_")
        
        # STEP3: rename coulmns(Must contain)
        # example: item_type to product

        data = data.rename(columns={
            "item_type":"product",
            "unit_price":"price",
            "units_sold":"quantity",
            "order_date":"date"
            })

        
        # Step 4: type casting

        data["date"] = pd.to_datetime(data["date"])
        data["ship_date"] = pd.to_datetime(data["ship_date"])
        
        data["product"] = data["product"].astype("string")
        data["order_id"] = data["order_id"].astype("string")
        data["region"] = data["region"].astype("string")
        data["country"] = data["country"].astype("string")
        data["sales_channel"] = data["sales_channel"].astype("string")
        data["order_priority"] = data["order_priority"].astype("string")



        data["price"] = pd.to_numeric(data["price"])
        data["unit_cost"] = pd.to_numeric(data["unit_cost"])
        data["total_cost"] = pd.to_numeric(data["total_cost"])
        data["total_revenue"] = pd.to_numeric(data["total_revenue"])
        data["total_profit"] = pd.to_numeric(data["total_profit"])


        
        return data

    def clean(data):

        # print raw length
        print(f"Before row count: {len(data)}")

        
        # Step 5: some rows may not have a value, so fill with "NONE"
        data["coupon_code"] = data["coupon_code"].fillna("NONE")
        # In the current primary dataset, some countries don’t have matching city data
        # creates nulls in shipping_city. Fill those with "NONE" as well
        data["shipping_city"] = data["shipping_city"].fillna("NONE")
        # Step 6: Dropping rows with missing values (optional)
        data = data.dropna(subset=["date", "price", "quantity"])
        
        # Step 7: drop duplicate row
        data = data.drop_duplicates()

        # Step 8: strip extra whitespace in string columns
        data["region"] = data["region"].str.strip()
        data["country"] = data["country"].str.strip()
        data["sales_channel"] = data["sales_channel"].str.strip()
        data["order_priority"] = data["order_priority"].str.strip()
        data["order_id"] = data["order_id"].str.strip()
        data["product"] = data["product"].str.strip()

        print(f"After row count: {len(data)}")
        return data 

        
    def total(data):
        total = (data["price"] * data["price"]).sum()
        return total

    def total_with_coupon(data):
        total = (data["price"] * data["quantity"] * (1 - data["numeric_discount"]/100)).sum()
        return total
    


## STEP8(Partial). Transformations

- some required columns (shipping_city, coupon_code, customer_id) were missing, I performed partial transformations
    * add_customer_id: generates a customer ID using existing columns in the primary dataset
    * add_shipping_city(): joins the primary dataset with secondary metadata to add the shipping_city column
    * add_coupon_id(): joins the primary dataset with secondary metadata to add the coupon_id column

In [25]:
# making customer_id: combination of year +"CUST"+ order_id 

def add_customer_id(data):
    year = pd.to_datetime(data["date"]).dt.year.astype(str)
    order_id = data["order_id"].astype(str)
    
    data["customer_id"] = (year+"CUST"+order_id)


    data["customer_id"] = data["customer_id"].astype("string")

    return data

In [26]:
def add_shipping_city(primary_500, raw_city_data):

    # standilized value
    primary_500["country_key"] = (primary_500["country"].astype(str).str.strip().str.lower())


    needed_city_data = raw_city_data[["Country","Capital City"]]

    # rename column 
    needed_city_data = (needed_city_data[["Country", "Capital City"]].rename(columns={
        "Country": "sd_country",
        "Capital City": "sd_city"
        }))

    # standilized value
    needed_city_data["country_key"] = (needed_city_data["sd_country"].astype(str).str.strip().str.lower())

    # joim two dataset
    primary_500["sd_country"] = primary_500["country"]
    primary_500 = primary_500.merge(needed_city_data[["country_key", "sd_city"]], on="country_key", how="left")


    primary_500 = primary_500.drop(columns=["country_key","sd_country"])
    primary_500 = primary_500.rename(columns={
        "sd_city": "shipping_city"
    })

    primary_500["shipping_city"] = primary_500["shipping_city"].astype("string")

    return primary_500



In [27]:
def add_coupon_id(primary_500,raw_coupon_data):
    
    primary_500["order_month"] = primary_500["date"].dt.strftime("%b")

    needed_coupon_data = raw_coupon_data[["Month","Coupon_Code"]]

    # rename column 
    needed_coupon_data.columns = needed_coupon_data.columns.str.lower().str.strip()
    needed_coupon_data.columns = needed_coupon_data.columns.str.replace(" ","_")

    code_dict = needed_coupon_data.groupby("month")["coupon_code"].apply(list).to_dict()

    random_code = []

    for month_key in primary_500["order_month"]:
        coupon_code_list = code_dict.get(month_key)

        select = random.choice(coupon_code_list)

        random_code.append(select)

    primary_500["coupon_code"] = random_code

    primary_500["coupon_code"] = primary_500["coupon_code"].astype("string")

    return primary_500

In [None]:

primary_500 = Lab2Kihoon.standardize(primary_500)

primary_500 = add_coupon_id(primary_500,raw_coupon_data)

primary_500 = add_shipping_city(primary_500, raw_city_data)

primary_500 = add_customer_id(primary_500)


In [29]:
primary_500

Unnamed: 0,region,country,product,sales_channel,order_priority,date,order_id,ship_date,quantity,price,unit_cost,total_revenue,total_cost,total_profit,order_month,coupon_code,shipping_city,customer_id
0,Middle East and North Africa,Libya,Cosmetics,Offline,M,2014-10-18,686800706,2014-10-31,8446,437.20,263.33,3692591.20,2224085.18,1468506.02,Oct,NOTES10,Tripoli,2014CUST686800706
1,North America,Canada,Vegetables,Online,M,2011-11-07,185941302,2011-12-08,3018,154.06,90.93,464953.08,274426.74,190526.34,Nov,AIO20,Ottawa,2011CUST185941302
2,Middle East and North Africa,Libya,Baby Food,Offline,C,2016-10-31,246222341,2016-12-09,1517,255.28,159.42,387259.76,241840.14,145419.62,Oct,SALE10,Tripoli,2016CUST246222341
3,Asia,Japan,Cereal,Offline,C,2010-04-10,161442649,2010-05-12,3322,205.70,117.11,683335.40,389039.42,294295.98,Apr,HGEAR10,Tokyo,2010CUST161442649
4,Sub-Saharan Africa,Chad,Fruits,Offline,H,2011-08-16,645713555,2011-08-31,9845,9.33,6.92,91853.85,68127.40,23726.45,Aug,AND20,N'Djamena,2011CUST645713555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Middle East and North Africa,Algeria,Meat,Offline,M,2011-09-02,183022201,2011-10-15,9191,421.89,364.69,3877590.99,3351865.79,525725.20,Sep,AND30,Algiers,2011CUST183022201
496,Europe,Italy,Personal Care,Online,L,2011-03-21,127589738,2011-04-02,5494,81.73,56.67,449024.62,311344.98,137679.64,Mar,ACC30,Rome,2011CUST127589738
497,Europe,Russia,Fruits,Offline,L,2011-01-08,221530139,2011-01-26,4546,9.33,6.92,42414.18,31458.32,10955.86,Jan,EXTRA10,Moscow,2011CUST221530139
498,Central America and the Caribbean,Antigua and Barbuda,Office Supplies,Offline,M,2015-02-22,363329732,2015-02-22,6197,651.21,524.96,4035548.37,3253177.12,782371.25,Feb,NE20,Saint John's,2015CUST363329732


## 4. Bulk Loaded
* dictionary was created with product as the key and the total profit for each product as the value. This makes it easy to see the total profit broken down by product

In [30]:
profit_by_product__dict = primary_500.groupby("product")["total_profit"].sum().to_dict()

profit_by_product__dict

{'Baby Food': 24471524.24,
 'Beverages': 3717621.36,
 'Cereal': 19037016.51,
 'Clothes': 13516485.12,
 'Cosmetics': 36463320.92,
 'Fruits': 387887.09,
 'Household': 34060166.68,
 'Meat': 11766440.4,
 'Office Supplies': 33954306.25,
 'Personal Care': 6171175.36,
 'Snacks': 9934408.379999999,
 'Vegetables': 12445700.72}

## 5. Quick Profiling
* use built-in functions to calculate the minimum, mean, and maximum of the price column
* count the number of uniquu shipping cities using a set

In [31]:

# min/max/mean
min_price = primary_500["price"].min()
mean_price = primary_500["price"].mean()
max_price = primary_500["price"].max()

print(f"1. min price : {min_price}")
print(f"2. mean price : {mean_price}")
print(f"3. max price : {max_price}")


# unique city
cities_set = set(primary_500["shipping_city"])
unique_cities = len(cities_set)
print(f"4. unique cities count : {unique_cities}")



1. min price : 9.33
2. mean price : 274.29506
3. max price : 668.27
4. unique cities count : 166


## 6.  Spot the Grime
* order_id I personally think that order_id should act as a unique identifier, I checked for duplicates
* monetary values: I think they must be greater than or equal to zero, so I checked for negatives
* shipping_city: I think null values are not acceptable, so I checked for any missing entries.

In [None]:
# 1) checked for duplicates
duplicated_order_id = primary_500["order_id"].duplicated().sum()

print(f"1. The number of duplicated order_id: {duplicated_order_id}")
# 2) negative value check in price column
negative_price = (primary_500["price"]<0).sum()
negative_unit_cost = (primary_500["unit_cost"]<0).sum()
negative_total_cost = (primary_500["total_cost"]<0).sum()
negative_total_profit = (primary_500["total_profit"]<0).sum()
negative_total_revenue = (primary_500["total_revenue"]<0).sum()

print(f"2. The number of negative value in price: {negative_price}")
print(f"2. The number of negative value in unit_cost: {negative_unit_cost}")
print(f"2. The number of negative value in total_cost: {negative_total_cost}")
print(f"2. The number of negative value in total_profit: {negative_total_profit}")
print(f"2. The number of negative value in total_revenue: {negative_total_revenue}")



# 3) Null check in shipping city

null_city = primary_500["shipping_city"].isnull().sum()

print(f"3. The number of null value in shipping_city: {null_city}")



1. The number of duplicated order_id: 0
2. The number of negative value in price: 0
2. The number of negative value in unit_cost: 0
2. The number of negative value in total_cost: 0
2. The number of negative value in total_profit: 0
2. The number of negative value in total_revenue: 0
3. The number of null value in shipping_city: 17


## 7. Cleaning Rules
1) String columns: missing values were filled with None
2) Essential columns: rows with null values were dropped
3) Removed duplicates
4) Trimmed extra whitespace
5) Printed the values before and after cleaning

In [33]:
Lab2Kihoon.clean(primary_500)

Before row count: 500
After row count: 500


Unnamed: 0,region,country,product,sales_channel,order_priority,date,order_id,ship_date,quantity,price,unit_cost,total_revenue,total_cost,total_profit,order_month,coupon_code,shipping_city,customer_id
0,Middle East and North Africa,Libya,Cosmetics,Offline,M,2014-10-18,686800706,2014-10-31,8446,437.20,263.33,3692591.20,2224085.18,1468506.02,Oct,NOTES10,Tripoli,2014CUST686800706
1,North America,Canada,Vegetables,Online,M,2011-11-07,185941302,2011-12-08,3018,154.06,90.93,464953.08,274426.74,190526.34,Nov,AIO20,Ottawa,2011CUST185941302
2,Middle East and North Africa,Libya,Baby Food,Offline,C,2016-10-31,246222341,2016-12-09,1517,255.28,159.42,387259.76,241840.14,145419.62,Oct,SALE10,Tripoli,2016CUST246222341
3,Asia,Japan,Cereal,Offline,C,2010-04-10,161442649,2010-05-12,3322,205.70,117.11,683335.40,389039.42,294295.98,Apr,HGEAR10,Tokyo,2010CUST161442649
4,Sub-Saharan Africa,Chad,Fruits,Offline,H,2011-08-16,645713555,2011-08-31,9845,9.33,6.92,91853.85,68127.40,23726.45,Aug,AND20,N'Djamena,2011CUST645713555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Middle East and North Africa,Algeria,Meat,Offline,M,2011-09-02,183022201,2011-10-15,9191,421.89,364.69,3877590.99,3351865.79,525725.20,Sep,AND30,Algiers,2011CUST183022201
496,Europe,Italy,Personal Care,Online,L,2011-03-21,127589738,2011-04-02,5494,81.73,56.67,449024.62,311344.98,137679.64,Mar,ACC30,Rome,2011CUST127589738
497,Europe,Russia,Fruits,Offline,L,2011-01-08,221530139,2011-01-26,4546,9.33,6.92,42414.18,31458.32,10955.86,Jan,EXTRA10,Moscow,2011CUST221530139
498,Central America and the Caribbean,Antigua and Barbuda,Office Supplies,Offline,M,2015-02-22,363329732,2015-02-22,6197,651.21,524.96,4035548.37,3253177.12,782371.25,Feb,NE20,Saint John's,2015CUST363329732


## 8. Transformations
* The coupon_code column is currently stored as string. I extract only the numeric part and transform it into a new column called numeric_discount
* Country should transform to Title case

In [34]:
# 1. parse string to numeric
primary_500["numeric_discount"] = pd.to_numeric(primary_500["coupon_code"].str.extract(r"(\d+)")[0])


# 2. Uppercase
primary_500["country"] = primary_500["country"].str.title()

primary_500[["country","coupon_code","numeric_discount"]]

Unnamed: 0,country,coupon_code,numeric_discount
0,Libya,NOTES10,10
1,Canada,AIO20,20
2,Libya,SALE10,10
3,Japan,HGEAR10,10
4,Chad,AND20,20
...,...,...,...
495,Algeria,AND30,30
496,Italy,ACC30,30
497,Russia,EXTRA10,10
498,Antigua And Barbuda,NE20,20


## 9. Feature Engineering
* Calculated how long ago each transaction occurred by comparing the date column with today’s date

In [35]:
today = pd.to_datetime(pd.Timestamp.today().date())
primary_500["days_since_purchase"] = (today - primary_500["date"]).dt.days


primary_500[["date","days_since_purchase"]]

Unnamed: 0,date,days_since_purchase
0,2014-10-18,4000
1,2011-11-07,5076
2,2016-10-31,3256
3,2010-04-10,5652
4,2011-08-16,5159
...,...,...
495,2011-09-02,5142
496,2011-03-21,5307
497,2011-01-08,5379
498,2015-02-22,3873


## 10. Mini-Aggregation
* Used pandas groupby to show total revenue by shipping_city

In [36]:
revenue_by_shipping_city = primary_500.groupby("shipping_city")["total_revenue"].sum()

revenue_by_shipping_city

shipping_city
Abu Dhabi       665128.18
Abuja           131748.76
Accra          8138014.18
Addis Ababa     281251.62
Algiers        5752785.28
                  ...    
Wellington     6533529.54
Windhoek        344052.60
Yaounde          95209.92
Yerevan        7103563.75
Zagreb          879495.94
Name: total_revenue, Length: 166, dtype: float64

## 11. Serialization Checkpoint
*  Save the final cleaned dataset including the newly added columns from Steps 8,9(order_month...)into both CSV and JSON formats

In [37]:
# serialize to csv
primary_500.to_csv("output/final_clean_data.csv")

# serialzie to json(record = one row is one object)
primary_500.to_json("output/final_clean_data.json", orient="records")

## 12. Soft Interview Reflection

By structuring the code to funtions, I think It make the process resuable. I made six funtions in this project with clean() and standardize() forming the core. These functions enabled consistent schema normalization and basic data cleaning through simple function calls, streamlining the workflow. Also, whenver I have to change the code, I only needed to update the relevant functions, which greatly improved maintainability and reproducibility. Additionally, the overall readability of the code was significantly enhanced


In [38]:
primary_500.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   region               500 non-null    string        
 1   country              500 non-null    string        
 2   product              500 non-null    string        
 3   sales_channel        500 non-null    string        
 4   order_priority       500 non-null    string        
 5   date                 500 non-null    datetime64[ns]
 6   order_id             500 non-null    string        
 7   ship_date            500 non-null    datetime64[ns]
 8   quantity             500 non-null    int64         
 9   price                500 non-null    float64       
 10  unit_cost            500 non-null    float64       
 11  total_revenue        500 non-null    float64       
 12  total_cost           500 non-null    float64       
 13  total_profit         500 non-null  

# Data Dictionary

| Field| Type| Description|Source|How created|
|------|-------|-------|------|-------|
|region|string|Sales region|Primary|Original|
|country|string|Country|Primary|Original|
|product|string|product category|Derived(Primary)|rename item_type to product|
|sales_channel|string|Sales channel|Primary|Original|
|order_priority|string|Order priority|Primary|Original|
|date|date|Order date|Derived(Primary)|rename order_date to date|
|order_id|string|UniqueorderID|Primary|Original|
|ship_date|date|Shippingdate|Primary|Original|
|quantity|int|Unit sold|Derived(Primary)|rename Units sold to quantity|
|price|float64|Unit price|Derived(Primary)|rename Unit price to price|
|unit_cost|float64|Unit cost|Primary|Original|
|total_revenue|float64|Total revenue|Primary|Original|
|total_cost|float64|Total cost|Primary|Original|
|total_profit|float64|Total profit|Primary|Original|
|customer_id|string|Customer id|Combination(Primary)|date.year+'CUST'+order_id|
|order_month|string|Order month|Derived(Primary)|date→strftime('%b')|
|coupon_code|string|Coupon code|Derived(Secondary)|Join month with coupon metadata and fill the value random |
|shipping_city|string|Shipping city|Derived(Secondary)|Join country to city metadata|
|numeric_discount|int|Discount percentage|Derived(Secondary)|Extract digits from coupon_code|
|days_since_purchase|int|Days since purchase|Derived(Primary)|today−date|

## Secondary Fields Used (for reference)
| File|Used Field|Notes|
|------|-------|-------|
| Discount_Coupon.csv | Month, Coupon_Code | Left Join using month in primary to make coupon_code |
| City Metadata | Country, Capital City | Mapping Country→shipping_city |
