In [None]:
# pip install kafka-python-ng

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from kafka import KafkaProducer
import json
from json import dumps
from time import sleep

In [None]:
# Never truncate the column list when a wide frame is displayed.
pd.set_option("display.max_columns", None)

# Read the three source tables: articles (df), customers and transactions.
df, customer_df, trans_df = [
    pd.read_csv(csv_path)
    for csv_path in ('articles.csv', 'customers.csv', 'young_female_trans.csv')
]

df.head(5)

# Row counts, handy while exploring:
# print(df.shape[0])
# print(customer_df.shape[0])
# print(trans_df.shape[0])

# Initial Data Analysis

In [None]:
df.info()

In [None]:
# Convert those items code/no into string instead of int
# Code/number columns are identifiers, not quantities, so store them as
# strings; this keeps them out of numeric summaries such as df.describe().
code_columns = [
    'product_code',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]
for column in code_columns:
    df[column] = df[column].apply(str)

df.head()

In [None]:
customer_df.info()

In [None]:
trans_df.info()

In [None]:
df.describe()

In [None]:
df.describe(include="object").T

(Part of the analysis is omitted here.)

<!-- `product_code` & `prod_name`                : There are a total of 47,224 unique product codes and 45,875 unique product names, which indicates that some product codes share the same product name. The most frequent product is "Dragonfly dress", which appears 98 times in the dataset. \
\
`product_type_no` & `product_type_name`     : There are 132 unique product type numbers and 131 unique product type names. The most frequent product type is "Trousers", which appears 1,169 times. The two counts differ slightly, which may need further investigation later. \
\
`product_group_name`                        : There are 19 unique product groups, and the most frequent product group is "Garment Upper Body", which accounts for around 40% of the products. \
\
`graphical_appearance_no` & `graphical_appearance_name`         : Both of them having 30 unique of graphical appearance. The top graphical appearance is "Solid", it appears 49,747 times. \
\
`colour_group_code` & `colour_group_name`   : There are 50 unique colors in the product. The most frequent is "black" color which appear 22,670 times. \
\
`perceived_colour_value_id` & `perceived_colour_value_name`     : There are 8 unique perceived colors in the product. The most frequent is "Dark" color which appear 42,706 times. \
\
`perceived_colour_master_id` & `perceived_colour_master_name`   : There are 20 unique perceived colors in the product. The most frequent is "Black" color which appear 22,585 times. \
\
`department_no` & `department_name`         : There are 299 departments; the most frequent department name is "Jersey".  \
\
`index_code` & `index_name`                 : There are 10 unique index code, the top most index product is ladieswear, which appears for 26,001. \
\
`index_group_no` & `index_group_name`       : There are 5 unique index group code, the top most index product is ladieswear, which appears for 39,737. \
\
`section_no` & `section_name`               : There are 57 unique section code, but 56 in section name, the top most index product is ladieswear, which appears for 7,295. There is a minor different counts between section no and section name.  \
\
`garment_group_no` & `garment_group_name`   : There are 21 unique index code, the top most index product is ladieswear, which appears for 21,445. \
\
`detail_desc`                               : There are 43,404 unique detail descriptions, which indicates that some product codes share the same description.
\ -->

In [None]:
customer_df.describe()

In [None]:
customer_df.describe(include="object").T

In [None]:
trans_df.info()
trans_df["t_dat"] = pd.to_datetime(trans_df["t_dat"])

In [None]:
trans_df.info()

In [None]:
trans_df.describe()

In [None]:
trans_df.describe(include="object").T


# Data Cleaning and Transformation

## Handle Missing Value

In [None]:
# Null count per column, then the percentage of rows affected for every
# column that has at least one null, ordered from smallest to largest.
missing_data = df.isnull().sum()
missing_percent = (missing_data[missing_data > 0] / df.shape[0] * 100).sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(10, 1))
ax.barh(missing_percent.index, missing_percent, color="#2337C6")

# Write each percentage just past the end of its bar.
for bar_pos, pct in enumerate(missing_percent):
    ax.text(pct, bar_pos, f"{pct:.2f}%", ha='left', va='center')

ax.set_xlim([0, 10])
ax.set_title("Percent of Missing Value: ", fontweight="bold")
ax.set_xlabel("Percent", fontsize=10)
plt.show()


In [None]:
df[df["detail_desc"].isnull()].head()

In [None]:
# df = df.dropna(subset={'detail_desc'})
df["detail_desc"] = df["detail_desc"].fillna("No Description")

In [None]:
df.isnull().sum()

In [None]:
missing_cust_data = customer_df.isnull().sum()
print(missing_cust_data)

# customer_df[customer_df["club_member_status"].isnull()].head()

In [None]:
# Replacement value for nulls in each customer column. Flags fall back to 0,
# categorical fields to "UNKNOWN", and age to the column median (open
# question carried over from the original cell: is the median the right fill?).
customer_fill_values = {
    "FN": 0,
    "Active": 0,
    "club_member_status": "UNKNOWN",
    "fashion_news_frequency": "UNKNOWN",
    "age": customer_df["age"].median(),
}
for column, fill_value in customer_fill_values.items():
    customer_df[column] = customer_df[column].fillna(fill_value)

In [None]:
customer_df.isnull().sum()

In [None]:
missing_trans_data = trans_df.isnull().sum()
print(missing_trans_data)

## Handling Duplicates

In [None]:
# Articles: keep every member of each duplicate group for inspection
# (keep=False), while the reported count uses the default keep="first",
# i.e. the number of surplus rows.
article_dup_mask = df.duplicated(keep=False)
dup_rows = df.loc[article_dup_mask]
dup_rows_sorted = dup_rows.sort_values(by=["product_code"])
# dup_rows_sorted.head()
article_dup_count = df.duplicated().sum()
print(f"The dataset contains {article_dup_count} duplicate rows that need to be removed.")

# Customers: same check, ordered by customer id.
customer_dup_mask = customer_df.duplicated(keep=False)
dup_cust_rows = customer_df.loc[customer_dup_mask]
dup_cust_rows_sorted = dup_cust_rows.sort_values(by=['customer_id'])
customer_dup_count = customer_df.duplicated().sum()
print(f"The customers dataset contains {customer_dup_count} duplicate rows that need to be removed")

# Transactions: same check, ordered by date and then customer.
trans_dup_mask = trans_df.duplicated(keep=False)
dup_tran_rows = trans_df.loc[trans_dup_mask]
dup_tran_rows_sorted = dup_tran_rows.sort_values(by=["t_dat", "customer_id"])
trans_dup_count = trans_df.duplicated().sum()
print(f"The transactions dataset contains {trans_dup_count}")

In [None]:
df.shape[0]

# Correcting product name

In [None]:
# Distinct product names, and how many there are.
unique_prod_name = df["prod_name"].unique()
print(len(unique_prod_name))

# Collect the names whose text between the first "(" and the first ")" is
# made up of digits only, e.g. a trailing "(1)".
nlist = []
for prod_name in unique_prod_name:
    open_pos = prod_name.find("(")
    close_pos = prod_name.find(")")
    if open_pos == -1 or close_pos == -1:
        continue
    if prod_name[open_pos + 1:close_pos].isdigit():
        nlist.append(prod_name)

print(nlist)

In [None]:
def remove_parentheses(text):
    """Return ``text`` without its parenthesised parts, whitespace-normalised.

    The text is split on whitespace and every word that contains "(" or ")"
    is dropped. When a word opens a parenthetical that is closed by a later
    word, the words in between are dropped as well, so
    "Top (long sleeve fit) blue" becomes "Top blue" rather than
    "Top sleeve blue". If an opening bracket is never closed, only the words
    that contain a bracket are removed.

    Args:
        text: A string such as a product name.

    Returns:
        The remaining words joined by single spaces.
    """
    words = text.split()
    kept = []
    idx = 0
    while idx < len(words):
        word = words[idx]
        if "(" not in word and ")" not in word:
            kept.append(word)
            idx += 1
            continue

        # The word touches a bracket and is always dropped. If its last "("
        # comes after its last ")", it leaves a group open; skip ahead to the
        # word that closes the group, provided one exists.
        if word.rfind("(") > word.rfind(")"):
            close_idx = next(
                (j for j in range(idx + 1, len(words)) if ")" in words[j]),
                None,
            )
            if close_idx is not None:
                idx = close_idx + 1
                continue
        idx += 1
    return " ".join(kept)

In [None]:
df["prod_name"] = df["prod_name"].apply(remove_parentheses) # apply custom function

df.head()

In [None]:
trans_df.head()

In [None]:
customer_df.head()

In [None]:
df.head()

## Prepare dataframe

In [None]:
# Use a self-describing name for the transaction date column.
trans_df = trans_df.rename(columns={"t_dat": "trans_date"})
trans_df.head()

In [None]:
def create_date_table(start="2016-01-01", end="2026-12-31"):
    """Build a calendar table with one row per day from ``start`` to ``end``.

    Args:
        start: First date of the range, passed to ``pd.date_range``.
        end: Last date of the range (inclusive), passed to ``pd.date_range``.

    Returns:
        DataFrame with the columns ``Date``, ``day``, ``day_of_week``
        (Monday is 0), ``week`` (``%W`` formatted string, Monday as the first
        day of the week; ``%U`` would use Sunday), ``month``, ``month_name``,
        ``quarter`` and ``year``.
    """
    dates = pd.Series(pd.date_range(start, end), name="Date")
    parts = dates.dt
    return pd.DataFrame(
        {
            "Date": dates,
            "day": parts.day,
            "day_of_week": parts.weekday,
            "week": parts.strftime("%W"),
            "month": parts.month,
            "month_name": parts.strftime("%B"),
            "quarter": parts.quarter,
            "year": parts.year,
        }
    )


In [None]:
date_dim = create_date_table()

In [None]:
date_dim.head()

In [None]:
# The customer frame's activity flag is stored under "Active" (that is the
# name used when its nulls are filled earlier in the notebook).
# DataFrame.filter silently ignores labels that do not exist, so requesting
# 'active' used to drop the flag from the dimension without any error.
# Rename first so the flag is kept and exposed under the lower-case name the
# selection asks for.
customer_dim = (
    customer_df
    .rename(columns={"Active": "active"})
    .filter(['customer_id', 'active', 'club_member_status', 'fashion_news_frequency', 'age'])
)
customer_dim.head()

In [None]:
# Expose the article name under the column name used by the product dimension.
df = df.rename(columns={'prod_name': 'product_name'})

In [None]:
df.head()

In [None]:
# Columns that make up the product dimension, in output order.
product_dim_columns = [
    'article_id',
    'product_code',
    'product_name',
    'product_type_no',
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_no',
    'department_name',
    'index_name',
    'index_group_name',
    'section_no',
    'section_name',
    'garment_group_name',
    'detail_desc',
]
product_dim = df.filter(items=product_dim_columns)

In [None]:
product_dim.head()

# Kafka Producer

In [None]:
# Broker list used when this helper has to create its own producer.
KAFKA_BOOTSTRAP_SERVERS = ['34.133.203.182:9092']

# Producer created on first use and then shared by later calls, so that a
# new broker connection is not opened for every single message.
_shared_producer = None


def send_json_data_kafka(topic, jdata, producer=None, flush=True):
    """Publish one JSON-serialisable record to a Kafka topic.

    Args:
        topic: Name of the topic to publish to.
        jdata: The record; it is serialised with ``json.dumps`` and encoded
            as UTF-8 by the producer this helper creates.
        producer: Optional producer to send through. When omitted, a single
            module-level ``KafkaProducer`` is created on first use and reused
            afterwards. A producer passed in is used as-is, including
            whatever serialiser it was configured with.
        flush: When true (the default), block until the producer has sent
            its buffered records before returning. Pass ``False`` inside
            large loops and flush once at the end.

    Returns:
        Whatever ``producer.send`` returns (a future for the pending record
        in kafka-python).
    """
    global _shared_producer
    if producer is None:
        if _shared_producer is None:
            _shared_producer = KafkaProducer(
                bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS,
                value_serializer=lambda x: dumps(x).encode('utf-8'),
            )
        producer = _shared_producer

    pending = producer.send(topic, jdata)
    if flush:
        producer.flush()
    return pending


In [None]:
trans_df.info()

In [None]:
# Number of leading transaction rows published as a sample.
TRANS_SAMPLE_SIZE = 100

# JSON cannot carry datetime values, so the date column is stored as
# "YYYY-MM-DD" text. The dtype check keeps the cell re-runnable: once the
# column holds strings the `.dt` accessor would raise.
if pd.api.types.is_datetime64_any_dtype(trans_df["trans_date"]):
    trans_df["trans_date"] = trans_df["trans_date"].dt.strftime("%Y-%m-%d")

# One producer for the whole batch (previously this one was created but never
# used, while every row opened a producer of its own).
producer = KafkaProducer(bootstrap_servers=['34.133.203.182:9092'], value_serializer=lambda x: json.dumps(x).encode('utf-8'))

for _, row in trans_df.head(TRANS_SAMPLE_SIZE).iterrows():
    producer.send("trans_fact", row.to_dict())

# Wait until the buffered records have actually been sent; this replaces the
# fixed sleep(10), which only ran when the 100th row was reached.
producer.flush()

In [None]:
date_dim.head()

In [None]:
def convert_timestamps(row):
    """Return a plain dict copy of ``row`` with timestamps rendered as text.

    Args:
        row: Any object with an ``items()`` method yielding (column, value)
            pairs, such as a pandas Series or a dict.

    Returns:
        A dict with the same keys; every ``pd.Timestamp`` value is replaced
        by its ``'%Y-%m-%d'`` string and every other value is kept unchanged.
    """
    converted = {}
    for col, val in row.items():
        if isinstance(val, pd.Timestamp):
            val = val.strftime('%Y-%m-%d')
        converted[col] = val
    return converted

# Publish every calendar row; timestamps are turned into date strings first
# because the JSON serialiser cannot encode them.
for _, date_row in date_dim.iterrows():
    send_json_data_kafka("date_dim", convert_timestamps(date_row))

In [None]:
# Publish only the first 300 customers as a sample.
for _, customer_row in customer_dim.head(300).iterrows():
    send_json_data_kafka("customer_dim", customer_row.to_dict())


In [None]:
# Publish every product row to the product dimension topic.
for _, product_row in product_dim.iterrows():
    send_json_data_kafka("product_dim", product_row.to_dict())