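Description: ETL notebook that extracts the raw customer, product and sales CSV files, cleans them with pandas, and opens a MySQL connection (presumably to load the cleaned data).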

# ----------------------------------
# Import required Libraries
# ----------------------------------

In [41]:
import pandas as pd
import mysql.connector
import os
from datetime import datetime
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the
# os.getenv() lookups in the configuration cell below can see them.
load_dotenv()

True

# ----------------------------------
# Database Configuration (from Environment Variables)
# ----------------------------------

In [42]:
# Connection settings for MySQL, read from environment variables so that
# credentials are never hardcoded in the notebook.
DB_CONFIG = {
    "host": os.getenv("DB_HOST"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "database": os.getenv("DB_NAME"),
    # Environment values are strings (or None when unset). Convert the port
    # explicitly and fall back to MySQL's default port when DB_PORT is missing
    # or empty, instead of failing later inside the connector.
    "port": int(os.getenv("DB_PORT") or 3306),
}

# ----------------------------------
# Database Connection
# ----------------------------------

In [43]:
# Open one MySQL connection using the settings in DB_CONFIG, then create a
# cursor on it. Both are notebook-level names, presumably reused by later cells.
# NOTE(review): no visible cell closes `cursor` or `conn` — confirm that a
# later cell calls cursor.close() / conn.close().
conn = mysql.connector.connect(**DB_CONFIG)
cursor = conn.cursor()

# ----------------------------------
# Extraction
# ----------------------------------

In [66]:
# Read the three raw CSV extracts (paths are relative to the notebook's
# working directory) into one DataFrame each, in a single unpacking step.
customers, products, sales = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../customers_raw.csv",
        "../products_raw.csv",
        "../sales_raw.csv",
    )
)

# ----------------------------------
# Utility Functions
# ----------------------------------

In [72]:
def handle_missing_values(df):
    """Check a DataFrame for missing values and fill or drop them per column.

    Columns of dtype int64/float64 that contain gaps are filled with the
    column mean; for every other column, rows missing a value are dropped.
    Columns are processed in DataFrame order, so a mean is computed over the
    rows that remain after any earlier row drops.

    Args:
        df: DataFrame to clean. It is modified in place and also returned.

    Returns:
        tuple: ``(df, missing_col)`` where ``missing_col`` is a list of
        ``"col <name>: <count> missing values"`` strings, one per column that
        had gaps. Counts are taken before any filling or dropping.
    """
    missing_col = []
    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
    missing_summary = df.isnull().sum()
    for col, cnt in missing_summary.items():
        if cnt == 0:
            continue
        missing_col.append(f"col {col}: {cnt} missing values")
        if col in numeric_cols:
            # Assign the filled column back instead of calling
            # df[col].fillna(..., inplace=True): the chained in-place form
            # operates on a temporary Series and leaves df unchanged when
            # pandas Copy-on-Write is active (and warns on pandas 2.2).
            df[col] = df[col].fillna(df[col].mean())
        else:
            df.dropna(subset=[col], inplace=True)

    return df, missing_col

def parse_date(date_str):
    """Parse a date written in one of several known formats.

    Formats are tried in this order: ``YYYY-MM-DD``, ``DD/MM/YYYY``,
    ``MM-DD-YYYY``, ``MM/DD/YYYY``. Day-first is tried before month-first for
    slash-separated values, so an ambiguous string such as ``"03/04/2023"``
    is read as 3 April 2023.

    Args:
        date_str: Value to parse. It is converted with ``str()`` and stripped
            of surrounding whitespace before matching.

    Returns:
        datetime.date | None: The parsed date, or None when the value matches
        none of the formats (this includes None and NaN inputs).
    """
    # Strip first: strptime rejects leading/trailing whitespace, which would
    # otherwise turn a valid but padded date into None.
    text = str(date_str).strip()
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%m-%d-%Y", "%m/%d/%Y"):
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            continue
    return None

def standardize_phone(phone):
    """Normalise a phone number to the form ``+91-XXXXXXXXXX``.

    All non-digit characters are discarded and the last ten digits are kept,
    which drops any country code or leading zero.

    Args:
        phone: Raw phone value (string or number); may be missing.

    Returns:
        str | None: ``"+91-"`` followed by the last ten digits, or None when
        the value is missing or contains fewer than ten digits.
    """
    if pd.isna(phone):
        return None
    # A numeric column with gaps is read as float, so the number arrives as
    # e.g. 9876543210.0; convert to int so the trailing ".0" is not counted
    # as a digit and shifted into the result.
    if isinstance(phone, float) and phone.is_integer():
        phone = int(phone)
    digits = ''.join(ch for ch in str(phone) if ch.isdigit())
    # Fewer than ten digits cannot form a valid number; return None rather
    # than a malformed value such as "+91-" or "+91-12345".
    if len(digits) < 10:
        return None
    return "+91-" + digits[-10:]

In [73]:
# Transform: Customers
# The steps below run in order and modify `customers` itself, so re-running
# this cell works on the already-cleaned frame, not on the raw CSV contents.
# 1. Remove rows that are exact duplicates across all columns.
customers.drop_duplicates(inplace=True)
# 2. Fill or drop missing values; the per-column report of what was missing
#    is kept in missing_cust_cols.
customers, missing_cust_cols = handle_missing_values(customers)
# 3. Convert registration_date to datetime.date; values matching none of the
#    formats known to parse_date become None.
customers['registration_date'] = customers['registration_date'].apply(parse_date)
# 4. Normalise phone numbers to the +91-prefixed form built by standardize_phone.
customers['phone'] = customers['phone'].apply(standardize_phone)

In [74]:
# Display the cleaned customers frame to check the result of the transform.
# NOTE(review): the rendered output contains names, emails and phone numbers —
# confirm the data is synthetic before sharing this notebook.
customers

Unnamed: 0,customer_id,first_name,last_name,email,phone,city,registration_date
0,C001,Rahul,Sharma,rahul.sharma@gmail.com,+91-9876543210,Bangalore,2023-01-15
1,C002,Priya,Patel,priya.patel@yahoo.com,+91-9988776655,Mumbai,2023-02-20
3,C004,Sneha,Reddy,sneha.reddy@gmail.com,+91-9123456789,Hyderabad,2023-04-15
4,C005,Vikram,Singh,vikram.singh@outlook.com,+91-9988112233,Chennai,2023-05-22
5,C006,Anjali,Mehta,anjali.mehta@gmail.com,+91-9876543210,Bangalore,2023-06-18
7,C008,Pooja,Iyer,pooja.iyer@gmail.com,+91-9123456780,Bangalore,2023-08-15
8,C009,Karthik,Nair,karthik.nair@yahoo.com,+91-9988776644,Kochi,2023-09-30
9,C010,Deepa,Gupta,deepa.gupta@gmail.com,+91-9871234567,Delhi,2023-10-12
11,C011,Arjun,Rao,arjun.rao@gmail.com,+91-9876509876,Hyderabad,2023-11-05
13,C013,Suresh,Patel,suresh.patel@outlook.com,+91-9123409876,Mumbai,2024-01-08
