In [1]:
import os
import pandas as pd

# folder where CSV files are located
folder_path = 'LufthansaTask'

# List all files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Initialize a dictionary to store DataFrames, where each file is a key
data_dict = {}

# Loop through each CSV file
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    
    # Read the CSV file into a DataFrame
    data = pd.read_csv(file_path)
    
    # Store the DataFrame in the dictionary with the file name as the key
    data_dict[file] = data
    print(file_path)

LufthansaTask\olist_customers_dataset.csv
LufthansaTask\olist_geolocation_dataset.csv
LufthansaTask\olist_orders_dataset.csv
LufthansaTask\olist_order_items_dataset.csv
LufthansaTask\olist_order_payments_dataset.csv
LufthansaTask\olist_order_reviews_dataset.csv
LufthansaTask\olist_products_dataset.csv
LufthansaTask\olist_sellers_dataset.csv
LufthansaTask\product_category_name_translation.csv


In [2]:
# example printing first records of data_dict['olist_customers_dataset.csv'].head()
# Print top 5 rows for each dataset
for file, df in data_dict.items():
    print(f"\n--- {file} ---\n")
    print(df.head())
    print(f"Dimensions of {file} are: {df.shape}")
    
    print("---------------------------------------------------------------------------------------------------")



--- olist_customers_dataset.csv ---

                        customer_id                customer_unique_id  \
0  06b8999e2fba1a1fbc88172c00ba8bc7  861eff4711a542e4b93843c6dd7febb0   
1  18955e83d337fd6b2def6b18a428ac77  290c77bc529b7ac935b93aa66c333dc3   
2  4e7b3e00288586ebd08712fdd0374a03  060e732b5b29e8181a18229c7b0b2b5e   
3  b2b6027bc5c5109e529d4dc6358b12c3  259dac757896d24d7702b9acbbff3f3c   
4  4f2d8ab171c80ec8364f7c12e35b23ad  345ecd01c38d18a9036ed96c73b8d066   

   customer_zip_code_prefix          customer_city customer_state  
0                     14409                 franca             SP  
1                      9790  sao bernardo do campo             SP  
2                      1151              sao paulo             SP  
3                      8775        mogi das cruzes             SP  
4                     13056               campinas             SP  
Dimensions of olist_customers_dataset.csv are: (99441, 5)
---------------------------------------------------------

## Check data quality for each dataset

In [4]:
# function to check data quality
def check_data_quality(data_dict):
    for file_name, df in data_dict.items():
        print(f"\n========== Analyzing {file_name} ==========\n")
        
        # display data types
        print("Data Types:")
        print(df.dtypes)
        
        # check for missing values
        print("\nMissing Values:")
        print(df.isnull().sum())

        # check for duplicate rows
        print("\nDuplicate Rows:", df.duplicated().sum())

        # display general statistics
        print("\nSummary Statistics:")
        print(df.describe(include='all'))  # Include categorical data as well

        print("\n==================================================================================================================\n")

# Run the data quality check
check_data_quality(data_dict)



Data Types:
customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

Missing Values:
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

Duplicate Rows: 0

Summary Statistics:
                             customer_id                customer_unique_id  \
count                              99441                             99441   
unique                             99441                             96096   
top     06b8999e2fba1a1fbc88172c00ba8bc7  8d50f5eadf50201ccdcedfb9e2ac8455   
freq                                   1                                17   
mean                                 NaN                               NaN   
std                                  NaN                               NaN   
min                            

## Do all the neccessary data preprocessing/cleaning for all datasets

#### Group By geolocation_zip_code_prefix from olist_geolocation_dataset dataset

In [7]:
# load the geolocation dataset
geolocation = data_dict['olist_geolocation_dataset.csv']

# aggregate duplicate zip codes by taking median lat/lng and most frequent city/state
geolocation = geolocation.groupby('geolocation_zip_code_prefix').agg({
    'geolocation_lat': 'median',
    'geolocation_lng': 'median',
    'geolocation_city': lambda x: x.mode()[0],   # simpler and clearer
    'geolocation_state': lambda x: x.mode()[0]
}).reset_index()


In [8]:
geolocation.to_csv('geolocation_cleaned.csv', index=False, sep=';')

In [9]:
geolocation.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1001,-23.550381,-46.634027,sao paulo,SP
1,1002,-23.548551,-46.635072,sao paulo,SP
2,1003,-23.548977,-46.635313,sao paulo,SP
3,1004,-23.549535,-46.634771,sao paulo,SP
4,1005,-23.549612,-46.636532,sao paulo,SP


In [10]:
geolocation.shape

(19015, 5)

#### Drop NA values, convert all timestamps columns in datetime format at olist_orders_dataset dataset

In [12]:
# drop NA values from orders dataset
orders = 'olist_orders_dataset.csv'
orders = data_dict[orders].dropna()

print(orders.shape)

(96461, 8)


In [13]:
# list of columns to convert to datetime
timestamp_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]

# convert the timestamp columns to datetime format
for col in timestamp_columns:
    orders[col] = pd.to_datetime(orders[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orders[col] = pd.to_datetime(orders[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orders[col] = pd.to_datetime(orders[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orders[col] = pd.to_datetime(orders[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

#### Convert shipping_limit_date which is a timestamp column to datetime format at olist_order_items_dataset dataset

In [15]:
order_items = data_dict['olist_order_items_dataset.csv']

# convert 'shipping_limit_date' to datetime
order_items['shipping_limit_date'] = pd.to_datetime(order_items['shipping_limit_date'])

#### Convert review_answer_timestamp to datetime format and fill the emplty records with 'No Comment' from olist_order_reviews_dataset.csv

In [17]:
order_reviews = data_dict['olist_order_reviews_dataset.csv']

# convert 'shipping_limit_date' to datetime
order_reviews['review_answer_timestamp'] = pd.to_datetime(order_reviews['review_answer_timestamp'])

In [18]:
# fill missing values in review_comment_title and review_comment_message with a comment
order_reviews['review_comment_title'] = order_reviews['review_comment_title'].fillna('No Comment')
order_reviews['review_comment_message'] = order_reviews['review_comment_message'].fillna('No Comment')

#### Drop NA values from olist_products_dataset.csv dataset

In [20]:
# drop NA values from products dataset
products = 'olist_products_dataset.csv'
products = data_dict[products].dropna()

print(products.shape)


(32340, 9)


In [21]:
products.head()

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


In [22]:
customers = data_dict['olist_customers_dataset.csv']
sellers = data_dict['olist_sellers_dataset.csv']

# merge customers with geolocation
customers = customers.merge(geolocation, left_on="customer_zip_code_prefix", right_on="geolocation_zip_code_prefix", how="left"
).drop(columns=["geolocation_zip_code_prefix"])

# merge sellers with geolocation
sellers = sellers.merge(geolocation, left_on="seller_zip_code_prefix", right_on="geolocation_zip_code_prefix", how="left"
).drop(columns=["geolocation_zip_code_prefix"])

## Creating Calculated Columns

In [24]:
order_payments = data_dict['olist_order_payments_dataset.csv']

### Total Price: Sum of product price and freight value.

In [26]:
# Total Price: Sum of product price and freight value.
order_items["total_price"] = order_items["price"] + order_items["freight_value"]

### Delivery Time: Difference between the delivery date and the order purchase date.

In [28]:
# Delivery Time: Difference between the delivery date and the order purchase date.
# merge order_items and orders
order_items_orders = order_items.merge(orders, on="order_id", how="left")

order_items_orders["delivery_time"] = (order_items_orders["order_delivered_customer_date"] - order_items_orders["order_purchase_timestamp"]).dt.days

### Payment Count: Sum of payment installments for each order.

In [30]:
# Payment Count: Sum of payment installments for each order.
payment_count = order_payments.groupby("order_id", as_index=False).agg(payment_count=("payment_installments", "sum"))

# 2. Merge order_items_orders with payment_count
order_items_orders_payments = order_items_orders.merge(payment_count, on="order_id", how="left")

### Profit Margin: Subtract freight value from product price to calculate a rough profit estimate.

In [32]:
# create the 'profit_margin' column
order_items_orders_payments["profit_margin"] = order_items_orders_payments["price"] - order_items_orders_payments["freight_value"]

## Using Window Functions Over Partitions (Pandas)

### Total Sales per Customer: A running total of product price for each customer partitioned by Customer ID

In [35]:
# merge order_items_orders_payments with customers
order_items_orders_payments_customers = order_items_orders_payments.merge(customers, on="customer_id", how="left")

order_items_orders_payments_customers["total_sales_per_customer"] = order_items_orders_payments_customers.groupby("customer_id")["total_price"].cumsum()

In [36]:
order_items_orders_payments_customers.shape

(112650, 27)

### Average Delivery Time per Product Category: A rolling average of delivery time partitioned by product category.

In [38]:
# Merge order_items_orders_payments_customers with products on 'product_id'
order_items_orders_payments_customers_products = order_items_orders_payments_customers.merge(products, on="product_id", how="left")
order_items_orders_payments_customers_products["avg_delivery_time_per_product_category"] = order_items_orders_payments_customers_products.groupby("product_category_name")["delivery_time"].transform(lambda x: x.rolling(window=5, min_periods=1).mean())

In [39]:
order_items_orders_payments_customers_products = order_items_orders_payments_customers_products.dropna()

In [40]:
order_items_orders_payments_customers_products.shape

(108357, 36)

## Saving Processed Data to SQL Server (Fact & Dimension Tables)

In [42]:
import urllib
from sqlalchemy import create_engine

In [43]:
# merge order_items_orders_payments with sellers
order_items_orders_payments_sellers = order_items_orders_payments.merge(sellers, on="seller_id", how="left")
# Merge order_items_orders_payments_sellers with products on 'seller_id'
order_items_orders_payments_sellers_products = order_items_orders_payments_sellers.merge(products, on="product_id", how="left")

#### Fact Table

In [45]:
# Fact Table: Includes calculated columns like Total Price, Delivery Time, etc.
fact_table = order_items_orders_payments_customers_products[[
    "order_id", "order_item_id", "product_id", "seller_id", "customer_id", "total_price",
    "delivery_time", "payment_count", "profit_margin", "total_sales_per_customer",
    "avg_delivery_time_per_product_category"
]]

#### Dimension Tables

In [47]:
# Customer Dimension Table
customer_dimension = order_items_orders_payments_customers_products[[
    "customer_id", "customer_zip_code_prefix", "customer_city", "customer_state"
]].drop_duplicates()

# Product Dimension Table
product_dimension = order_items_orders_payments_customers_products[[
    "product_id", "product_category_name", "product_name_lenght",
    "product_description_lenght", "product_photos_qty", "product_weight_g",
    "product_length_cm", "product_height_cm", "product_width_cm"
]].drop_duplicates()

# Seller Dimension Table
seller_dimension = order_items_orders_payments_sellers_products[[
    "seller_id", "seller_zip_code_prefix", "seller_city", "seller_state"
]].drop_duplicates()

# Date Dimension Table
date_dimension = order_items_orders_payments_customers_products[[
    "order_id", "order_purchase_timestamp", "order_delivered_customer_date"
]].drop_duplicates()

#### Connect to SQL Server

In [49]:
# connection string using Windows Authentication
connection_string = (
    r'DRIVER={ODBC Driver 17 for SQL Server};'
    r'SERVER=LAPTOP-9MIQ9SU9\SQLEXPRESS;'  # my server name
    r'DATABASE=LufthansaTask;'  # the database
    r'Trusted_Connection=yes;'
)

# create connection URL
connection_url = (
    'mssql+pyodbc:///?odbc_connect='
    + urllib.parse.quote_plus(connection_string)
)

# create the SQLAlchemy engine
engine = create_engine(connection_url, isolation_level="AUTOCOMMIT")

#### Save Tables to SQL Server

In [51]:
# Save the Fact Table
fact_table.to_sql("fact_order_items", con=engine, if_exists="replace", index=False)

# Save the Customer Dimension Table
customer_dimension.to_sql("dim_customers", con=engine, if_exists="replace", index=False)

# Save the Product Dimension Table
product_dimension.to_sql("dim_products", con=engine, if_exists="replace", index=False)

# Save the Seller Dimension Table
seller_dimension.to_sql("dim_sellers", con=engine, if_exists="replace", index=False)

# Save the Date Dimension Table
date_dimension.to_sql("dim_dates", con=engine, if_exists="replace", index=False)

504