# In [1]:
import pandas as pd
import datetime

# Load each Olist CSV export into its own DataFrame. The paths are relative,
# so the script has to run from the directory that contains ./csv.
# Names and paths are paired by position: keep both sequences in the same order.
(
    customers_df,
    products_df,
    sellers_df,
    orders_df,
    order_payments_df,
    order_reviews_df,
    order_items_df,
    geolocation_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./csv/olist_customers_dataset.csv",
        "./csv/olist_products_dataset.csv",
        "./csv/olist_sellers_dataset.csv",
        "./csv/olist_orders_dataset.csv",
        "./csv/olist_order_payments_dataset.csv",
        "./csv/olist_order_reviews_dataset.csv",
        "./csv/olist_order_items_dataset.csv",
        "./csv/olist_geolocation_dataset.csv",
    )
)
# NOTE: loading of a separate category table is currently disabled:
#category_df = pd.read_csv("./datasets/category.csv")

#CLEAN DATA
#-----------------------------------------------------------------------------------------------------------------

# Geolocation
# Keep only the first row of every (zip code prefix, latitude, longitude)
# combination. The original author reports that this removes 261831 duplicate
# rows from the geolocation table.
clean_geo_df = geolocation_df.drop_duplicates(
    subset=[
        "geolocation_zip_code_prefix",
        "geolocation_lat",
        "geolocation_lng",
    ],
    keep="first",
)

# Payments
# Discard payment rows whose amount is exactly 0. According to the original
# author these are 3 payments with an undefined payment type plus 6 voucher
# payments.
clean_pay_df = order_payments_df.loc[order_payments_df["payment_value"].ne(0)]

# Products
# Drop three columns that the queries below do not use: product_name_lenght,
# product_description_lenght and product_photos_qty (spelling as in the data).
clean_products_df = products_df.drop(
    columns=[
        "product_name_lenght",
        "product_description_lenght",
        "product_photos_qty",
    ]
)

#QUERIES
#-----------------------------------------------------------------------------------------------------------------

# Total number of customers (count of non-null customer ids)
total_customers = customers_df["customer_id"].count()

# Total number of products (count of non-null product ids)
total_products = clean_products_df["product_id"].count()

# Total number of orders (count of non-null order ids)
total_orders = orders_df["order_id"].count()

# Number of orders for each order status (being shipped, delivered, ...)
total_orders_by_status = orders_df["order_status"].value_counts()

# Number of orders per month of purchase.
# NOTE: only the month number is extracted, so the same month of different
# years ends up in a single bucket.
# Side effect: adds a "month" column to orders_df.
orders_df["month"] = pd.to_datetime(orders_df["order_purchase_timestamp"]).dt.month
total_orders_by_month = orders_df["month"].value_counts().sort_index()

# Average basket: mean total amount paid per order.
# An order may be settled through several payment rows (for instance a voucher
# combined with another payment type), so the rows are first summed per
# order_id. Averaging the raw rows would give the mean payment line instead of
# the mean order value. Zero-value rows are already excluded by clean_pay_df.
mean_payment = clean_pay_df.groupby("order_id")["payment_value"].sum().mean()

# Average satisfaction score (rating given on the order)
mean_reviews = order_reviews_df["review_score"].mean()

# Number of sellers (count of non-null seller ids)
total_sellers = sellers_df["seller_id"].count()

# Number of sellers per state
total_sellers_by_state = sellers_df["seller_state"].value_counts()

# Average time elapsed between the purchase and the delivery to the customer.
# Any missing delivery date is parsed as NaT, which mean() skips by default.
purchase = pd.to_datetime(orders_df["order_purchase_timestamp"])
delivered = pd.to_datetime(orders_df["order_delivered_customer_date"])
delivery_duration = delivered.sub(purchase)
delivery_duration_mean = delivery_duration.mean()
# Quantity of products sold per category: every row of the merged order-items
# table counts once, and the result is sorted by category name.
# No join key is given, so pandas joins on the columns both frames share.
merged_order_items = order_items_df.merge(clean_products_df)
sold_by_category = (
    merged_order_items["product_category_name"].value_counts().sort_index()
)

# Number of orders per calendar day of purchase, in chronological order.
# Side effect: adds a "date" column (datetime.date values) to orders_df.
orders_df["date"] = pd.to_datetime(orders_df["order_purchase_timestamp"]).dt.date
total_orders_by_date = orders_df["date"].value_counts().sort_index()

# Number of orders per city (city of the seller), sorted by city name.
# The join key is spelled out so the merge does not silently change if the two
# frames ever come to share another column.
merged_df = pd.merge(order_items_df, sellers_df, on="seller_id")

# The order-items table is expected to hold one row per purchased item
# (Olist schema - confirm against the CSV), so counting rows directly would
# count items rather than orders. Each (order, seller city) pair is therefore
# counted once. An order served by sellers from several cities still counts
# once in each of those cities.
total_orders_by_seller_city = (
    merged_df.drop_duplicates(subset=["order_id", "seller_city"])
    .seller_city.value_counts()
    .sort_index()
)

# Minimum and maximum order value.
# Both extremes are taken from the same population: delivered orders, with all
# (non-zero) payment rows of an order summed into one order total. Previously
# the minimum was the smallest single payment row of a delivered order, while
# the maximum was the largest single payment row of any order, so the two
# figures were not comparable.
merged_payment = pd.merge(orders_df, clean_pay_df, on="order_id")
merged_payment = merged_payment.loc[merged_payment["order_status"] == "delivered"]

# One total per delivered order.
delivered_order_totals = merged_payment.groupby("order_id")["payment_value"].sum()

# Minimum order value
min_order_value = delivered_order_totals.min()

# Maximum order value
max_order_value = delivered_order_totals.max()

