In [152]:
#librerias para analizar datos
import pandas as pd
import numpy as np

#librerias de graficos
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_colwidth', None)
pd.set_option('float_format', '{:,.2f}'.format)

In [153]:
customer_r = pd.read_xml('Customer_R.xml')
customer_r.head(2)

Unnamed: 0,CUSTOMER_ID,FULL_NAME,BIRTH_DATE,CITY,STATE,ZIPCODE
0,1001,"Nowmer, Sheri",03/12/1960,Poughkeepsie,New York,21101
1,1002,"Whelply, Derrick",06/04/1960,Manhattan,New York,12112


In [154]:
customer_w = pd.read_xml('Customer_W.xml')
customer_w.head(2)

Unnamed: 0,CUSTOMER_ID,FULL_NAME,BIRTH_DATE,CITY,STATE,ZIPCODE
0,2001,"Son, Alma",08/12/1961,Millbrook,New York,24591
1,2002,"Brandon, Cornelius",09/04/1961,Manhattan,New York,12113


In [155]:
products = pd.read_csv('Products.txt', delimiter='|',names=['product_id', 'detail', 'package'])
products.head(2)

Unnamed: 0,product_id,detail,package
0,1,Kool Cola,1 Liter
1,2,Kool Cola,2 Liter


In [156]:
import pymysql

def consulta_query(query_str: str):
    # Configuración de la conexión
    conexion = pymysql.connect(
        host='localhost',  # nombre del host
        user='root',       # nombre de usuario
        password='4716650',   # aquí se deberá colocar la contraseña
        database='sales',  # nombre de la base de datos que vamos a consultar
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )
    
    try:
        with conexion.cursor() as cursor:
            cursor.execute(query_str)
            df = cursor.fetchall()
            columns = [i[0] for i in cursor.description]  # Nombres de las columnas
            df = pd.DataFrame(df, columns=columns)
            return df
    finally:
        conexion.close()


In [157]:
history_sales_after_2008 = consulta_query(f"""
    SELECT b.BILLING_ID as billing_id,
           b.REGION as region,
           b.BRANCH_ID as branch_id,
           b.DATE as date,
           b.CUSTOMER_ID as customer_id,
           b.EMPLOYEE_ID as employee_id,
           bd.PRODUCT_ID as product_id,
           bd.QUANTITY as quantity
      FROM billing b
      LEFT JOIN billing_detail bd
        ON b.BILLING_ID = bd.BILLING_ID;
""")
history_sales_after_2008.head(2)

Unnamed: 0,billing_id,region,branch_id,date,customer_id,employee_id,product_id,quantity
0,835920,South,1,2009-01-01 00:03:48,2298,242,2.0,5.0
1,835920,South,1,2009-01-01 00:03:48,2298,242,5.0,11.0


# TRANSFORMACIONES

In [158]:
#Transformacion de datos de los clientes

df_wholesale = customer_w.copy() #copiamos el dataframe en una variable nueva para no tocar los datos originales
df_wholesale['customer_type'] = 'Wholesale'
df_wholesale.head(2)

Unnamed: 0,CUSTOMER_ID,FULL_NAME,BIRTH_DATE,CITY,STATE,ZIPCODE,customer_type
0,2001,"Son, Alma",08/12/1961,Millbrook,New York,24591,Wholesale
1,2002,"Brandon, Cornelius",09/04/1961,Manhattan,New York,12113,Wholesale


In [159]:
df_retail = customer_r.copy() #copiamos el dataframe en una variable nueva para no tocar los datos originales
df_retail['customer_type'] = 'Retail'
df_retail.head(2)

Unnamed: 0,CUSTOMER_ID,FULL_NAME,BIRTH_DATE,CITY,STATE,ZIPCODE,customer_type
0,1001,"Nowmer, Sheri",03/12/1960,Poughkeepsie,New York,21101,Retail
1,1002,"Whelply, Derrick",06/04/1960,Manhattan,New York,12112,Retail


In [160]:
df_customer = pd.concat([df_wholesale,df_retail]).reset_index(drop=True)
df_customer['CUSTOMER_ID'] = df_customer['CUSTOMER_ID'].astype(int)
df_customer = df_customer[['CUSTOMER_ID','FULL_NAME','customer_type']]
df_customer.head(2)

Unnamed: 0,CUSTOMER_ID,FULL_NAME,customer_type
0,2001,"Son, Alma",Wholesale
1,2002,"Brandon, Cornelius",Wholesale


In [161]:
columns = df_customer.columns.tolist()
for col in columns:
    df_customer = df_customer.rename(columns={f'{col}':f'{col.lower()}'})

df_customer.head(2)
#Fin de Transformacion de datos de los clientes

Unnamed: 0,customer_id,full_name,customer_type
0,2001,"Son, Alma",Wholesale
1,2002,"Brandon, Cornelius",Wholesale


In [162]:
#Transformacion de datos de los productos
df_products = products.copy() #copiamos los datos para no trabajar directamente con los originales
df_products['product_id'] = df_products['product_id'].astype(str).str.rstrip().astype(int)
df_products['detail'] = df_products['detail'].str.rstrip()
df_products.head(5)

Unnamed: 0,product_id,detail,package
0,1,Kool Cola,1 Liter
1,2,Kool Cola,2 Liter
2,3,Kool Cola,500 cm3 can
3,4,Diet Cola,1 Liter
4,5,Diet Cola,2 Liter


In [163]:
df_products['old_package'] = df_products['package']
df_products['package'] = np.where(df_products['package'].str.contains('can')
                                  , 'Can'
                                  , 'Bottle')
df_products.head(2)

Unnamed: 0,product_id,detail,package,old_package
0,1,Kool Cola,Bottle,1 Liter
1,2,Kool Cola,Bottle,2 Liter


In [164]:
#creamos una funcion para calcular los litros que tiene cada producto
def transform_liters(list):
    if list[1] == 'Liter':
        return float(list[0])
    elif list[1] == 'cm3':
        return float(list[0])/1000
    else:
        np.nan

#y ahora la utilizamos
df_products['splitted'] = df_products['old_package'].str.split(' ')
df_products['litter'] = df_products['splitted'].apply(lambda x: transform_liters(x))
df_products.head(2)

Unnamed: 0,product_id,detail,package,old_package,splitted,litter
0,1,Kool Cola,Bottle,1 Liter,"[1, Liter]",1.0
1,2,Kool Cola,Bottle,2 Liter,"[2, Liter]",2.0


In [165]:
#ya no necesitamos la columnas que creamos recien llamadas old_package y splitted, entonces la borramos
df_products = df_products.drop(columns = ['old_package', 'splitted'])
df_products.head(2)

#Fin Transformacion de datos de los productos

Unnamed: 0,product_id,detail,package,litter
0,1,Kool Cola,Bottle,1.0
1,2,Kool Cola,Bottle,2.0


In [166]:
#Transformacion de los datos de las ventas

df_sales = history_sales_after_2008.copy()
df_sales['date'] = pd.to_datetime(df_sales['date'])
df_sales['timestamp'] = df_sales['date']
df_sales['date'] = pd.to_datetime(df_sales['date'].dt.date)
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358318 entries, 0 to 358317
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   billing_id   358318 non-null  int64         
 1   region       358318 non-null  object        
 2   branch_id    358318 non-null  int64         
 3   date         358318 non-null  datetime64[ns]
 4   customer_id  358318 non-null  int64         
 5   employee_id  358318 non-null  int64         
 6   product_id   317868 non-null  float64       
 7   quantity     317868 non-null  float64       
 8   timestamp    358318 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int64(4), object(1)
memory usage: 24.6+ MB


In [167]:
df_sales.head(2)

Unnamed: 0,billing_id,region,branch_id,date,customer_id,employee_id,product_id,quantity,timestamp
0,835920,South,1,2009-01-01,2298,242,2.0,5.0,2009-01-01 00:03:48
1,835920,South,1,2009-01-01,2298,242,5.0,11.0,2009-01-01 00:03:48


In [168]:
df_sales['product_id'] = df_sales['product_id'].fillna(-1).astype(int)
df_sales = df_sales[['date','billing_id','customer_id','product_id','quantity']]
df_sales.info()

#Fin Transformacion de los datos de las ventas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358318 entries, 0 to 358317
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   date         358318 non-null  datetime64[ns]
 1   billing_id   358318 non-null  int64         
 2   customer_id  358318 non-null  int64         
 3   product_id   358318 non-null  int32         
 4   quantity     317868 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int32(1), int64(2)
memory usage: 12.3 MB


# Etapa de PRESENTACION

### En esta etapa se van a relacionar los data frames realizados anteriormente conformando un modelo de datos (Data Mart)

In [169]:
#mergeamos las ventas con los productos (ambos tienen una columna para el id del producto)
df_sales = df_sales.merge(df_products, how='inner', on='product_id')
df_sales.head(2)

Unnamed: 0,date,billing_id,customer_id,product_id,quantity,detail,package,litter
0,2009-01-01,835920,2298,2,5.0,Kool Cola,Bottle,2.0
1,2009-01-01,835920,2298,5,11.0,Diet Cola,Bottle,2.0


In [170]:
#por cada venta, multiplicamos la cantidad del producto vendido por los litros del envase, 
# con el fin de obtener una nueva columna con los litros vendidos en cada detalle de venta. 

df_sales['liters_sold'] = df_sales['litter'] * df_sales['quantity']
df_sales.head(2)

Unnamed: 0,date,billing_id,customer_id,product_id,quantity,detail,package,litter,liters_sold
0,2009-01-01,835920,2298,2,5.0,Kool Cola,Bottle,2.0,10.0
1,2009-01-01,835920,2298,5,11.0,Diet Cola,Bottle,2.0,22.0


In [171]:
sales_overtime = (df_sales
                  .groupby(['date','customer_id'])
                  [['quantity','liters_sold']].sum()
                  .reset_index()
)
sales_overtime.head(2)

Unnamed: 0,date,customer_id,quantity,liters_sold
0,2009-01-01,1004,29.0,31.5
1,2009-01-01,1006,24.0,17.85


In [172]:
dim_date = pd.DataFrame(pd.date_range(start='2000-01-01',end='2009-12-31'), columns=['date'])
dim_date['week']      = dim_date['date'].dt.to_period('W').dt.start_time
dim_date['month']     = dim_date['date'].dt.to_period('M').dt.start_time
dim_date['year']      = dim_date['date'].dt.year
dim_date['day_name']  = dim_date['date'].dt.strftime('%A')
dim_date['week_num']  = dim_date['date'].dt.weekday
dim_date['month_num'] = dim_date['date'].dt.month
dim_date['day_month'] = dim_date['date'].dt.strftime('%d-%b')

dim_date.head(2)


Unnamed: 0,date,week,month,year,day_name,week_num,month_num,day_month
0,2000-01-01,1999-12-27,2000-01-01,2000,Saturday,5,1,01-Jan
1,2000-01-02,1999-12-27,2000-01-01,2000,Sunday,6,1,02-Jan


# FIN TUTORIAL ETL
## Hasta acá abordé lo referido a los primeros 2 requerimientos

### Requerimiento 3
#### Rankear los productos por zonas geográficas a través del tiempo

In [173]:
regions = pd.read_csv('Regions.txt', delimiter = '|', names = ['region','state','city','id'])
regions.head(2)

Unnamed: 0,region,state,city,id
0,West,California,San Jose,91520
1,West,California,Morgan Hill,95200


In [174]:
history_sales_after_2008.head(2)

Unnamed: 0,billing_id,region,branch_id,date,customer_id,employee_id,product_id,quantity
0,835920,South,1,2009-01-01 00:03:48,2298,242,2.0,5.0
1,835920,South,1,2009-01-01 00:03:48,2298,242,5.0,11.0


In [175]:
#copio los datos originales asi no los perjudico
df_sales_for_region = history_sales_after_2008.copy()
#convierto la columna product_id a integer
df_sales_for_region['product_id'] = df_sales_for_region['product_id'].fillna(-1).astype(int)
#mergeo con productos para obtener los litros de cada producto y asi obtener los litros vendidos en cada fila
df_sales_for_region = df_sales_for_region.merge(df_products, how='inner', on='product_id')
df_sales_for_region['liters_sold'] = df_sales_for_region['litter'] * df_sales_for_region['quantity']

#ranking con la suma de productos vendidos por region
rankingCantProductsForRegion = (df_sales_for_region
                                .groupby(['region'])
                                [['quantity']].sum()
                                .reset_index()
)
rankingCantProductsForRegion.head(4)

Unnamed: 0,region,quantity
0,East,851937.0
1,North,869768.0
2,South,1760533.0
3,West,873233.0


In [176]:
#ranking hecho con la suma de litros vendidos por region
rankingCantLittersForRegion = (df_sales_for_region
                                .groupby(['region'])
                                [['liters_sold']].sum()
                                .reset_index()
)
rankingCantLittersForRegion.head(4)

Unnamed: 0,region,liters_sold
0,East,953937.42
1,North,971752.48
2,South,1961191.7
3,West,976236.91


### Requerimiento 4
#### El gerente de Marketing desea preparar una promoción de importantes descuentos en las bebidas tipo Energy Drink para promocionar este tipo de bebidas en los eventos deportivos a producirse en los meses de setiembre, porque piensa que coincide con una etapa de picos en el monto de ventas dentro del año. Es correcta esta afirmación?

In [177]:
#similar a history_sales_after_2008 pero este tiene info sobre los precios
sales_prices = consulta_query(f"""
    SELECT b.BILLING_ID as billing_id, b.REGION as region,
           b.BRANCH_ID as branch_id,
           b.DATE as date,
           b.CUSTOMER_ID as customer_id,
           b.EMPLOYEE_ID as employee_id,
           bd.PRODUCT_ID as product_id,
           bd.QUANTITY as quantity,
           p.DATE as date_price,
           p.PRICE as price
FROM sales.billing b
INNER JOIN sales.billing_detail bd
        ON b.BILLING_ID = bd.BILLING_ID
INNER JOIN sales.prices p
		ON p.PRODUCT_ID = bd.PRODUCT_ID;
""")
sales_prices.head(2)

Unnamed: 0,billing_id,region,branch_id,date,customer_id,employee_id,product_id,quantity,date_price,price
0,835920,South,1,2009-01-01 00:03:48,2298,242,2,5,2006-02-01 09:00:50,1.5
1,835920,South,1,2009-01-01 00:03:48,2298,242,2,5,2007-01-04 10:00:50,1.58


In [178]:
#copio los datos originales asi no los perjudico
df_sales_prices = sales_prices.copy()
#convierto la columna product_id a integer
df_sales_prices['product_id'] = df_sales_prices['product_id'].fillna(-1).astype(int)
#mergeo con productos para obtener los litros de cada producto y asi obtener los litros vendidos en cada fila
df_sales_prices_products = df_sales_prices.copy()
df_sales_prices_products = df_sales_prices_products.merge(df_products, how='inner', on='product_id')
df_sales_prices_products['liters_sold'] = df_sales_prices_products['litter'] * df_sales_prices_products['quantity']

In [179]:
df_sales_prices_products.head(2)

Unnamed: 0,billing_id,region,branch_id,date,customer_id,employee_id,product_id,quantity,date_price,price,detail,package,litter,liters_sold
0,835920,South,1,2009-01-01 00:03:48,2298,242,2,5,2006-02-01 09:00:50,1.5,Kool Cola,Bottle,2.0,10.0
1,835920,South,1,2009-01-01 00:03:48,2298,242,2,5,2007-01-04 10:00:50,1.58,Kool Cola,Bottle,2.0,10.0


In [180]:
# Filtrar por el tipo de bebida "Energy drink"
df_energy_drink = df_sales_prices_products[df_sales_prices_products['detail'] == 'Energy drink']
df_energy_drink['quantity_X_price'] = df_energy_drink['quantity'] * df_energy_drink['price']

df_energy_drink['year_month'] = df_energy_drink['date'].dt.to_period('M')
df_energy_drink_sum = (df_energy_drink
              .groupby(['year_month'])
              [['quantity_X_price']].sum()
              .reset_index()
)
df_energy_drink_sum.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_energy_drink['quantity_X_price'] = df_energy_drink['quantity'] * df_energy_drink['price']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_energy_drink['year_month'] = df_energy_drink['date'].dt.to_period('M')


Unnamed: 0,year_month,quantity_X_price
0,2009-01,248144.85
1,2009-02,230283.25
2,2009-03,126818.26
3,2009-04,212098.11
4,2009-05,191205.63
5,2009-06,228798.2
6,2009-07,16363.13
7,2009-08,152178.55


### Requerimiento 5
#### El gerente de Marketing también quiere saber cómo es la relación entre las edades y los tipos de bebida, teniendo en cuenta la cantidad de litros vendidos. Es importante el tipo de bebida en la determinación de los grupos etarios?

In [181]:
df_tipos_bebida = (df_sales_for_region
              .groupby(df_sales_for_region['detail'])
              [['quantity']].sum()
              .reset_index()
)
df_tipos_bebida.head(2)

Unnamed: 0,detail,quantity
0,Birch Beer,310216.0
1,Caffeine Free Cola,309887.0


In [182]:

#necesito una nueva tabla para los clientes donde esten todas las columnas
#entonces voy a repetir el mismo proceso que hice antes para df_customer

df_customer_full = pd.concat([df_wholesale,df_retail]).reset_index(drop=True)
df_customer_full['CUSTOMER_ID'] = df_customer_full['CUSTOMER_ID'].astype(int)
columns = df_customer_full.columns.tolist()
for col in columns:
    df_customer_full = df_customer_full.rename(columns={f'{col}':f'{col.lower()}'})

df_sales_customer = df_sales_for_region.copy()
df_sales_customer = df_sales_customer.merge(df_customer_full, how = 'inner', on = 'customer_id')

# me aseguro que las columnas esten en el formato datetime
df_sales_customer['date'] = pd.to_datetime(df_sales_customer['date'])
df_sales_customer['birth_date'] = pd.to_datetime(df_sales_customer['birth_date'], dayfirst=True, errors='coerce')

# función para calcular la edad
def calculate_age(row):
    if pd.isnull(row['date']) or pd.isnull(row['birth_date']):
        return None
    return row['date'].year - row['birth_date'].year - ((row['date'].month, row['date'].day) < (row['birth_date'].month, row['birth_date'].day))

# aplico la función para crear la nueva columna 'age'
df_sales_customer['age'] = df_sales_customer.apply(calculate_age, axis=1)
df_sales_customer['age'] = df_sales_customer['age'].fillna(0).astype(int)

df_customer_x_tipo_bebida = (df_sales_customer
                             .groupby(['detail', 'age'])
                             [['liters_sold']].sum()
                             .reset_index()
)
df_customer_x_tipo_bebida.head(2)

Unnamed: 0,detail,age,liters_sold
0,Birch Beer,0,227853.0
1,Birch Beer,28,12.0


### Requerimiento 6
#### El gerente de RRHH necesita saber si la edad y el sexo del empleado tienen relación con el monto de ventas.

In [183]:
#history_sales_after_2008.head(20)
employee = pd.read_excel('Employee.xls',names=['EMPLOYEE_ID','FULL_NAME','CATEGORY','EMPLOYMENT_DATE', 'BIRTH_DATE','EDUCATION_LEVEL', 'GENDER'])
employee.head(2)


Unnamed: 0,EMPLOYEE_ID,FULL_NAME,CATEGORY,EMPLOYMENT_DATE,BIRTH_DATE,EDUCATION_LEVEL,GENDER
0,1,"Nowmer, Sheri",President,2001-02-01 00:00:00,12/21/1955,Graduate Degree,F
1,2,"Whelply, Derrick",VP Country Manager,03/25/2002,05/30/1975,Graduate Degree,M


In [184]:
#copiamos los datos de employee para no tocar los originales
df_employee = employee.copy()
#paso a minuscula las columnas de la tabla
columns = df_employee.columns.tolist()
for col in columns:
    df_employee = df_employee.rename(columns={f'{col}':f'{col.lower()}'})
df_employee.head(2)

Unnamed: 0,employee_id,full_name,category,employment_date,birth_date,education_level,gender
0,1,"Nowmer, Sheri",President,2001-02-01 00:00:00,12/21/1955,Graduate Degree,F
1,2,"Whelply, Derrick",VP Country Manager,03/25/2002,05/30/1975,Graduate Degree,M


In [185]:
from datetime import datetime
#esta en formato datetime?
df_employee['birth_date'] = pd.to_datetime(df_employee['birth_date'], format='%Y-%m-%d', errors='coerce')
# Función para calcular la edad
def calculate_age(birth_date):
    #today = datetime.today()
    return 2009 - birth_date.year - ((12, 31) < (birth_date.month, birth_date.day))
# Aplicar la función para crear la nueva columna 'age'
df_employee['age'] = df_employee['birth_date'].apply(calculate_age)
df_employee.head(2)

Unnamed: 0,employee_id,full_name,category,employment_date,birth_date,education_level,gender,age
0,1,"Nowmer, Sheri",President,2001-02-01 00:00:00,NaT,Graduate Degree,F,
1,2,"Whelply, Derrick",VP Country Manager,03/25/2002,NaT,Graduate Degree,M,


In [186]:
df_sales_prices.head(2)


Unnamed: 0,billing_id,region,branch_id,date,customer_id,employee_id,product_id,quantity,date_price,price
0,835920,South,1,2009-01-01 00:03:48,2298,242,2,5,2006-02-01 09:00:50,1.5
1,835920,South,1,2009-01-01 00:03:48,2298,242,2,5,2007-01-04 10:00:50,1.58


In [187]:
df_sales_prices_employee = df_sales_prices.copy()
df_sales_prices_employee = df_sales_prices_employee.merge(df_employee, how = 'inner', on = 'employee_id')
df_sales_prices_employee['quantity_X_price'] = df_sales_prices_employee['quantity'] * df_sales_prices_employee['price']
df_sales_prices_employee[['billing_id','employee_id','full_name','gender','age','quantity','price','quantity_X_price']].head(2)


Unnamed: 0,billing_id,employee_id,full_name,gender,age,quantity,price,quantity_X_price
0,835920,242,"McMenama, Betty",M,,5,1.5,7.5
1,835920,242,"McMenama, Betty",M,,5,1.58,7.9


In [188]:
df_sales_prices_employee_final = (df_sales_prices_employee
                             .groupby(['gender', 'age'])
                             [['quantity_X_price']].sum()
                             .reset_index()
)
df_sales_prices_employee_final.head(20)

Unnamed: 0,gender,age,quantity_X_price
